/*
 * Copyright (C) 2021-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
# pragma once
# include "shared/source/command_container/command_encoder.h"
2021-10-29 18:12:13 +08:00
# include "shared/source/command_container/walker_partition_interface.h"
2021-04-24 00:43:48 +08:00
# include "shared/source/debug_settings/debug_settings_manager.h"
2022-05-18 03:04:23 +08:00
# include "shared/source/helpers/aligned_memory.h"
2021-04-24 00:43:48 +08:00
# include "shared/source/helpers/basic_math.h"
2023-01-24 23:33:52 +08:00
# include "shared/source/helpers/common_types.h"
2023-02-02 00:23:01 +08:00
# include "shared/source/helpers/gfx_core_helper.h"
2021-12-22 22:11:05 +08:00
# include "shared/source/helpers/pipe_control_args.h"
2021-04-24 00:43:48 +08:00
# include "shared/source/helpers/ptr_math.h"
2022-08-31 22:58:49 +08:00
# include "shared/source/helpers/string.h"
2021-04-24 00:43:48 +08:00
# include <cassert>
# include <optional>
2021-11-04 23:28:06 +08:00
namespace NEO {
struct PipeControlArgs ;
}
2021-04-24 00:43:48 +08:00
namespace WalkerPartition {
// Convenience aliases that resolve the per-family hardware command structures
// used throughout the walker-partition helpers.
template <typename GfxFamily>
using POSTSYNC_DATA = typename GfxFamily::POSTSYNC_DATA;
template <typename GfxFamily>
using BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
template <typename GfxFamily>
using BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
template <typename GfxFamily>
using LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
template <typename GfxFamily>
using LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
template <typename GfxFamily>
using MI_SET_PREDICATE = typename GfxFamily::MI_SET_PREDICATE;
template <typename GfxFamily>
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
template <typename GfxFamily>
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
template <typename GfxFamily>
using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE;
template <typename GfxFamily>
using LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
template <typename GfxFamily>
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
template <typename GfxFamily>
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;

// Nested post-sync operation type of the family's PIPE_CONTROL.
template <typename GfxFamily>
using POST_SYNC_OPERATION = typename PIPE_CONTROL<GfxFamily>::POST_SYNC_OPERATION;
2021-04-24 00:43:48 +08:00
2023-10-26 23:06:55 +08:00
// Declaration only — the per-GfxFamily definition adjusts additional walker fields
// for execution across tileCount tiles.
template <typename GfxFamily, typename WalkerType>
void appendWalkerFields(WalkerType &walkerCmd, uint32_t tileCount);
2021-04-24 00:43:48 +08:00
// Reserves space for one Command in the stream: returns the slot at the current
// cursor, advances the cursor by sizeof(Command), and accounts for the bytes.
template <typename Command>
Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
    auto *commandSlot = reinterpret_cast<Command *>(inputAddress);
    inputAddress = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(inputAddress) + sizeof(Command));
    totalBytesProgrammed += sizeof(Command);
    return commandSlot;
}
2022-08-31 22:58:49 +08:00
// Overload for commands whose size is only known at runtime: returns the slot at
// the current cursor and advances it by commandSize bytes.
inline void *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, size_t commandSize) {
    void *commandSlot = inputAddress;
    inputAddress = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(inputAddress) + commandSize);
    totalBytesProgrammed += static_cast<uint32_t>(commandSize);
    return commandSlot;
}
2023-11-23 21:58:58 +08:00
// Chooses the walker partition type and partition count for a dispatch.
// When no type is requested (and no debug override is set), the heuristic prefers the
// deepest dimension (Z, then Y, then X) whose distribution across
// preferredMinimalPartitionCount partitions is (nearly) even, otherwise it falls back
// to the largest dimension. Returns the partition count; the chosen type and whether
// static partitioning was selected are written through the out-parameters.
template <typename GfxFamily, typename WalkerType>
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
                                               bool preferStaticPartitioning,
                                               const Vec3<size_t> &groupStart,
                                               const Vec3<size_t> &groupCount,
                                               std::optional<typename WalkerType::PARTITION_TYPE> requestedPartitionType,
                                               typename WalkerType::PARTITION_TYPE *outSelectedPartitionType,
                                               bool *outSelectStaticPartitioning) {
    using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;

    // For non uniform starting point, there is no support for partition in Hardware. Disable partitioning and select dynamic algorithm
    if (groupStart.x || groupStart.y || groupStart.z) {
        *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
        *outSelectStaticPartitioning = false;
        return 1u;
    }

    size_t workgroupCount = 0u;
    bool disablePartitionForPartitionCountOne{};

    // Debug override: forces a specific partition type regardless of the heuristics below.
    if (NEO::debugManager.flags.ExperimentalSetWalkerPartitionType.get() != -1) {
        requestedPartitionType = static_cast<PARTITION_TYPE>(NEO::debugManager.flags.ExperimentalSetWalkerPartitionType.get());
    }

    if (requestedPartitionType.has_value()) {
        // Caller (or debug flag) pinned the partition dimension; take the workgroup count from that axis.
        switch (requestedPartitionType.value()) {
        case PARTITION_TYPE::PARTITION_TYPE_X:
            workgroupCount = groupCount.x;
            break;
        case PARTITION_TYPE::PARTITION_TYPE_Y:
            workgroupCount = groupCount.y;
            break;
        case PARTITION_TYPE::PARTITION_TYPE_Z:
            workgroupCount = groupCount.z;
            break;
        default:
            UNRECOVERABLE_IF(true);
        }
        *outSelectedPartitionType = requestedPartitionType.value();
        disablePartitionForPartitionCountOne = false;
    } else {
        const size_t maxDimension = std::max({groupCount.z, groupCount.y, groupCount.x});

        auto goWithMaxAlgorithm = !preferStaticPartitioning;
        if (NEO::debugManager.flags.WalkerPartitionPreferHighestDimension.get() != -1) {
            goWithMaxAlgorithm = !!!NEO::debugManager.flags.WalkerPartitionPreferHighestDimension.get();
        }

        // compute misaligned %, accept imbalance below threshold in favor of Z/Y/X distribution.
        const float minimalThreshold = 0.05f;
        float zImbalance = static_cast<float>(groupCount.z - alignDown(groupCount.z, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.z);
        float yImbalance = static_cast<float>(groupCount.y - alignDown(groupCount.y, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.y);

        // we first try with deepest dimension to see if we can partition there
        if (groupCount.z > 1 && (zImbalance <= minimalThreshold)) {
            *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
        } else if (groupCount.y > 1 && (yImbalance < minimalThreshold)) {
            *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
        } else if (groupCount.x % preferredMinimalPartitionCount == 0) {
            *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
        }
        // if we are here then there is no dimension that results in even distribution, choose max dimension to minimize impact
        else {
            goWithMaxAlgorithm = true;
        }

        if (goWithMaxAlgorithm) {
            // default mode, select greatest dimension
            if (maxDimension == groupCount.x) {
                *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
            } else if (maxDimension == groupCount.y) {
                *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
            } else {
                *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
            }
        }

        workgroupCount = maxDimension;
        // Heuristic-picked types are only worth keeping if they yield more than one partition.
        disablePartitionForPartitionCountOne = true;
    }

    // Static partitioning - partition count == tile count
    *outSelectStaticPartitioning = preferStaticPartitioning;
    if (preferStaticPartitioning) {
        return preferredMinimalPartitionCount;
    }

    // Dynamic partitioning - compute optimal partition count
    // Capped at 16 and rounded down to a power of two.
    size_t partitionCount = std::min(static_cast<size_t>(16u), workgroupCount);
    partitionCount = Math::prevPowerOfTwo(partitionCount);

    // Optionally shrink the partition count until each partition receives at least
    // workgroupPerPartitionThreshold workgroups (threshold defaults to 512).
    if (NEO::debugManager.flags.SetMinimalPartitionSize.get() != 0) {
        const auto workgroupPerPartitionThreshold = NEO::debugManager.flags.SetMinimalPartitionSize.get() == -1
                                                        ? 512u
                                                        : static_cast<unsigned>(NEO::debugManager.flags.SetMinimalPartitionSize.get());
        preferredMinimalPartitionCount = std::max(2u, preferredMinimalPartitionCount);

        while (partitionCount > preferredMinimalPartitionCount) {
            auto workgroupsPerPartition = workgroupCount / partitionCount;
            if (workgroupsPerPartition >= workgroupPerPartitionThreshold) {
                break;
            }
            partitionCount = partitionCount / 2;
        }
    }

    if (partitionCount == 1u && disablePartitionForPartitionCountOne) {
        *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
    }

    return static_cast<uint32_t>(partitionCount);
}
2023-11-23 21:58:58 +08:00
template < typename GfxFamily , typename WalkerType >
uint32_t computePartitionCountAndSetPartitionType ( WalkerType * walker ,
2023-12-05 23:21:29 +08:00
NEO : : RequiredPartitionDim requiredPartitionDim ,
2021-04-24 00:43:48 +08:00
uint32_t preferredMinimalPartitionCount ,
bool preferStaticPartitioning ,
bool * outSelectStaticPartitioning ) {
2023-11-23 21:58:58 +08:00
using PARTITION_TYPE = typename WalkerType : : PARTITION_TYPE ;
2021-04-24 00:43:48 +08:00
const Vec3 < size_t > groupStart = { walker - > getThreadGroupIdStartingX ( ) , walker - > getThreadGroupIdStartingY ( ) , walker - > getThreadGroupIdStartingZ ( ) } ;
const Vec3 < size_t > groupCount = { walker - > getThreadGroupIdXDimension ( ) , walker - > getThreadGroupIdYDimension ( ) , walker - > getThreadGroupIdZDimension ( ) } ;
2023-11-23 21:58:58 +08:00
std : : optional < PARTITION_TYPE > requestedPartitionType { } ;
2023-12-05 23:21:29 +08:00
switch ( requiredPartitionDim ) {
2023-12-13 22:51:31 +08:00
case NEO : : RequiredPartitionDim : : x :
2023-11-23 21:58:58 +08:00
requestedPartitionType = PARTITION_TYPE : : PARTITION_TYPE_X ;
2023-12-05 23:21:29 +08:00
break ;
2023-12-13 22:51:31 +08:00
case NEO : : RequiredPartitionDim : : y :
2023-12-05 23:21:29 +08:00
requestedPartitionType = PARTITION_TYPE : : PARTITION_TYPE_Y ;
break ;
2023-12-13 22:51:31 +08:00
case NEO : : RequiredPartitionDim : : z :
2023-12-05 23:21:29 +08:00
requestedPartitionType = PARTITION_TYPE : : PARTITION_TYPE_Z ;
break ;
default :
2023-12-13 22:51:31 +08:00
UNRECOVERABLE_IF ( requiredPartitionDim ! = NEO : : RequiredPartitionDim : : none ) ;
2023-12-05 23:21:29 +08:00
break ;
2023-11-23 21:58:58 +08:00
}
2023-12-05 23:21:29 +08:00
2023-11-23 21:58:58 +08:00
PARTITION_TYPE partitionType { } ;
const auto partitionCount = computePartitionCountAndPartitionType < GfxFamily , WalkerType > ( preferredMinimalPartitionCount ,
preferStaticPartitioning ,
groupStart ,
groupCount ,
requestedPartitionType ,
& partitionType ,
outSelectStaticPartitioning ) ;
2021-04-24 00:43:48 +08:00
walker - > setPartitionType ( partitionType ) ;
return partitionCount ;
}
template < typename GfxFamily >
void programRegisterWithValue ( void * & inputAddress , uint32_t registerOffset , uint32_t & totalBytesProgrammed , uint32_t registerValue ) {
auto loadRegisterImmediate = putCommand < LOAD_REGISTER_IMM < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
LOAD_REGISTER_IMM < GfxFamily > cmd = GfxFamily : : cmdInitLoadRegisterImm ;
cmd . setRegisterOffset ( registerOffset ) ;
cmd . setDataDword ( registerValue ) ;
cmd . setMmioRemapEnable ( true ) ;
* loadRegisterImmediate = cmd ;
}
template < typename GfxFamily >
void programWaitForSemaphore ( void * & inputAddress , uint32_t & totalBytesProgrammed , uint64_t gpuAddress , uint32_t semaphoreCompareValue , typename MI_SEMAPHORE_WAIT < GfxFamily > : : COMPARE_OPERATION compareOperation ) {
auto semaphoreWait = putCommand < MI_SEMAPHORE_WAIT < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
2024-03-26 19:56:45 +08:00
NEO : : EncodeSemaphore < GfxFamily > : : programMiSemaphoreWait ( semaphoreWait , gpuAddress , semaphoreCompareValue , compareOperation , false , true , false , false , false ) ;
2021-04-24 00:43:48 +08:00
}
template < typename GfxFamily >
bool programWparidMask ( void * & inputAddress , uint32_t & totalBytesProgrammed , uint32_t partitionCount ) {
2022-07-21 22:28:10 +08:00
// currently only power of 2 values of partitionCount are being supported
2021-04-24 00:43:48 +08:00
if ( ! Math : : isPow2 ( partitionCount ) | | partitionCount > 16 ) {
return false ;
}
auto mask = 0xFFE0 ;
auto fillValue = 0x10 ;
auto count = partitionCount ;
while ( count < 16 ) {
fillValue | = ( fillValue > > 1 ) ;
count * = 2 ;
}
mask | = ( mask | fillValue ) ;
programRegisterWithValue < GfxFamily > ( inputAddress , predicationMaskCCSOffset , totalBytesProgrammed , mask ) ;
return true ;
}
template < typename GfxFamily >
void programWparidPredication ( void * & inputAddress , uint32_t & totalBytesProgrammed , bool predicationEnabled ) {
auto miSetPredicate = putCommand < MI_SET_PREDICATE < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
MI_SET_PREDICATE < GfxFamily > cmd = GfxFamily : : cmdInitSetPredicate ;
if ( predicationEnabled ) {
cmd . setPredicateEnableWparid ( MI_SET_PREDICATE < GfxFamily > : : PREDICATE_ENABLE_WPARID : : PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE ) ;
} else {
cmd . setPredicateEnable ( MI_SET_PREDICATE < GfxFamily > : : PREDICATE_ENABLE : : PREDICATE_ENABLE_PREDICATE_DISABLE ) ;
}
* miSetPredicate = cmd ;
}
template < typename GfxFamily >
void programMiAtomic ( void * & inputAddress , uint32_t & totalBytesProgrammed , uint64_t gpuAddress , bool requireReturnValue , typename MI_ATOMIC < GfxFamily > : : ATOMIC_OPCODES atomicOpcode ) {
auto miAtomic = putCommand < MI_ATOMIC < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
NEO : : EncodeAtomic < GfxFamily > : : programMiAtomic ( miAtomic , gpuAddress , atomicOpcode , DATA_SIZE < GfxFamily > : : DATA_SIZE_DWORD ,
requireReturnValue , requireReturnValue , 0x0u , 0x0u ) ;
}
template < typename GfxFamily >
void programMiBatchBufferStart ( void * & inputAddress , uint32_t & totalBytesProgrammed ,
uint64_t gpuAddress , bool predicationEnabled , bool secondary ) {
auto batchBufferStart = putCommand < BATCH_BUFFER_START < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
BATCH_BUFFER_START < GfxFamily > cmd = GfxFamily : : cmdInitBatchBufferStart ;
cmd . setSecondLevelBatchBuffer ( static_cast < typename BATCH_BUFFER_START < GfxFamily > : : SECOND_LEVEL_BATCH_BUFFER > ( secondary ) ) ;
cmd . setAddressSpaceIndicator ( BATCH_BUFFER_START < GfxFamily > : : ADDRESS_SPACE_INDICATOR : : ADDRESS_SPACE_INDICATOR_PPGTT ) ;
cmd . setPredicationEnable ( predicationEnabled ) ;
cmd . setBatchBufferStartAddress ( gpuAddress ) ;
* batchBufferStart = cmd ;
}
template < typename GfxFamily >
void programMiLoadRegisterReg ( void * & inputAddress , uint32_t & totalBytesProgrammed , uint32_t sourceRegisterOffset , uint32_t destinationRegisterOffset ) {
auto loadRegisterReg = putCommand < LOAD_REGISTER_REG < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
LOAD_REGISTER_REG < GfxFamily > cmd = GfxFamily : : cmdInitLoadRegisterReg ;
cmd . setMmioRemapEnableSource ( true ) ;
cmd . setMmioRemapEnableDestination ( true ) ;
cmd . setSourceRegisterAddress ( sourceRegisterOffset ) ;
cmd . setDestinationRegisterAddress ( destinationRegisterOffset ) ;
* loadRegisterReg = cmd ;
}
template < typename GfxFamily >
void programMiLoadRegisterMem ( void * & inputAddress , uint32_t & totalBytesProgrammed , uint64_t gpuAddressToLoad , uint32_t destinationRegisterOffset ) {
auto loadRegisterReg = putCommand < LOAD_REGISTER_MEM < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
LOAD_REGISTER_MEM < GfxFamily > cmd = GfxFamily : : cmdInitLoadRegisterMem ;
cmd . setMmioRemapEnable ( true ) ;
cmd . setMemoryAddress ( gpuAddressToLoad ) ;
cmd . setRegisterAddress ( destinationRegisterOffset ) ;
* loadRegisterReg = cmd ;
}
template < typename GfxFamily >
2021-11-11 03:56:42 +08:00
void programPipeControlCommand ( void * & inputAddress , uint32_t & totalBytesProgrammed , NEO : : PipeControlArgs & flushArgs ) {
2022-08-31 22:58:49 +08:00
auto singleBarrierSize = NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForSingleBarrier ( flushArgs . tlbInvalidation ) ;
auto pipeControl = putCommand ( inputAddress , totalBytesProgrammed , singleBarrierSize ) ;
UNRECOVERABLE_IF ( sizeof ( PIPE_CONTROL < GfxFamily > ) < singleBarrierSize ) ;
uint8_t cmd [ sizeof ( PIPE_CONTROL < GfxFamily > ) ] = { } ;
NEO : : MemorySynchronizationCommands < GfxFamily > : : setSingleBarrier ( cmd , flushArgs ) ;
memcpy_s ( pipeControl , singleBarrierSize , cmd , singleBarrierSize ) ;
2021-04-24 00:43:48 +08:00
}
2021-11-11 03:56:42 +08:00
template < typename GfxFamily >
void programPostSyncPipeControlCommand ( void * & inputAddress ,
uint32_t & totalBytesProgrammed ,
WalkerPartitionArgs & args ,
NEO : : PipeControlArgs & flushArgs ,
2023-01-26 11:58:18 +08:00
const NEO : : RootDeviceEnvironment & rootDeviceEnvironment ) {
2021-11-11 03:56:42 +08:00
2022-07-21 22:28:10 +08:00
NEO : : MemorySynchronizationCommands < GfxFamily > : : setBarrierWithPostSyncOperation ( inputAddress ,
2023-12-05 20:06:54 +08:00
NEO : : PostSyncMode : : immediateData ,
2022-07-21 22:28:10 +08:00
args . postSyncGpuAddress ,
args . postSyncImmediateValue ,
2023-01-26 11:58:18 +08:00
rootDeviceEnvironment ,
2022-07-21 22:28:10 +08:00
flushArgs ) ;
2021-11-11 03:56:42 +08:00
2023-01-26 11:58:18 +08:00
totalBytesProgrammed + = static_cast < uint32_t > ( NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForBarrierWithPostSyncOperation ( rootDeviceEnvironment , flushArgs . tlbInvalidation ) ) ;
2021-11-11 03:56:42 +08:00
}
2021-04-24 00:43:48 +08:00
template < typename GfxFamily >
void programStoreMemImmediateDword ( void * & inputAddress , uint32_t & totalBytesProgrammed , uint64_t gpuAddress , uint32_t data ) {
auto storeDataImmediate = putCommand < MI_STORE_DATA_IMM < GfxFamily > > ( inputAddress , totalBytesProgrammed ) ;
MI_STORE_DATA_IMM < GfxFamily > cmd = GfxFamily : : cmdInitStoreDataImm ;
cmd . setAddress ( gpuAddress ) ;
cmd . setStoreQword ( false ) ;
cmd . setDwordLength ( MI_STORE_DATA_IMM < GfxFamily > : : DWORD_LENGTH : : DWORD_LENGTH_STORE_DWORD ) ;
cmd . setDataDword0 ( static_cast < uint32_t > ( data ) ) ;
* storeDataImmediate = cmd ;
}
2021-09-14 01:39:55 +08:00
template < typename GfxFamily >
2021-09-16 20:11:22 +08:00
uint64_t computeSelfCleanupSectionSize ( bool useAtomicsForSelfCleanup ) {
if ( useAtomicsForSelfCleanup ) {
2021-09-14 01:39:55 +08:00
return sizeof ( MI_ATOMIC < GfxFamily > ) ;
} else {
return sizeof ( MI_STORE_DATA_IMM < GfxFamily > ) ;
}
}
2021-04-24 00:43:48 +08:00
template < typename GfxFamily >
2021-09-16 20:11:22 +08:00
void programSelfCleanupSection ( void * & inputAddress ,
uint32_t & totalBytesProgrammed ,
uint64_t address ,
bool useAtomicsForSelfCleanup ) {
if ( useAtomicsForSelfCleanup ) {
2021-07-03 00:31:57 +08:00
programMiAtomic < GfxFamily > ( inputAddress ,
totalBytesProgrammed ,
2021-09-14 01:39:55 +08:00
address ,
2021-07-03 00:31:57 +08:00
false ,
MI_ATOMIC < GfxFamily > : : ATOMIC_OPCODES : : ATOMIC_4B_MOVE ) ;
} else {
programStoreMemImmediateDword < GfxFamily > ( inputAddress ,
totalBytesProgrammed ,
2021-09-14 01:39:55 +08:00
address ,
2021-07-03 00:31:57 +08:00
0u ) ;
}
2021-04-24 00:43:48 +08:00
}
2021-09-14 01:39:55 +08:00
template < typename GfxFamily >
uint64_t computeTilesSynchronizationWithAtomicsSectionSize ( ) {
return sizeof ( MI_ATOMIC < GfxFamily > ) +
2023-03-10 21:49:06 +08:00
NEO : : EncodeSemaphore < GfxFamily > : : getSizeMiSemaphoreWait ( ) ;
2021-09-14 01:39:55 +08:00
}
template < typename GfxFamily >
void programTilesSynchronizationWithAtomics ( void * & currentBatchBufferPointer ,
uint32_t & totalBytesProgrammed ,
uint64_t atomicAddress ,
uint32_t tileCount ) {
programMiAtomic < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , atomicAddress , false , MI_ATOMIC < GfxFamily > : : ATOMIC_OPCODES : : ATOMIC_4B_INCREMENT ) ;
programWaitForSemaphore < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , atomicAddress , tileCount , MI_SEMAPHORE_WAIT < GfxFamily > : : COMPARE_OPERATION : : COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD ) ;
}
template < typename GfxFamily >
2022-10-14 08:11:28 +08:00
uint64_t computeSelfCleanupEndSectionSize ( size_t fieldsForCleanupCount , WalkerPartitionArgs & args ) {
size_t extraSize = 0 ;
if ( args . pipeControlBeforeCleanupCrossTileSync ) {
extraSize + = 2 * NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForSingleBarrier ( false ) ;
}
return fieldsForCleanupCount * computeSelfCleanupSectionSize < GfxFamily > ( args . useAtomicsForSelfCleanup ) +
2 * computeTilesSynchronizationWithAtomicsSectionSize < GfxFamily > ( ) + extraSize ;
2021-09-14 01:39:55 +08:00
}
2021-04-24 00:43:48 +08:00
template < typename GfxFamily >
2021-09-16 20:11:22 +08:00
void programSelfCleanupEndSection ( void * & inputAddress ,
uint32_t & totalBytesProgrammed ,
uint64_t finalSyncTileCountAddress ,
uint64_t baseAddressForCleanup ,
size_t fieldsForCleanupCount ,
2022-10-14 08:11:28 +08:00
WalkerPartitionArgs & args ) {
NEO : : PipeControlArgs pipeControlArgs ;
if ( args . pipeControlBeforeCleanupCrossTileSync ) {
programPipeControlCommand < GfxFamily > ( inputAddress , totalBytesProgrammed , pipeControlArgs ) ;
}
2021-04-24 00:43:48 +08:00
// Synchronize tiles, so the fields are not cleared while still in use
2022-10-14 08:11:28 +08:00
programTilesSynchronizationWithAtomics < GfxFamily > ( inputAddress , totalBytesProgrammed , finalSyncTileCountAddress , args . tileCount ) ;
2021-04-24 00:43:48 +08:00
for ( auto fieldIndex = 0u ; fieldIndex < fieldsForCleanupCount ; fieldIndex + + ) {
const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof ( uint32_t ) ;
2021-09-16 20:11:22 +08:00
programSelfCleanupSection < GfxFamily > ( inputAddress ,
totalBytesProgrammed ,
addressForCleanup ,
2022-10-14 08:11:28 +08:00
args . useAtomicsForSelfCleanup ) ;
}
if ( args . pipeControlBeforeCleanupCrossTileSync ) {
programPipeControlCommand < GfxFamily > ( inputAddress , totalBytesProgrammed , pipeControlArgs ) ;
2021-04-24 00:43:48 +08:00
}
2022-07-21 22:28:10 +08:00
// this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
2022-10-14 08:11:28 +08:00
programTilesSynchronizationWithAtomics < GfxFamily > ( inputAddress , totalBytesProgrammed , finalSyncTileCountAddress , 2 * args . tileCount ) ;
2021-04-24 00:43:48 +08:00
}
2023-11-23 21:58:58 +08:00
template < typename GfxFamily , typename WalkerType >
2021-04-24 00:43:48 +08:00
void programTilesSynchronizationWithPostSyncs ( void * & currentBatchBufferPointer ,
uint32_t & totalBytesProgrammed ,
2023-11-23 21:58:58 +08:00
WalkerType * inputWalker ,
2021-04-24 00:43:48 +08:00
uint32_t partitionCount ) {
const auto postSyncAddress = inputWalker - > getPostSync ( ) . getDestinationAddress ( ) + 8llu ;
for ( uint32_t partitionId = 0u ; partitionId < partitionCount ; partitionId + + ) {
programWaitForSemaphore < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , postSyncAddress + partitionId * 16llu , 1u , MI_SEMAPHORE_WAIT < GfxFamily > : : COMPARE_OPERATION : : COMPARE_OPERATION_SAD_NOT_EQUAL_SDD ) ;
}
}
2023-12-21 22:33:38 +08:00
template < typename GfxFamily , typename WalkerType >
2021-04-24 00:43:48 +08:00
uint64_t computeWalkerSectionSize ( ) {
return sizeof ( BATCH_BUFFER_START < GfxFamily > ) +
2023-12-21 22:33:38 +08:00
sizeof ( WalkerType ) ;
2021-04-24 00:43:48 +08:00
}
2023-12-21 22:33:38 +08:00
template < typename GfxFamily , typename WalkerType >
2021-09-14 01:39:55 +08:00
uint64_t computeControlSectionOffset ( WalkerPartitionArgs & args ) {
uint64_t size = 0u ;
2021-04-24 00:43:48 +08:00
2021-09-14 01:39:55 +08:00
size + = args . synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize < GfxFamily > ( ) : 0 ;
2022-07-21 22:28:10 +08:00
size + = sizeof ( LOAD_REGISTER_IMM < GfxFamily > ) ; // predication mask
size + = sizeof ( MI_ATOMIC < GfxFamily > ) ; // current id for partition
size + = sizeof ( LOAD_REGISTER_REG < GfxFamily > ) ; // id into register
2021-09-14 01:39:55 +08:00
size + = sizeof ( MI_SET_PREDICATE < GfxFamily > ) * 2 +
sizeof ( BATCH_BUFFER_START < GfxFamily > ) * 2 ;
2023-03-10 21:49:06 +08:00
size + = ( args . semaphoreProgrammingRequired ? NEO : : EncodeSemaphore < GfxFamily > : : getSizeMiSemaphoreWait ( ) * args . partitionCount : 0u ) ;
2023-12-21 22:33:38 +08:00
size + = computeWalkerSectionSize < GfxFamily , WalkerType > ( ) ;
2022-08-31 22:58:49 +08:00
size + = args . emitPipeControlStall ? NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForSingleBarrier ( false ) : 0u ;
2021-09-16 20:11:22 +08:00
if ( args . crossTileAtomicSynchronization | | args . emitSelfCleanup ) {
2021-09-14 01:39:55 +08:00
size + = computeTilesSynchronizationWithAtomicsSectionSize < GfxFamily > ( ) ;
2021-04-24 00:43:48 +08:00
}
2021-09-16 20:11:22 +08:00
if ( args . emitSelfCleanup ) {
size + = computeSelfCleanupSectionSize < GfxFamily > ( args . useAtomicsForSelfCleanup ) ;
2021-09-14 01:39:55 +08:00
}
2021-10-05 00:37:12 +08:00
size + = args . preferredStaticPartitioning ? sizeof ( LOAD_REGISTER_MEM < GfxFamily > ) : 0u ;
2021-09-14 01:39:55 +08:00
return size ;
2021-04-24 00:43:48 +08:00
}
2023-12-21 22:33:38 +08:00
template < typename GfxFamily , typename WalkerType >
2021-09-14 01:39:55 +08:00
uint64_t computeWalkerSectionStart ( WalkerPartitionArgs & args ) {
2023-12-21 22:33:38 +08:00
return computeControlSectionOffset < GfxFamily , WalkerType > ( args ) -
computeWalkerSectionSize < GfxFamily , WalkerType > ( ) ;
2021-04-24 00:43:48 +08:00
}
2023-11-23 21:58:58 +08:00
// Emits the walker command into the stream. When partitionCount > 1, configures
// inputWalker for hardware partitioning (enable bit + partition size along the
// selected dimension) before copying it into the reserved slot.
// Returns the location of the emitted walker within the stream.
template <typename GfxFamily, typename WalkerType>
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
                               WalkerType *inputWalker,
                               uint32_t partitionCount,
                               uint32_t tileCount,
                               bool forceExecutionOnSingleTile) {
    auto computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);

    if (partitionCount > 1) {
        auto partitionType = inputWalker->getPartitionType();

        // Partitioning requires a uniform (zero) starting point and an explicitly chosen type.
        assert(inputWalker->getThreadGroupIdStartingX() == 0u);
        assert(inputWalker->getThreadGroupIdStartingY() == 0u);
        assert(inputWalker->getThreadGroupIdStartingZ() == 0u);
        assert(partitionType != WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);

        inputWalker->setWorkloadPartitionEnable(true);

        // Workgroup count along the partitioned dimension.
        auto workgroupCount = 0u;
        if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_X) {
            workgroupCount = inputWalker->getThreadGroupIdXDimension();
        } else if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y) {
            workgroupCount = inputWalker->getThreadGroupIdYDimension();
        } else {
            workgroupCount = inputWalker->getThreadGroupIdZDimension();
        }

        if (forceExecutionOnSingleTile) {
            // Single tile handles the entire range as one partition.
            inputWalker->setPartitionSize(workgroupCount);
        } else {
            inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount));
        }
    }

    // Apply platform-specific walker adjustments before the final copy into the stream.
    appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, tileCount);

    *computeWalker = *inputWalker;
    return computeWalker;
}
/* SAMPLE COMMAND BUFFER STRUCTURE, birds eye view for 16 partitions, 4 tiles
//initial setup section
1. MI_LOAD_REGISTER(PREDICATION_MASK, active partition mask)
//loop 1 - loop as long as there are partitions to be serviced
2. MI_ATOMIC_INC(ATOMIC LOCATION #31 within CMD buffer)
3. MI_LOAD_REGISTER_REG(ATOMIC RESULT -> WPARID)
4. MI_SET_PREDICATE(WPARID MODE)
5. BATCH_BUFFER_START(LOCATION #28) // this will not be executed if partition outside of active virtual partitions
//loop 1 ends here, if we are here it means there are no more partitions
6. MI_SET_PREDICATE(OFF)
//Walker synchronization section starts here, make sure that Walker is done
7. PIPE_CONTROL(DC_FLUSH)
//wait for all post syncs to make sure whole work is done, caller needs to set them to 1.
//now the epilogue starts: synchronize all engines prior to coming back to RING; this happens once per command buffer to make sure that all engines actually passed via cmd buffer.
//epilogue section, make sure every tile completed prior to continuing
//This is cross-tile synchronization
24. ATOMIC_INC(LOCATION #31)
25. WAIT_FOR_SEMAPHORE(LOCATION #31, LOWER THAN 4) // wait till all tiles hit atomic
26. PIPE_CONTROL(TAG UPDATE) (not implemented)
27. BATCH_BUFFER_START(LOCATION #32) // go to the very end
//Walker section
28. COMPUTE_WALKER
29. BATCH_BUFFER_START(GO BACK TO #2)
//Batch Buffer Control Data section, there are no real commands here but we have memory here
//That will be updated via atomic operations.
30. uint32_t virtualPartitionID //atomic location
31. uint32_t completionTileID //all tiles need to report completion
32. BATCH_BUFFER_END (optional)
*/
2023-11-23 21:58:58 +08:00
// Emits the dynamically partitioned command buffer described by the layout comment
// above. Work distribution happens at execution time: each tile atomically grabs
// partition IDs (via the wparid register) from the control section until all
// partitions are consumed.
//
// cpuPointer             - CPU-visible start of the command buffer allocation
// outWalkerPtr           - optional out-param: CPU address where the walker was written
// gpuAddressOfAllocation - GPU VA corresponding to cpuPointer
// inputWalker            - walker command used as the template for the partitioned walker
// totalBytesProgrammed   - out: bytes emitted; reset to 0 on entry
// args                   - partitioning configuration
// hwInfo                 - not referenced in this function; presumably kept for
//                          interface compatibility with callers - confirm before removing
template <typename GfxFamily, typename WalkerType>
void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                                  void **outWalkerPtr,
                                                  uint64_t gpuAddressOfAllocation,
                                                  WalkerType *inputWalker,
                                                  uint32_t &totalBytesProgrammed,
                                                  WalkerPartitionArgs &args,
                                                  const NEO::HardwareInfo &hwInfo) {
    totalBytesProgrammed = 0u;
    void *currentBatchBufferPointer = cpuPointer;

    // Offset (from buffer start) of the BatchBufferControlData section; everything
    // emitted below must add up to exactly this many bytes before the control data.
    auto controlSectionOffset = computeControlSectionOffset<GfxFamily, WalkerType>(args);

    // Optionally make all tiles rendezvous (atomic increment + semaphore-style wait)
    // before any of them starts grabbing partitions.
    if (args.synchronizeBeforeExecution) {
        auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
    }

    programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);

    // Atomically claim the next partition ID from the control section.
    programMiAtomic<GfxFamily>(currentBatchBufferPointer,
                               totalBytesProgrammed,
                               gpuAddressOfAllocation + controlSectionOffset,
                               true,
                               MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

    // move atomic result to wparid
    programMiLoadRegisterReg<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, generalPurposeRegister4, wparidCCSOffset);

    // enable predication basing on wparid value
    programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);

    // Predicated jump into the walker section: taken while wparid still names a
    // valid partition, skipped (no-oped) once all partitions are consumed.
    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
                                         totalBytesProgrammed,
                                         gpuAddressOfAllocation +
                                             computeWalkerSectionStart<GfxFamily, WalkerType>(args),
                                         true,
                                         args.secondaryBatchBuffer);

    // disable predication to not noop subsequent commands.
    programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);

    if (args.emitSelfCleanup) {
        const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
        programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
    }

    if (args.emitPipeControlStall) {
        NEO::PipeControlArgs pipeControlArgs;
        pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
        programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
    }

    // Wait on each partition's post-sync write before proceeding.
    // +8 skips to the data field of the post-sync entry; entries are 16 bytes apart.
    if (args.semaphoreProgrammingRequired) {
        auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
        for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
            programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
        }
    }

    // Cross-tile rendezvous after the walker; also required when self-cleanup runs,
    // so cleanup does not race with tiles still executing.
    if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
        auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
    }

    // Restore wparid from the work-partition allocation for subsequent static use.
    if (args.preferredStaticPartitioning) {
        programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
    }

    // this bb start goes to the end of partitioned command buffer
    programMiBatchBufferStart<GfxFamily>(
        currentBatchBufferPointer,
        totalBytesProgrammed,
        gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
        false,
        args.secondaryBatchBuffer);

    // Walker section
    auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile);
    if (outWalkerPtr) {
        *outWalkerPtr = walkerPtr;
    }

    // Loop back to the top (partition-grab sequence) after each walker execution.
    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);

    // Control section: zero-initialized counters updated by the GPU via atomics.
    auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
    controlSection->partitionCount = 0u;
    controlSection->tileCount = 0u;
    controlSection->inTileCount = 0u;
    controlSection->finalSyncTileCount = 0u;
    totalBytesProgrammed += sizeof(BatchBufferControlData);
    currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));

    if (args.emitSelfCleanup) {
        const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
        programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
                                                totalBytesProgrammed,
                                                finalSyncTileCountAddress,
                                                gpuAddressOfAllocation + controlSectionOffset,
                                                dynamicPartitioningFieldsForCleanupCount,
                                                args);
    }

    if (args.emitBatchBufferEnd) {
        auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
        *batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
    }
}
2021-10-29 18:12:13 +08:00
// Tells whether the static-partitioning path must emit the jump-over BB_START
// and the control data section: they are needed whenever any cross-tile
// synchronization or self-cleanup counters will be touched.
template <typename GfxFamily>
bool isStartAndControlSectionRequired(WalkerPartitionArgs &args) {
    if (args.synchronizeBeforeExecution) {
        return true;
    }
    if (args.crossTileAtomicSynchronization) {
        return true;
    }
    return args.emitSelfCleanup;
}
2023-12-21 22:33:38 +08:00
template < typename GfxFamily , typename WalkerType >
2021-09-14 01:39:55 +08:00
uint64_t computeStaticPartitioningControlSectionOffset ( WalkerPartitionArgs & args ) {
const auto beforeExecutionSyncAtomicSize = args . synchronizeBeforeExecution
? computeTilesSynchronizationWithAtomicsSectionSize < GfxFamily > ( )
: 0u ;
2021-09-16 20:11:22 +08:00
const auto afterExecutionSyncAtomicSize = ( args . crossTileAtomicSynchronization | | args . emitSelfCleanup )
2021-09-14 01:39:55 +08:00
? computeTilesSynchronizationWithAtomicsSectionSize < GfxFamily > ( )
: 0u ;
const auto afterExecutionSyncPostSyncSize = args . semaphoreProgrammingRequired
2023-03-10 21:49:06 +08:00
? NEO : : EncodeSemaphore < GfxFamily > : : getSizeMiSemaphoreWait ( ) * args . partitionCount
2021-09-14 01:39:55 +08:00
: 0u ;
2021-09-16 20:11:22 +08:00
const auto selfCleanupSectionSize = args . emitSelfCleanup
? computeSelfCleanupSectionSize < GfxFamily > ( args . useAtomicsForSelfCleanup )
: 0u ;
2021-09-14 01:39:55 +08:00
const auto wparidRegisterSize = args . initializeWparidRegister
? sizeof ( LOAD_REGISTER_MEM < GfxFamily > )
: 0u ;
2021-09-16 20:11:22 +08:00
const auto pipeControlSize = args . emitPipeControlStall
2022-08-31 22:58:49 +08:00
? NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForSingleBarrier ( false )
2021-09-14 01:39:55 +08:00
: 0u ;
2021-10-29 18:12:13 +08:00
const auto bbStartSize = isStartAndControlSectionRequired < GfxFamily > ( args )
? sizeof ( BATCH_BUFFER_START < GfxFamily > )
: 0u ;
2021-04-24 00:43:48 +08:00
return beforeExecutionSyncAtomicSize +
2021-09-14 01:39:55 +08:00
wparidRegisterSize +
pipeControlSize +
2023-12-21 22:33:38 +08:00
sizeof ( WalkerType ) +
2021-09-16 20:11:22 +08:00
selfCleanupSectionSize +
2021-04-24 00:43:48 +08:00
afterExecutionSyncAtomicSize +
afterExecutionSyncPostSyncSize +
2021-10-29 18:12:13 +08:00
bbStartSize ;
2021-04-24 00:43:48 +08:00
}
2023-11-23 21:58:58 +08:00
template < typename GfxFamily , typename WalkerType >
2021-04-24 00:43:48 +08:00
void constructStaticallyPartitionedCommandBuffer ( void * cpuPointer ,
2023-09-22 19:18:43 +08:00
void * * outWalkerPtr ,
2021-04-24 00:43:48 +08:00
uint64_t gpuAddressOfAllocation ,
2023-11-23 21:58:58 +08:00
WalkerType * inputWalker ,
2021-04-24 00:43:48 +08:00
uint32_t & totalBytesProgrammed ,
2021-12-20 22:37:33 +08:00
WalkerPartitionArgs & args ,
const NEO : : HardwareInfo & hwInfo ) {
2021-04-24 00:43:48 +08:00
totalBytesProgrammed = 0u ;
void * currentBatchBufferPointer = cpuPointer ;
// Get address of the control section
2023-12-21 22:33:38 +08:00
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset < GfxFamily , WalkerType > ( args ) ;
2021-04-24 00:43:48 +08:00
const auto afterControlSectionOffset = controlSectionOffset + sizeof ( StaticPartitioningControlSection ) ;
// Synchronize tiles before walker
2021-09-14 01:39:55 +08:00
if ( args . synchronizeBeforeExecution ) {
2021-04-24 00:43:48 +08:00
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof ( StaticPartitioningControlSection , synchronizeBeforeWalkerCounter ) ;
2021-09-14 01:39:55 +08:00
programTilesSynchronizationWithAtomics < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , atomicAddress , args . tileCount ) ;
2021-04-24 00:43:48 +08:00
}
// Load partition ID to wparid register and execute walker
2021-09-14 01:39:55 +08:00
if ( args . initializeWparidRegister ) {
programMiLoadRegisterMem < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , args . workPartitionAllocationGpuVa , wparidCCSOffset ) ;
}
2024-01-17 21:20:26 +08:00
auto walkerPtr = programPartitionedWalker < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , inputWalker , args . partitionCount , args . tileCount , args . forceExecutionOnSingleTile ) ;
2023-09-22 19:18:43 +08:00
if ( outWalkerPtr ) {
* outWalkerPtr = walkerPtr ;
}
2021-04-24 00:43:48 +08:00
// Prepare for cleanup section
2021-09-16 20:11:22 +08:00
if ( args . emitSelfCleanup ) {
2021-04-24 00:43:48 +08:00
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof ( StaticPartitioningControlSection , finalSyncTileCounter ) ;
2021-09-16 20:11:22 +08:00
programSelfCleanupSection < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , finalSyncTileCountField , args . useAtomicsForSelfCleanup ) ;
2021-04-24 00:43:48 +08:00
}
2021-09-16 20:11:22 +08:00
if ( args . emitPipeControlStall ) {
2022-10-14 20:34:24 +08:00
NEO : : PipeControlArgs pipeControlArgs ;
pipeControlArgs . dcFlushEnable = args . dcFlushEnable ;
programPipeControlCommand < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , pipeControlArgs ) ;
2021-09-14 01:39:55 +08:00
}
2021-07-01 17:41:08 +08:00
2021-04-24 00:43:48 +08:00
// Synchronize tiles after walker
2021-09-14 01:39:55 +08:00
if ( args . semaphoreProgrammingRequired ) {
2023-11-23 21:58:58 +08:00
programTilesSynchronizationWithPostSyncs < GfxFamily , WalkerType > ( currentBatchBufferPointer , totalBytesProgrammed , inputWalker , args . partitionCount ) ;
2021-04-24 00:43:48 +08:00
}
2021-09-14 01:39:55 +08:00
2021-09-16 20:11:22 +08:00
if ( args . crossTileAtomicSynchronization | | args . emitSelfCleanup ) {
2021-04-24 00:43:48 +08:00
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof ( StaticPartitioningControlSection , synchronizeAfterWalkerCounter ) ;
2021-09-14 01:39:55 +08:00
programTilesSynchronizationWithAtomics < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , atomicAddress , args . tileCount ) ;
2021-04-24 00:43:48 +08:00
}
2021-10-29 18:12:13 +08:00
// Jump over the control section only when needed
if ( isStartAndControlSectionRequired < GfxFamily > ( args ) ) {
programMiBatchBufferStart < GfxFamily > ( currentBatchBufferPointer , totalBytesProgrammed , gpuAddressOfAllocation + afterControlSectionOffset , false , args . secondaryBatchBuffer ) ;
2021-04-24 00:43:48 +08:00
2021-10-29 18:12:13 +08:00
// Control section
DEBUG_BREAK_IF ( totalBytesProgrammed ! = controlSectionOffset ) ;
StaticPartitioningControlSection * controlSection = putCommand < StaticPartitioningControlSection > ( currentBatchBufferPointer , totalBytesProgrammed ) ;
controlSection - > synchronizeBeforeWalkerCounter = 0u ;
controlSection - > synchronizeAfterWalkerCounter = 0u ;
controlSection - > finalSyncTileCounter = 0u ;
DEBUG_BREAK_IF ( totalBytesProgrammed ! = afterControlSectionOffset ) ;
}
2021-04-24 00:43:48 +08:00
// Cleanup section
2021-09-16 20:11:22 +08:00
if ( args . emitSelfCleanup ) {
2021-04-24 00:43:48 +08:00
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof ( StaticPartitioningControlSection , finalSyncTileCounter ) ;
2021-09-16 20:11:22 +08:00
programSelfCleanupEndSection < GfxFamily > ( currentBatchBufferPointer ,
totalBytesProgrammed ,
finalSyncTileCountAddress ,
gpuAddressOfAllocation + controlSectionOffset ,
staticPartitioningFieldsForCleanupCount ,
2022-10-14 08:11:28 +08:00
args ) ;
2021-04-24 00:43:48 +08:00
}
}
2023-12-21 22:33:38 +08:00
template < typename GfxFamily , typename WalkerType >
2021-09-14 01:39:55 +08:00
uint64_t estimateSpaceRequiredInCommandBuffer ( WalkerPartitionArgs & args ) {
2021-04-24 00:43:48 +08:00
uint64_t size = { } ;
2021-09-14 01:39:55 +08:00
if ( args . staticPartitioning ) {
2023-12-21 22:33:38 +08:00
size + = computeStaticPartitioningControlSectionOffset < GfxFamily , WalkerType > ( args ) ;
2021-10-29 18:12:13 +08:00
size + = isStartAndControlSectionRequired < GfxFamily > ( args ) ? sizeof ( StaticPartitioningControlSection ) : 0u ;
2022-10-14 08:11:28 +08:00
size + = args . emitSelfCleanup ? computeSelfCleanupEndSectionSize < GfxFamily > ( staticPartitioningFieldsForCleanupCount , args ) : 0u ;
2021-04-24 00:43:48 +08:00
} else {
2023-12-21 22:33:38 +08:00
size + = computeControlSectionOffset < GfxFamily , WalkerType > ( args ) ;
2021-04-24 00:43:48 +08:00
size + = sizeof ( BatchBufferControlData ) ;
2021-09-14 01:39:55 +08:00
size + = args . emitBatchBufferEnd ? sizeof ( BATCH_BUFFER_END < GfxFamily > ) : 0u ;
2022-10-14 08:11:28 +08:00
size + = args . emitSelfCleanup ? computeSelfCleanupEndSectionSize < GfxFamily > ( dynamicPartitioningFieldsForCleanupCount , args ) : 0u ;
2021-04-24 00:43:48 +08:00
}
return size ;
}
2021-10-29 18:12:13 +08:00
template < typename GfxFamily >
2021-11-11 03:56:42 +08:00
uint64_t computeBarrierControlSectionOffset ( WalkerPartitionArgs & args ,
2023-01-26 11:58:18 +08:00
const NEO : : RootDeviceEnvironment & rootDeviceEnvironment ) {
2021-10-29 18:12:13 +08:00
uint64_t offset = 0u ;
if ( args . emitSelfCleanup ) {
offset + = computeSelfCleanupSectionSize < GfxFamily > ( args . useAtomicsForSelfCleanup ) ;
}
2021-11-11 03:56:42 +08:00
if ( args . usePostSync ) {
2023-01-26 11:58:18 +08:00
offset + = NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForBarrierWithPostSyncOperation ( rootDeviceEnvironment , false ) ;
2021-11-11 03:56:42 +08:00
} else {
2022-08-19 23:56:22 +08:00
offset + = NEO : : MemorySynchronizationCommands < GfxFamily > : : getSizeForSingleBarrier ( false ) ;
2021-11-11 03:56:42 +08:00
}
offset + = ( computeTilesSynchronizationWithAtomicsSectionSize < GfxFamily > ( ) +
2021-10-29 18:12:13 +08:00
sizeof ( BATCH_BUFFER_START < GfxFamily > ) ) ;
return offset ;
}
template < typename GfxFamily >
2021-11-11 03:56:42 +08:00
uint64_t estimateBarrierSpaceRequiredInCommandBuffer ( WalkerPartitionArgs & args ,
2023-01-26 11:58:18 +08:00
const NEO : : RootDeviceEnvironment & rootDeviceEnvironment ) {
uint64_t size = computeBarrierControlSectionOffset < GfxFamily > ( args , rootDeviceEnvironment ) +
2021-10-29 18:12:13 +08:00
sizeof ( BarrierControlSection ) ;
if ( args . emitSelfCleanup ) {
2022-10-14 08:11:28 +08:00
size + = computeSelfCleanupEndSectionSize < GfxFamily > ( barrierControlSectionFieldsForCleanupCount , args ) ;
2021-10-29 18:12:13 +08:00
}
return size ;
}
// Emits a multi-tile barrier command buffer: optional self-cleanup init, a pipe
// control (optionally with a post-sync write), a cross-tile atomic rendezvous,
// and a jump over the control data, followed by the optional cleanup end section.
//
// NOTE(review): unlike the construct*PartitionedCommandBuffer functions,
// totalBytesProgrammed is NOT reset here - the DEBUG_BREAK_IF checks below only
// hold if the caller passes it in as 0; confirm against call sites.
template <typename GfxFamily>
void constructBarrierCommandBuffer(void *cpuPointer,
                                   uint64_t gpuAddressOfAllocation,
                                   uint32_t &totalBytesProgrammed,
                                   WalkerPartitionArgs &args,
                                   NEO::PipeControlArgs &flushArgs,
                                   const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    void *currentBatchBufferPointer = cpuPointer;
    const auto controlSectionOffset = computeBarrierControlSectionOffset<GfxFamily>(args, rootDeviceEnvironment);
    // GPU VA of the finalSyncTileCount counter inside the control section;
    // used both by the cleanup-init and the cleanup-end sections.
    const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, finalSyncTileCount);
    if (args.emitSelfCleanup) {
        programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
    }

    // The actual barrier: with or without a post-sync operation.
    if (args.usePostSync) {
        programPostSyncPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args, flushArgs, rootDeviceEnvironment);
    } else {
        programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, flushArgs);
    }

    // All tiles rendezvous on the crossTileSyncCount counter.
    const auto crossTileSyncCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, crossTileSyncCount);
    programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, crossTileSyncCountField, args.tileCount);

    // Jump over the control data so the GPU never executes it as commands.
    const auto afterControlSectionOffset = controlSectionOffset + sizeof(BarrierControlSection);
    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);

    // Control section: zero-initialized counters updated by the GPU via atomics.
    DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
    BarrierControlSection *controlSection = putCommand<BarrierControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
    controlSection->crossTileSyncCount = 0u;
    controlSection->finalSyncTileCount = 0u;
    DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);

    if (args.emitSelfCleanup) {
        programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
                                                totalBytesProgrammed,
                                                finalSyncTileCountField,
                                                gpuAddressOfAllocation + controlSectionOffset,
                                                barrierControlSectionFieldsForCleanupCount,
                                                args);
    }
}
2021-04-24 00:43:48 +08:00
} // namespace WalkerPartition