/*
 * Copyright (C) 2021-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/walker_partition_interface.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/common_types.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/string.h"

#include <algorithm>
#include <cassert>
#include <optional>

namespace NEO {
struct PipeControlArgs;
}

namespace WalkerPartition {

template <typename GfxFamily>
using POSTSYNC_DATA = typename GfxFamily::POSTSYNC_DATA;
template <typename GfxFamily>
using BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
template <typename GfxFamily>
using BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
template <typename GfxFamily>
using LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
template <typename GfxFamily>
using LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
template <typename GfxFamily>
using MI_SET_PREDICATE = typename GfxFamily::MI_SET_PREDICATE;
template <typename GfxFamily>
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
template <typename GfxFamily>
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
template <typename GfxFamily>
using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE;
template <typename GfxFamily>
using LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
template <typename GfxFamily>
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
template <typename GfxFamily>
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
template <typename GfxFamily>
using POST_SYNC_OPERATION = typename PIPE_CONTROL<GfxFamily>::POST_SYNC_OPERATION;

template <typename GfxFamily, typename WalkerType>
void appendWalkerFields(WalkerType &walkerCmd, uint32_t tileCount, uint32_t workgroupCount);

// Reserves space for a command of the given type, advances the write pointer and the
// programmed-bytes counter, and returns a typed pointer to the reserved space.
template <typename Command>
Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
    totalBytesProgrammed += sizeof(Command);
    auto commandToReturn = reinterpret_cast<Command *>(inputAddress);
    inputAddress = ptrOffset(inputAddress, sizeof(Command));
    return commandToReturn;
}

inline void *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, size_t commandSize) {
    totalBytesProgrammed += static_cast<uint32_t>(commandSize);
    auto commandToReturn = inputAddress;
    inputAddress = ptrOffset(inputAddress, commandSize);
    return commandToReturn;
}
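// Illustrative sketch (not part of the interface): how the typed putCommand overload is
// meant to be used. The cursor and counter names are assumptions for the example only.
//
//   void *cursor = commandBufferCpuPointer;
//   uint32_t bytesProgrammed = 0u;
//   auto *bbStart = putCommand<BATCH_BUFFER_START<GfxFamily>>(cursor, bytesProgrammed);
//   *bbStart = GfxFamily::cmdInitBatchBufferStart;
//   // cursor advanced by sizeof(BATCH_BUFFER_START<GfxFamily>); bytesProgrammed matches it.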
template <typename WalkerType>
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
                                               bool preferStaticPartitioning,
                                               const Vec3<size_t> &groupStart,
                                               const Vec3<size_t> &groupCount,
                                               std::optional<typename WalkerType::PARTITION_TYPE> requestedPartitionType,
                                               typename WalkerType::PARTITION_TYPE *outSelectedPartitionType,
                                               bool *outSelectStaticPartitioning) {
    using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;

    // For a non-uniform starting point there is no hardware support for partitioning.
    // Disable partitioning and select the dynamic algorithm.
    if (groupStart.x || groupStart.y || groupStart.z) {
        *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
        *outSelectStaticPartitioning = false;
        return 1u;
    }

    size_t workgroupCount = 0u;
    bool disablePartitionForPartitionCountOne{};

    if (NEO::debugManager.flags.ExperimentalSetWalkerPartitionType.get() != -1) {
        requestedPartitionType = static_cast<PARTITION_TYPE>(NEO::debugManager.flags.ExperimentalSetWalkerPartitionType.get());
    }

    if (requestedPartitionType.has_value()) {
        switch (requestedPartitionType.value()) {
        case PARTITION_TYPE::PARTITION_TYPE_X:
            workgroupCount = groupCount.x;
            break;
        case PARTITION_TYPE::PARTITION_TYPE_Y:
            workgroupCount = groupCount.y;
            break;
        case PARTITION_TYPE::PARTITION_TYPE_Z:
            workgroupCount = groupCount.z;
            break;
        default:
            UNRECOVERABLE_IF(true);
        }

        *outSelectedPartitionType = requestedPartitionType.value();
        disablePartitionForPartitionCountOne = false;
    } else {
        const size_t maxDimension = std::max({groupCount.z, groupCount.y, groupCount.x});

        auto goWithMaxAlgorithm = !preferStaticPartitioning;
        if (NEO::debugManager.flags.WalkerPartitionPreferHighestDimension.get() != -1) {
            goWithMaxAlgorithm = (NEO::debugManager.flags.WalkerPartitionPreferHighestDimension.get() == 0);
        }

        // Compute the misaligned fraction; accept an imbalance below the threshold in favor of a Z/Y/X distribution.
        const float minimalThreshold = 0.05f;
        float zImbalance = static_cast<float>(groupCount.z - alignDown(groupCount.z, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.z);
        float yImbalance = static_cast<float>(groupCount.y - alignDown(groupCount.y, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.y);

        // Try the deepest dimension first to see if we can partition there.
        if (groupCount.z > 1 && (zImbalance <= minimalThreshold)) {
            *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
        } else if (groupCount.y > 1 && (yImbalance < minimalThreshold)) {
            *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
        } else if (groupCount.x % preferredMinimalPartitionCount == 0) {
            *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
        }
        // If we get here, no dimension results in an even distribution; choose the max dimension to minimize the impact.
        else {
            goWithMaxAlgorithm = true;
        }

        if (goWithMaxAlgorithm) {
            // Default mode: select the greatest dimension.
            if (maxDimension == groupCount.x) {
                *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
            } else if (maxDimension == groupCount.y) {
                *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
            } else {
                *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
            }
        }

        workgroupCount = maxDimension;
        disablePartitionForPartitionCountOne = true;
    }

    // Static partitioning - partition count == tile count.
    *outSelectStaticPartitioning = preferStaticPartitioning;
    if (preferStaticPartitioning) {
        return preferredMinimalPartitionCount;
    }

    // Dynamic partitioning - compute the optimal partition count.
    size_t partitionCount = std::min(static_cast<size_t>(16u), workgroupCount);
    partitionCount = Math::prevPowerOfTwo(partitionCount);
    if (NEO::debugManager.flags.SetMinimalPartitionSize.get() != 0) {
        const auto workgroupPerPartitionThreshold = NEO::debugManager.flags.SetMinimalPartitionSize.get() == -1
                                                        ? 512u
                                                        : static_cast<uint32_t>(NEO::debugManager.flags.SetMinimalPartitionSize.get());
        preferredMinimalPartitionCount = std::max(2u, preferredMinimalPartitionCount);

        while (partitionCount > preferredMinimalPartitionCount) {
            auto workgroupsPerPartition = workgroupCount / partitionCount;
            if (workgroupsPerPartition >= workgroupPerPartitionThreshold) {
                break;
            }
            partitionCount = partitionCount / 2;
        }
    }

    if (partitionCount == 1u && disablePartitionForPartitionCountOne) {
        *outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
    }

    return static_cast<uint32_t>(partitionCount);
}
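// Worked example (illustrative values): groupStart = {0, 0, 0}, groupCount = {100, 40, 1},
// preferredMinimalPartitionCount = 4 (the tile count), no requested partition type.
// With preferStaticPartitioning = true:
//   zImbalance is skipped (groupCount.z == 1); yImbalance = (40 - alignDown(40, 4)) / 40 = 0 < 0.05,
//   so PARTITION_TYPE_Y is selected and the function returns preferredMinimalPartitionCount = 4.
// With preferStaticPartitioning = false the max-dimension mode overrides that choice:
//   PARTITION_TYPE_X (maxDimension == 100), partitionCount = prevPowerOfTwo(min(16, 100)) = 16,
//   and the default 512-workgroups-per-partition threshold halves it 16 -> 8 -> 4.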
template <typename WalkerType>
uint32_t computePartitionCountAndSetPartitionType(WalkerType *walker,
                                                  NEO::RequiredPartitionDim requiredPartitionDim,
                                                  uint32_t preferredMinimalPartitionCount,
                                                  bool preferStaticPartitioning,
                                                  bool *outSelectStaticPartitioning) {
    using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;

    const Vec3<size_t> groupStart = {walker->getThreadGroupIdStartingX(), walker->getThreadGroupIdStartingY(), walker->getThreadGroupIdStartingZ()};
    const Vec3<size_t> groupCount = {walker->getThreadGroupIdXDimension(), walker->getThreadGroupIdYDimension(), walker->getThreadGroupIdZDimension()};

    std::optional<PARTITION_TYPE> requestedPartitionType{};
    switch (requiredPartitionDim) {
    case NEO::RequiredPartitionDim::x:
        requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
        break;
    case NEO::RequiredPartitionDim::y:
        requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
        break;
    case NEO::RequiredPartitionDim::z:
        requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
        break;
    default:
        UNRECOVERABLE_IF(requiredPartitionDim != NEO::RequiredPartitionDim::none);
        break;
    }

    PARTITION_TYPE partitionType{};
    const auto partitionCount = computePartitionCountAndPartitionType<WalkerType>(preferredMinimalPartitionCount,
                                                                                  preferStaticPartitioning,
                                                                                  groupStart,
                                                                                  groupCount,
                                                                                  requestedPartitionType,
                                                                                  &partitionType,
                                                                                  outSelectStaticPartitioning);
    walker->setPartitionType(partitionType);
    return partitionCount;
}

template <typename GfxFamily>
void programRegisterWithValue(void *&inputAddress, uint32_t registerOffset, uint32_t &totalBytesProgrammed, uint32_t registerValue) {
    auto loadRegisterImmediate = putCommand<LOAD_REGISTER_IMM<GfxFamily>>(inputAddress, totalBytesProgrammed);
    LOAD_REGISTER_IMM<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterImm;

    cmd.setRegisterOffset(registerOffset);
    cmd.setDataDword(registerValue);
    cmd.setMmioRemapEnable(true);
    *loadRegisterImmediate = cmd;
}

template <typename GfxFamily>
void programWaitForSemaphore(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, uint32_t semaphoreCompareValue, typename MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION compareOperation) {
    auto semaphoreWait = putCommand<MI_SEMAPHORE_WAIT<GfxFamily>>(inputAddress, totalBytesProgrammed);
    NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(semaphoreWait, gpuAddress, semaphoreCompareValue, compareOperation, false, true, false, false, false);
}

template <typename GfxFamily>
bool programWparidMask(void *&inputAddress, uint32_t &totalBytesProgrammed, uint32_t partitionCount) {
    // Currently only power-of-2 partition counts are supported.
    if (!Math::isPow2(partitionCount) || partitionCount > 16) {
        return false;
    }

    auto mask = 0xFFE0;
    auto fillValue = 0x10;
    auto count = partitionCount;
    while (count < 16) {
        fillValue |= (fillValue >> 1);
        count *= 2;
    }
    mask |= fillValue;

    programRegisterWithValue<GfxFamily>(inputAddress, predicationMaskCCSOffset, totalBytesProgrammed, mask);
    return true;
}
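// Worked example for programWparidMask above (illustrative): partitionCount = 4.
//   fillValue = 0x10, count = 4:  fillValue |= (0x10 >> 1) -> 0x18, count = 8
//                     count = 8:  fillValue |= (0x18 >> 1) -> 0x1C, count = 16 (loop ends)
//   mask = 0xFFE0 | 0x1C = 0xFFFC
// Only the two low bits stay unmasked, consistent with the four active partitions (IDs 0..3).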
template <typename GfxFamily>
void programWparidPredication(void *&inputAddress, uint32_t &totalBytesProgrammed, bool predicationEnabled) {
    auto miSetPredicate = putCommand<MI_SET_PREDICATE<GfxFamily>>(inputAddress, totalBytesProgrammed);
    MI_SET_PREDICATE<GfxFamily> cmd = GfxFamily::cmdInitSetPredicate;

    if (predicationEnabled) {
        cmd.setPredicateEnableWparid(MI_SET_PREDICATE<GfxFamily>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE);
    } else {
        cmd.setPredicateEnable(MI_SET_PREDICATE<GfxFamily>::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE);
    }
    *miSetPredicate = cmd;
}

template <typename GfxFamily>
void programMiAtomic(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, bool requireReturnValue, typename MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES atomicOpcode) {
    auto miAtomic = putCommand<MI_ATOMIC<GfxFamily>>(inputAddress, totalBytesProgrammed);
    NEO::EncodeAtomic<GfxFamily>::programMiAtomic(miAtomic, gpuAddress, atomicOpcode, DATA_SIZE<GfxFamily>::DATA_SIZE_DWORD,
                                                  requireReturnValue, requireReturnValue, 0x0u, 0x0u);
}

template <typename GfxFamily>
void programMiBatchBufferStart(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, bool predicationEnabled, bool secondary) {
    auto batchBufferStart = putCommand<BATCH_BUFFER_START<GfxFamily>>(inputAddress, totalBytesProgrammed);
    BATCH_BUFFER_START<GfxFamily> cmd = GfxFamily::cmdInitBatchBufferStart;

    cmd.setSecondLevelBatchBuffer(static_cast<typename BATCH_BUFFER_START<GfxFamily>::SECOND_LEVEL_BATCH_BUFFER>(secondary));
    cmd.setAddressSpaceIndicator(BATCH_BUFFER_START<GfxFamily>::ADDRESS_SPACE_INDICATOR::ADDRESS_SPACE_INDICATOR_PPGTT);
    cmd.setPredicationEnable(predicationEnabled);
    cmd.setBatchBufferStartAddress(gpuAddress);
    *batchBufferStart = cmd;
}

template <typename GfxFamily>
void programMiLoadRegisterReg(void *&inputAddress, uint32_t &totalBytesProgrammed, uint32_t sourceRegisterOffset, uint32_t destinationRegisterOffset) {
    auto loadRegisterReg = putCommand<LOAD_REGISTER_REG<GfxFamily>>(inputAddress, totalBytesProgrammed);
    LOAD_REGISTER_REG<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterReg;

    cmd.setMmioRemapEnableSource(true);
    cmd.setMmioRemapEnableDestination(true);
    cmd.setSourceRegisterAddress(sourceRegisterOffset);
    cmd.setDestinationRegisterAddress(destinationRegisterOffset);
    *loadRegisterReg = cmd;
}

template <typename GfxFamily>
void programMiLoadRegisterMem(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddressToLoad, uint32_t destinationRegisterOffset) {
    auto loadRegisterMem = putCommand<LOAD_REGISTER_MEM<GfxFamily>>(inputAddress, totalBytesProgrammed);
    LOAD_REGISTER_MEM<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterMem;

    cmd.setMmioRemapEnable(true);
    cmd.setMemoryAddress(gpuAddressToLoad);
    cmd.setRegisterAddress(destinationRegisterOffset);
    *loadRegisterMem = cmd;
}

template <typename GfxFamily>
void programPipeControlCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, NEO::PipeControlArgs &flushArgs) {
    auto singleBarrierSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(flushArgs.tlbInvalidation);
    auto pipeControl = putCommand(inputAddress, totalBytesProgrammed, singleBarrierSize);

    // Stage the barrier in a local buffer, then copy only the bytes that were actually programmed.
    UNRECOVERABLE_IF(sizeof(PIPE_CONTROL<GfxFamily>) < singleBarrierSize);
    uint8_t cmd[sizeof(PIPE_CONTROL<GfxFamily>)] = {};
    NEO::MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(cmd, flushArgs);
    memcpy_s(pipeControl, singleBarrierSize, cmd, singleBarrierSize);
}

template <typename GfxFamily>
void programPostSyncPipeControlCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, WalkerPartitionArgs &args, NEO::PipeControlArgs &flushArgs, const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    NEO::MemorySynchronizationCommands<GfxFamily>::setBarrierWithPostSyncOperation(inputAddress,
                                                                                   NEO::PostSyncMode::immediateData,
                                                                                   args.postSyncGpuAddress,
                                                                                   args.postSyncImmediateValue,
                                                                                   rootDeviceEnvironment,
                                                                                   flushArgs);
    totalBytesProgrammed += static_cast<uint32_t>(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, flushArgs.tlbInvalidation));
}

template <typename GfxFamily>
void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, uint32_t data) {
    auto storeDataImmediate = putCommand<MI_STORE_DATA_IMM<GfxFamily>>(inputAddress, totalBytesProgrammed);
    MI_STORE_DATA_IMM<GfxFamily> cmd = GfxFamily::cmdInitStoreDataImm;

    cmd.setAddress(gpuAddress);
    cmd.setStoreQword(false);
    cmd.setDwordLength(MI_STORE_DATA_IMM<GfxFamily>::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
    cmd.setDataDword0(static_cast<uint32_t>(data));
    *storeDataImmediate = cmd;
}
template <typename GfxFamily>
uint64_t computeSelfCleanupSectionSize(bool useAtomicsForSelfCleanup) {
    if (useAtomicsForSelfCleanup) {
        return sizeof(MI_ATOMIC<GfxFamily>);
    } else {
        return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
    }
}

template <typename GfxFamily>
void programSelfCleanupSection(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t address, bool useAtomicsForSelfCleanup) {
    if (useAtomicsForSelfCleanup) {
        programMiAtomic<GfxFamily>(inputAddress,
                                   totalBytesProgrammed,
                                   address,
                                   false,
                                   MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
    } else {
        programStoreMemImmediateDword<GfxFamily>(inputAddress, totalBytesProgrammed, address, 0u);
    }
}

template <typename GfxFamily>
uint64_t computeTilesSynchronizationWithAtomicsSectionSize() {
    return sizeof(MI_ATOMIC<GfxFamily>) +
           NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait();
}

template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
                                            uint32_t &totalBytesProgrammed,
                                            uint64_t atomicAddress,
                                            uint32_t tileCount) {
    programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
    programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

template <typename GfxFamily>
uint64_t computeSelfCleanupEndSectionSize(size_t fieldsForCleanupCount, WalkerPartitionArgs &args) {
    size_t extraSize = 0;
    if (args.pipeControlBeforeCleanupCrossTileSync) {
        extraSize += 2 * NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);
    }
    return fieldsForCleanupCount * computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup) +
           2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() + extraSize;
}

template <typename GfxFamily>
void programSelfCleanupEndSection(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t finalSyncTileCountAddress, uint64_t baseAddressForCleanup, size_t fieldsForCleanupCount, WalkerPartitionArgs &args) {
    NEO::PipeControlArgs pipeControlArgs;
    if (args.pipeControlBeforeCleanupCrossTileSync) {
        programPipeControlCommand<GfxFamily>(inputAddress, totalBytesProgrammed, pipeControlArgs);
    }

    // Synchronize tiles, so that the fields are not cleared while still in use.
    programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, args.tileCount);

    for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
        const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
        programSelfCleanupSection<GfxFamily>(inputAddress, totalBytesProgrammed, addressForCleanup, args.useAtomicsForSelfCleanup);
    }

    if (args.pipeControlBeforeCleanupCrossTileSync) {
        programPipeControlCommand<GfxFamily>(inputAddress, totalBytesProgrammed, pipeControlArgs);
    }

    // This synchronization point ensures that all tiles have finished zeroing and will fairly access the control section atomic variables.
    programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * args.tileCount);
}
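// Illustration of the end-section protocol above (assuming a 4-tile configuration):
// each tile increments finalSyncTileCount and waits for it to reach tileCount (4) before
// zeroing the control fields, then increments it again and waits for 2 * tileCount (8).
// The first rendezvous guarantees no tile is still reading the fields while they are
// cleared; the second keeps any tile from reusing the control section before the
// zeroing is complete everywhere.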
template <typename GfxFamily, typename WalkerType>
void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
                                              uint32_t &totalBytesProgrammed,
                                              WalkerType *inputWalker,
                                              uint32_t partitionCount) {
    const auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
    for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
        programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
    }
}

template <typename GfxFamily, typename WalkerType>
uint64_t computeWalkerSectionSize() {
    return sizeof(BATCH_BUFFER_START<GfxFamily>) +
           sizeof(WalkerType);
}

template <typename GfxFamily, typename WalkerType>
uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
    uint64_t size = 0u;

    size += args.synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() : 0;
    size += sizeof(LOAD_REGISTER_IMM<GfxFamily>); // predication mask
    size += sizeof(MI_ATOMIC<GfxFamily>);         // current id for partition
    size += sizeof(LOAD_REGISTER_REG<GfxFamily>); // id into register
    size += sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
            sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
    size += (args.semaphoreProgrammingRequired ? NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait() * args.partitionCount : 0u);
    size += computeWalkerSectionSize<GfxFamily, WalkerType>();
    size += args.emitPipeControlStall ? NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) : 0u;
    if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
        size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
    }
    if (args.emitSelfCleanup) {
        size += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
    }
    size += args.preferredStaticPartitioning ? sizeof(LOAD_REGISTER_MEM<GfxFamily>) : 0u;
    return size;
}

template <typename GfxFamily, typename WalkerType>
uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
    return computeControlSectionOffset<GfxFamily, WalkerType>(args) -
           computeWalkerSectionSize<GfxFamily, WalkerType>();
}

template <typename GfxFamily, typename WalkerType>
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed, WalkerType *inputWalker, uint32_t partitionCount, uint32_t tileCount, bool forceExecutionOnSingleTile, bool blockDispatchToCommandBuffer) {
    WalkerType *computeWalker = nullptr;
    if (!blockDispatchToCommandBuffer) {
        computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
    }

    if (partitionCount > 1) {
        auto partitionType = inputWalker->getPartitionType();

        assert(inputWalker->getThreadGroupIdStartingX() == 0u);
        assert(inputWalker->getThreadGroupIdStartingY() == 0u);
        assert(inputWalker->getThreadGroupIdStartingZ() == 0u);
        assert(partitionType != WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);

        inputWalker->setWorkloadPartitionEnable(true);

        uint32_t workgroupCount = 0;
        if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_X) {
            workgroupCount = inputWalker->getThreadGroupIdXDimension();
        } else if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y) {
            workgroupCount = inputWalker->getThreadGroupIdYDimension();
        } else {
            workgroupCount = inputWalker->getThreadGroupIdZDimension();
        }

        if (forceExecutionOnSingleTile) {
            inputWalker->setPartitionSize(workgroupCount);
        } else {
            inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount));
        }
        appendWalkerFields<GfxFamily>(*inputWalker, tileCount, workgroupCount);
    }

    if (!blockDispatchToCommandBuffer) {
        *computeWalker = *inputWalker;
    }
    return computeWalker;
}

/* SAMPLE COMMAND BUFFER STRUCTURE, bird's-eye view for 16 partitions, 4 tiles

// initial setup section
1. MI_LOAD_REGISTER ( PREDICATION_MASK, active partition mask )

// loop 1 - loop as long as there are partitions to be serviced
2. MI_ATOMIC_INC ( ATOMIC LOCATION #31 within CMD buffer )
3. MI_LOAD_REGISTER_REG ( ATOMIC RESULT -> WPARID )
4. MI_SET_PREDICATE ( WPARID MODE )
5. BATCH_BUFFER_START ( LOCATION #28 ) // not executed if the partition is outside of the active virtual partitions

// loop 1 ends here; if we are here, there are no more partitions
6. MI_SET_PREDICATE ( OFF )

// Walker synchronization section starts here; make sure that the walker is done
7. PIPE_CONTROL ( DC_FLUSH )
// wait for all post syncs to make sure the whole work is done; the caller needs to set them to 1.
// Now the epilogue starts: synchronize all engines prior to returning to the ring,
// once per command buffer, to make sure that all engines actually went through the command buffer.

// epilogue section, make sure every tile completed prior to continuing
// This is cross-tile synchronization.
24. ATOMIC_INC ( LOCATION #31 )
25. WAIT_FOR_SEMAPHORE ( LOCATION #31, >= 4 ) // wait until all tiles hit the atomic
26. PIPE_CONTROL ( TAG UPDATE ) (not implemented)
27. BATCH_BUFFER_START ( LOCATION #32 ) // go to the very end

// Walker section
28. COMPUTE_WALKER
29. BATCH_BUFFER_START ( GO BACK TO #2 )

// Batch Buffer Control Data section: there are no real commands here, just memory
// that will be updated via atomic operations.
30. uint32_t virtualPartitionID // atomic location
31. uint32_t completionTileID   // all tiles need to report completion
32. BATCH_BUFFER_END ( optional )
*/
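// Layout note: computeControlSectionOffset() is the glue between the numbered picture
// above and the builder below. It sums the setup LRI, the WPARID atomic and LRR, both
// MI_SET_PREDICATEs, both BB_STARTs, the optional semaphore waits, pipe control and
// cleanup sections, plus the walker section, so BatchBufferControlData (locations
// #30-#31) starts exactly at the returned offset. The walker section is laid out last,
// which is why computeWalkerSectionStart() is simply that offset minus
// computeWalkerSectionSize().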
template <typename GfxFamily, typename WalkerType>
void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                                  void **outWalkerPtr,
                                                  uint64_t gpuAddressOfAllocation,
                                                  WalkerType *inputWalker,
                                                  uint32_t &totalBytesProgrammed,
                                                  WalkerPartitionArgs &args,
                                                  const NEO::HardwareInfo &hwInfo) {
    totalBytesProgrammed = 0u;
    void *currentBatchBufferPointer = cpuPointer;

    auto controlSectionOffset = computeControlSectionOffset<GfxFamily, WalkerType>(args);
    if (args.synchronizeBeforeExecution) {
        auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
    }

    programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);

    programMiAtomic<GfxFamily>(currentBatchBufferPointer,
                               totalBytesProgrammed,
                               gpuAddressOfAllocation + controlSectionOffset,
                               true,
                               MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

    // Move the atomic result into WPARID.
    programMiLoadRegisterReg<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, generalPurposeRegister4, wparidCCSOffset);

    // Enable predication based on the WPARID value.
    programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);

    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
                                         totalBytesProgrammed,
                                         gpuAddressOfAllocation + computeWalkerSectionStart<GfxFamily, WalkerType>(args),
                                         true,
                                         args.secondaryBatchBuffer);

    // Disable predication so that subsequent commands are not no-op'ed.
    programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);

    if (args.emitSelfCleanup) {
        const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
        programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
    }

    if (args.emitPipeControlStall) {
        NEO::PipeControlArgs pipeControlArgs;
        pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
        programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
    }

    if (args.semaphoreProgrammingRequired) {
        auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
        for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
            programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
        }
    }

    if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
        auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
    }

    if (args.preferredStaticPartitioning) {
        programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
    }

    // This BB_START jumps to the end of the partitioned command buffer.
    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
                                         totalBytesProgrammed,
                                         gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
                                         false,
                                         args.secondaryBatchBuffer);

    // Walker section
    auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
    if (outWalkerPtr) {
        *outWalkerPtr = walkerPtr;
    }

    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);

    auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
    controlSection->partitionCount = 0u;
    controlSection->tileCount = 0u;
    controlSection->inTileCount = 0u;
    controlSection->finalSyncTileCount = 0u;
    totalBytesProgrammed += sizeof(BatchBufferControlData);
    currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));

    if (args.emitSelfCleanup) {
        const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
        programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountAddress, gpuAddressOfAllocation + controlSectionOffset, dynamicPartitioningFieldsForCleanupCount, args);
    }

    if (args.emitBatchBufferEnd) {
        auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
        *batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
    }
}
template <typename GfxFamily>
bool isStartAndControlSectionRequired(WalkerPartitionArgs &args) {
    return args.synchronizeBeforeExecution || args.crossTileAtomicSynchronization || args.emitSelfCleanup;
}

template <typename GfxFamily, typename WalkerType>
uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) {
    const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
                                                   ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
                                                   : 0u;
    const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.emitSelfCleanup)
                                                  ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
                                                  : 0u;
    const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
                                                    ? NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait() * args.partitionCount
                                                    : 0u;
    const auto selfCleanupSectionSize = args.emitSelfCleanup
                                            ? computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup)
                                            : 0u;
    const auto wparidRegisterSize = args.initializeWparidRegister
                                        ? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
                                        : 0u;
    const auto pipeControlSize = args.emitPipeControlStall
                                     ? NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false)
                                     : 0u;
    const auto bbStartSize = isStartAndControlSectionRequired<GfxFamily>(args)
                                 ? sizeof(BATCH_BUFFER_START<GfxFamily>)
                                 : 0u;
    return beforeExecutionSyncAtomicSize +
           wparidRegisterSize +
           pipeControlSize +
           sizeof(WalkerType) +
           selfCleanupSectionSize +
           afterExecutionSyncAtomicSize +
           afterExecutionSyncPostSyncSize +
           bbStartSize;
}
template <typename GfxFamily, typename WalkerType>
void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
                                                 void **outWalkerPtr,
                                                 uint64_t gpuAddressOfAllocation,
                                                 WalkerType *inputWalker,
                                                 uint32_t &totalBytesProgrammed,
                                                 WalkerPartitionArgs &args,
                                                 const NEO::HardwareInfo &hwInfo) {
    totalBytesProgrammed = 0u;
    void *currentBatchBufferPointer = cpuPointer;

    // Get the address of the control section.
    const auto controlSectionOffset = args.blockDispatchToCommandBuffer ? 0u : computeStaticPartitioningControlSectionOffset<GfxFamily, WalkerType>(args);
    const auto afterControlSectionOffset = args.blockDispatchToCommandBuffer ? 0u : controlSectionOffset + sizeof(StaticPartitioningControlSection);

    if (!args.blockDispatchToCommandBuffer) {
        // Synchronize tiles before the walker.
        if (args.synchronizeBeforeExecution) {
            const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
            programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
        }

        // Load the partition ID into the WPARID register and execute the walker.
        if (args.initializeWparidRegister) {
            programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
        }
    }

    auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);

    if (!args.blockDispatchToCommandBuffer) {
        if (outWalkerPtr) {
            *outWalkerPtr = walkerPtr;
        }

        // Prepare for the cleanup section.
        if (args.emitSelfCleanup) {
            const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
            programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
        }

        if (args.emitPipeControlStall) {
            NEO::PipeControlArgs pipeControlArgs;
            pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
            programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
        }

        // Synchronize tiles after the walker.
        if (args.semaphoreProgrammingRequired) {
            programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
        }

        if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
            const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
            programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
        }

        // Jump over the control section only when needed.
        if (isStartAndControlSectionRequired<GfxFamily>(args)) {
            programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);

            // Control section
            DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
            StaticPartitioningControlSection *controlSection = putCommand<StaticPartitioningControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
            controlSection->synchronizeBeforeWalkerCounter = 0u;
            controlSection->synchronizeAfterWalkerCounter = 0u;
            controlSection->finalSyncTileCounter = 0u;
            DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
        }

        // Cleanup section
        if (args.emitSelfCleanup) {
            const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
            programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountAddress, gpuAddressOfAllocation + controlSectionOffset, staticPartitioningFieldsForCleanupCount, args);
        }
    }
}
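// Note on the two schemes: the dynamic path hands out partitions through an atomic
// counter in the command buffer and reloads WPARID in a loop, while the static path
// above programs WPARID once from the work partition allocation, so each tile owns a
// fixed partition and no dispatch loop is needed.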
template <typename GfxFamily, typename WalkerType>
uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
    uint64_t size = {};
    if (args.staticPartitioning) {
        size += computeStaticPartitioningControlSectionOffset<GfxFamily, WalkerType>(args);
        size += isStartAndControlSectionRequired<GfxFamily>(args) ? sizeof(StaticPartitioningControlSection) : 0u;
        size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args) : 0u;
    } else {
        size += computeControlSectionOffset<GfxFamily, WalkerType>(args);
        size += sizeof(BatchBufferControlData);
        size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
        size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args) : 0u;
    }
    return size;
}
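// Illustrative call sequence (a sketch; commandBufferCpu, commandBufferGpuVa and the
// concrete Family/WalkerType names are assumptions, not part of this header):
//
//   WalkerPartitionArgs args = {/* ... */};
//   auto bytesNeeded = estimateSpaceRequiredInCommandBuffer<Family, WalkerType>(args);
//   // ... ensure at least bytesNeeded bytes are available at commandBufferCpu ...
//   uint32_t bytesProgrammed = 0u;
//   if (args.staticPartitioning) {
//       constructStaticallyPartitionedCommandBuffer<Family, WalkerType>(commandBufferCpu, nullptr, commandBufferGpuVa, &walkerCmd, bytesProgrammed, args, hwInfo);
//   } else {
//       constructDynamicallyPartitionedCommandBuffer<Family, WalkerType>(commandBufferCpu, nullptr, commandBufferGpuVa, &walkerCmd, bytesProgrammed, args, hwInfo);
//   }
//   // bytesProgrammed should not exceed bytesNeeded.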
template <typename GfxFamily>
uint64_t computeBarrierControlSectionOffset(WalkerPartitionArgs &args, const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    uint64_t offset = 0u;
    if (args.emitSelfCleanup) {
        offset += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
    }

    if (args.usePostSync) {
        offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false);
    } else {
        offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);
    }

    offset += (computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() +
               sizeof(BATCH_BUFFER_START<GfxFamily>));
    return offset;
}

template <typename GfxFamily>
uint64_t estimateBarrierSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args, const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    uint64_t size = computeBarrierControlSectionOffset<GfxFamily>(args, rootDeviceEnvironment) +
                    sizeof(BarrierControlSection);
    if (args.emitSelfCleanup) {
        size += computeSelfCleanupEndSectionSize<GfxFamily>(barrierControlSectionFieldsForCleanupCount, args);
    }
    return size;
}

template <typename GfxFamily>
void constructBarrierCommandBuffer(void *cpuPointer,
                                   uint64_t gpuAddressOfAllocation,
                                   uint32_t &totalBytesProgrammed,
                                   WalkerPartitionArgs &args,
                                   NEO::PipeControlArgs &flushArgs,
                                   const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    void *currentBatchBufferPointer = cpuPointer;
    const auto controlSectionOffset = computeBarrierControlSectionOffset<GfxFamily>(args, rootDeviceEnvironment);
    const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, finalSyncTileCount);
    if (args.emitSelfCleanup) {
        programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
    }

    if (args.usePostSync) {
        programPostSyncPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args, flushArgs, rootDeviceEnvironment);
    } else {
        programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, flushArgs);
    }

    const auto crossTileSyncCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, crossTileSyncCount);
    programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, crossTileSyncCountField, args.tileCount);

    const auto afterControlSectionOffset = controlSectionOffset + sizeof(BarrierControlSection);
    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);

    DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
    BarrierControlSection *controlSection = putCommand<BarrierControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
    controlSection->crossTileSyncCount = 0u;
    controlSection->finalSyncTileCount = 0u;
    DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);

    if (args.emitSelfCleanup) {
        programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, gpuAddressOfAllocation + controlSectionOffset, barrierControlSectionFieldsForCleanupCount, args);
    }
}

} // namespace WalkerPartition