diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index d6a5e3b882..bed9eb4147 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -162,25 +162,25 @@ inline void HardwareInterface::programWalker( uint32_t partitionCount = 0u; RequiredPartitionDim requiredPartitionDim = kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none; - void *outWalker = nullptr; - ImplicitScalingDispatchCommandArgs implicitScalingArgs{ workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa &hwInfo, // hwInfo - &outWalker, // outWalkerPtr + nullptr, // outWalkerPtr requiredPartitionDim, // requiredPartitionDim partitionCount, // partitionCount + workgroupSize, // workgroupSize + maxWgCountPerTile, // maxWgCountPerTile false, // useSecondaryBatchBuffer false, // apiSelfCleanup queueCsr.getDcFlushSupport(), // dcFlush kernel.isSingleSubdevicePreferred(), // forceExecutionOnSingleTile - false}; // blockDispatchToCommandBuffer + false, // blockDispatchToCommandBuffer + requiredWalkOrder != 0}; // isRequiredWorkGroupOrder ImplicitScalingDispatch::template dispatchCommands(commandStream, walkerCmd, devices, implicitScalingArgs); - EncodeDispatchKernel::setWalkerRegionSettings(*static_cast(outWalker), hwInfo, implicitScalingArgs.partitionCount, workgroupSize, maxWgCountPerTile, requiredWalkOrder != 0); if (queueCsr.isStaticWorkPartitioningEnabled()) { queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), implicitScalingArgs.partitionCount)); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 2ef8c187e0..4f7871f11e 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -413,7 +413,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis PreemptionHelper::applyPreemptionWaCmdsBegin(listCmdBufferStream, *args.device); uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2]; - + bool isRequiredWorkGroupOrder = args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none; if (args.partitionCount > 1 && !args.isInternal) { const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); @@ -423,24 +423,23 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis &args.outWalkerPtr, // outWalkerPtr args.requiredPartitionDim, // requiredPartitionDim args.partitionCount, // partitionCount + workgroupSize, // workgroupSize + args.maxWgCountPerTile, // maxWgCountPerTile !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer !args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup args.dcFlushEnable, // dcFlush EncodeDispatchKernel::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile - args.makeCommandView}; // blockDispatchToCommandBuffer + args.makeCommandView, // blockDispatchToCommandBuffer + isRequiredWorkGroupOrder}; // isRequiredWorkGroupOrder ImplicitScalingDispatch::dispatchCommands(*listCmdBufferStream, walkerCmd, args.device->getDeviceBitfield(), implicitScalingArgs); args.partitionCount = implicitScalingArgs.partitionCount; - - void *walkerToModify = args.outWalkerPtr ? args.outWalkerPtr : &walkerCmd; - - EncodeDispatchKernel::setWalkerRegionSettings(*static_cast(walkerToModify), hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none); } else { args.partitionCount = 1; - EncodeDispatchKernel::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none); + EncodeDispatchKernel::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, isRequiredWorkGroupOrder); if (!args.makeCommandView) { auto buffer = listCmdBufferStream->getSpaceForCmd(); diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h index d53f051cf0..fa478b6ea3 100644 --- a/shared/source/command_container/implicit_scaling.h +++ b/shared/source/command_container/implicit_scaling.h @@ -44,12 +44,15 @@ struct ImplicitScalingDispatchCommandArgs { RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none; uint32_t partitionCount = 0; + uint32_t workgroupSize = 0; + uint32_t maxWgCountPerTile = 0; bool useSecondaryBatchBuffer = false; bool apiSelfCleanup = false; bool dcFlush = false; bool forceExecutionOnSingleTile = false; bool blockDispatchToCommandBuffer = false; + bool isRequiredWorkGroupOrder = false; }; template diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index b9c4864884..c3a6023d4f 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -49,6 +49,10 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingD args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer; + args.workgroupSize = dispatchCommandArgs.workgroupSize; + args.maxWgCountPerTile = dispatchCommandArgs.maxWgCountPerTile; + args.isRequiredWorkGroupOrder = dispatchCommandArgs.isRequiredWorkGroupOrder; + return args; } diff --git a/shared/source/command_container/walker_partition_interface.h b/shared/source/command_container/walker_partition_interface.h index bf9e491d52..5e6984b71f 100644 --- a/shared/source/command_container/walker_partition_interface.h +++ b/shared/source/command_container/walker_partition_interface.h @@ -17,6 +17,8 @@ struct WalkerPartitionArgs { uint64_t postSyncImmediateValue = 0; uint32_t partitionCount = 0; uint32_t tileCount = 0; + uint32_t workgroupSize = 0; + uint32_t maxWgCountPerTile = 0; bool emitBatchBufferEnd = false; bool secondaryBatchBuffer = false; bool synchronizeBeforeExecution = false; @@ -33,6 +35,7 @@ struct WalkerPartitionArgs { bool dcFlushEnable = false; bool forceExecutionOnSingleTile = false; bool blockDispatchToCommandBuffer = false; + bool isRequiredWorkGroupOrder = false; }; inline constexpr uint32_t wparidCCSOffset = 0x221C; diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h index fe74720636..cfc8b36f47 100644 --- a/shared/source/command_container/walker_partition_xehp_and_later.h +++ b/shared/source/command_container/walker_partition_xehp_and_later.h @@ -494,16 +494,14 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) { template void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed, WalkerType *inputWalker, - uint32_t partitionCount, - uint32_t tileCount, - bool forceExecutionOnSingleTile, - bool blockDispatchToCommandBuffer) { + WalkerPartitionArgs &args, + const NEO::HardwareInfo &hwInfo) { WalkerType *computeWalker = nullptr; - if (!blockDispatchToCommandBuffer) { + if (!args.blockDispatchToCommandBuffer) { computeWalker = putCommand(inputAddress, totalBytesProgrammed); } - if (partitionCount > 1) { + if (args.partitionCount > 1) { auto partitionType = inputWalker->getPartitionType(); assert(inputWalker->getThreadGroupIdStartingX() == 0u); @@ -522,16 +520,23 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm workgroupCount = inputWalker->getThreadGroupIdZDimension(); } - if (forceExecutionOnSingleTile) { + if (args.forceExecutionOnSingleTile) { inputWalker->setPartitionSize(workgroupCount); } else { - inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount)); + inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, args.partitionCount)); } - appendWalkerFields(*inputWalker, tileCount, workgroupCount); + NEO::EncodeDispatchKernel::setWalkerRegionSettings(*inputWalker, + hwInfo, + args.partitionCount, + args.workgroupSize, + args.maxWgCountPerTile, + args.isRequiredWorkGroupOrder); + + appendWalkerFields(*inputWalker, args.tileCount, workgroupCount); } - if (!blockDispatchToCommandBuffer) { + if (computeWalker != nullptr) { *computeWalker = *inputWalker; } @@ -645,7 +650,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer, args.secondaryBatchBuffer); // Walker section - auto walkerPtr = programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer); + auto walkerPtr = programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo); if (outWalkerPtr) { *outWalkerPtr = walkerPtr; } @@ -742,7 +747,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, } } - auto walkerPtr = programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer); + auto walkerPtr = programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo); if (!args.blockDispatchToCommandBuffer) { if (outWalkerPtr) { diff --git a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp index 7a66aa2d44..91ab084d2a 100644 --- a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp +++ b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp @@ -426,7 +426,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X); void *walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false); + WalkerPartition::WalkerPartitionArgs args = {}; + args.partitionCount = 2; + args.tileCount = 2; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo); auto walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -437,7 +440,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y); walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false); + args = {}; + args.partitionCount = 2; + args.tileCount = 2; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo); walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -446,7 +452,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Z); walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false); + args = {}; + args.partitionCount = 2; + args.tileCount = 2; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo); walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -456,7 +465,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen // if we program with partition Count == 1 then do not trigger partition stuff walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED); walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, 2, false, false); + args = {}; + args.partitionCount = 1; + args.tileCount = 2; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo); walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -1763,7 +1775,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi bool forceExecutionOnSingleTile = false; walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X); void *walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false); + WalkerPartition::WalkerPartitionArgs args = {}; + args.partitionCount = 2; + args.tileCount = 2; + args.forceExecutionOnSingleTile = forceExecutionOnSingleTile; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo); auto walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -1773,7 +1789,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi forceExecutionOnSingleTile = true; walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false); + args = {}; + args.partitionCount = 2; + args.tileCount = 2; + args.forceExecutionOnSingleTile = forceExecutionOnSingleTile; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo); walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); diff --git a/shared/test/unit_test/fixtures/implicit_scaling_fixture.cpp b/shared/test/unit_test/fixtures/implicit_scaling_fixture.cpp index 4ed5c36d16..516b663978 100644 --- a/shared/test/unit_test/fixtures/implicit_scaling_fixture.cpp +++ b/shared/test/unit_test/fixtures/implicit_scaling_fixture.cpp @@ -47,11 +47,14 @@ ImplicitScalingDispatchCommandArgs ImplicitScalingFixture::createDispatchCommand nullptr, // outWalkerPtr NEO::RequiredPartitionDim::none, // requiredPartitionDim partitionCount, // partitionCount + 1, // workgroupSize + 1, // maxWgCountPerTile true, // useSecondaryBatchBuffer false, // apiSelfCleanup dcFlushFlag, // dcFlush forceExecutionOnSingleTileFlag, // forceExecutionOnSingleTile - false}; // blockDispatchToCommandBuffer + false, // blockDispatchToCommandBuffer + false}; // isRequiredWorkGroupOrder return args; }