From 914939c377f9825ada4853ade94a78e513edfc45 Mon Sep 17 00:00:00 2001 From: "Naklicki, Mateusz" Date: Tue, 15 Nov 2022 13:48:45 +0000 Subject: [PATCH] Fix execution of cooperative kernels on multi-tile device Add flag for forcing execution of kernels on single tile Force cooperative kernels to use only single tile Related-to: NEO-6729 Signed-off-by: Naklicki, Mateusz --- .../test_cmdlist_append_launch_kernel_2.cpp | 8 +- opencl/source/command_queue/enqueue_common.h | 2 +- .../gpgpu_walker_xehp_and_later.inl | 3 +- .../hardware_interface_xehp_and_later.inl | 3 +- opencl/source/helpers/task_information.cpp | 2 +- .../dispatch_walker_tests_xehp_and_later.cpp | 89 ++++++++++++++----- .../command_encoder_xehp_and_later.inl | 4 +- .../command_container/implicit_scaling.h | 1 + .../implicit_scaling_xehp_and_later.inl | 9 +- .../walker_partition_interface.h | 1 + .../walker_partition_xehp_and_later.h | 13 ++- ..._encode_dispatch_kernel_xehp_and_later.cpp | 30 +++++++ .../test_implicit_scaling_xehp_and_later.cpp | 39 ++++---- ...alker_partition_tests_xehp_and_later_2.cpp | 39 ++++++-- .../fixtures/implicit_scaling_fixture.h | 1 + 15 files changed, 182 insertions(+), 62 deletions(-) diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index 6afef1a221..09f141a9ae 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1417,8 +1417,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListAppendLaunchKernelXeHpCoreTest, EXPECT_EQ(4u, commandList->partitionCount); } -HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenDoNotUseImplicitScaling, IsAtLeastXeHpCore) { - ze_group_count_t groupCount{1, 1, 1}; +HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenSetProperPartitionSize, IsAtLeastXeHpCore) { + ze_group_count_t groupCount{16, 1, 1}; auto commandListWithNonCooperativeKernel = std::make_unique>>(); auto result = commandListWithNonCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); @@ -1434,6 +1434,7 @@ HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKe auto itorWalker = find(cmdList.begin(), cmdList.end()); auto cmd = genCmdCast(*itorWalker); EXPECT_TRUE(cmd->getWorkloadPartitionEnable()); + EXPECT_EQ(4u, cmd->getPartitionSize()); auto commandListWithCooperativeKernel = std::make_unique>>(); result = commandListWithCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u); @@ -1445,11 +1446,12 @@ HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKe sizeAfter = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed(); cmdList.clear(); ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, ptrOffset(commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore)); + cmdList, ptrOffset(commandListWithCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore)); itorWalker = find(cmdList.begin(), cmdList.end()); cmd = genCmdCast(*itorWalker); EXPECT_TRUE(cmd->getWorkloadPartitionEnable()); + EXPECT_EQ(16u, cmd->getPartitionSize()); } HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index ee7a3313b4..eabab3dc5f 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -799,7 +799,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed false, // epilogueRequired false, // usePerDssBackedBuffer - kernel->isSingleSubdevicePreferred(), // useSingleSubdevice + false, // useSingleSubdevice useGlobalAtomics, // useGlobalAtomics kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext kernel->requiresMemoryMigration(), // memoryMigrationRequired diff --git a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl index 7e9fcd8a8d..ce2fe1b7e9 100644 --- a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl +++ b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl @@ -138,8 +138,7 @@ size_t EnqueueOperation::getSizeRequiredCSKernel(bool reserveProfilin HardwareCommandsHelper::getSizeRequiredCS() + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.KernelHeapSize, commandQueue.getDevice().getHardwareInfo()); auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield(); - auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, - !pKernel->isSingleSubdevicePreferred()); + auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true); if (partitionWalker) { Vec3 groupStart = dispatchInfo.getStartOfWorkgroups(); Vec3 groupCount = dispatchInfo.getNumberOfWorkgroups(); diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index 94861869fa..53814c65c2 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -122,7 +122,7 @@ inline void HardwareInterface::programWalker( EncodeDispatchKernel::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs); auto devices = queueCsr.getOsContext().getDeviceBitfield(); - auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred()); + auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true); if (partitionWalker) { const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); @@ -135,6 +135,7 @@ inline void HardwareInterface::programWalker( false, kernel.usesImages(), queueCsr.getDcFlushSupport(), + kernel.isSingleSubdevicePreferred(), workPartitionAllocationGpuVa, hwInfo); if (queueCsr.isStaticWorkPartitioningEnabled()) { diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 282c24366d..6fc4095f10 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -205,7 +205,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed false, // epilogueRequired false, // usePerDssBackedBuffer - kernel->isSingleSubdevicePreferred(), // useSingleSubdevice + false, // useSingleSubdevice kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, // useGlobalAtomics kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext kernel->requiresMemoryMigration(), // memoryMigrationRequired diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index ba62c06cdd..c08b3e1bac 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -1319,29 +1319,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWal EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); } -HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenPartitioningIsNotUsed) { - if (!OSInterface::osEnableLocalMemory) { - GTEST_SKIP(); - } - - struct SingleSubdeviceKernel : public MockKernel { - using MockKernel::MockKernel; - bool isSingleSubdevicePreferred() const override { return true; } - }; - - auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); - size_t gws[] = {2, 1, 1}; - size_t lws[] = {1, 1, 1}; - SingleSubdeviceKernel subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device); - cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); - - ClHardwareParse hwParser; - hwParser.parseCommands(*cmdQ); - auto computeWalker = reinterpret_cast(hwParser.cmdWalker); - ASSERT_NE(nullptr, computeWalker); - EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType()); -} - HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithPartitionLogicDisabledThenWalkerPartitionLogicIsNotExecuted) { if (!OSInterface::osEnableLocalMemory) { GTEST_SKIP(); @@ -1914,3 +1891,69 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenSimdSize1TWhenCheckToGener EXPECT_TRUE(EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd)); } + +struct XeHPAndLaterDispatchWalkerTestMultiTileDevice : public XeHPAndLaterDispatchWalkerBasicTest { + void SetUp() override { + DebugManager.flags.CreateMultipleSubDevices.set(2u); + + XeHPAndLaterDispatchWalkerBasicTest::SetUp(); + } + void TearDown() override { + XeHPAndLaterDispatchWalkerBasicTest::TearDown(); + } +}; + +struct KernelWithSingleSubdevicePreferences : public MockKernel { + using MockKernel::MockKernel; + bool isSingleSubdevicePreferred() const override { return singleSubdevicePreferred; } + + bool singleSubdevicePreferred = true; +}; + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnSingleTile) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {2, 1, 1}; + size_t lws[] = {1, 1, 1}; + auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver(); + if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + KernelWithSingleSubdevicePreferences subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device); + subdeviceKernel.singleSubdevicePreferred = true; + cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + ClHardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); + EXPECT_EQ(2u, computeWalker->getPartitionSize()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatDoesntPreferSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnAllTiles) { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {2, 1, 1}; + size_t lws[] = {1, 1, 1}; + auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver(); + if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + KernelWithSingleSubdevicePreferences subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device); + subdeviceKernel.singleSubdevicePreferred = false; + cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + ClHardwareParse hwParser; + hwParser.parseCommands(*cmdQ); + auto computeWalker = reinterpret_cast(hwParser.cmdWalker); + ASSERT_NE(nullptr, computeWalker); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); + EXPECT_EQ(1u, computeWalker->getPartitionSize()); +} \ No newline at end of file diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index fff61a5181..20063b0cf1 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -301,8 +301,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis PreemptionHelper::applyPreemptionWaCmdsBegin(listCmdBufferStream, *args.device); - if ((args.partitionCount > 1 && !args.isCooperative) && - !args.isInternal) { + if (args.partitionCount > 1 && !args.isInternal) { const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); if (args.eventAddress != 0) { postSync.setOperation(POSTSYNC_DATA::OPERATION_WRITE_TIMESTAMP); @@ -315,6 +314,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis !args.isKernelDispatchedFromImmediateCmdList, false, args.dcFlushEnable, + args.isCooperative, workPartitionAllocationGpuVa, hwInfo); } else { diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h index 4c2ccb1b86..8547c5a2eb 100644 --- a/shared/source/command_container/implicit_scaling.h +++ b/shared/source/command_container/implicit_scaling.h @@ -53,6 +53,7 @@ struct ImplicitScalingDispatch { bool apiSelfCleanup, bool usesImages, bool dcFlush, + bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo); diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index 084af527ab..58a1f356a5 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -22,7 +22,8 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar bool preferStaticPartitioning, bool staticPartitioning, bool useSecondaryBatchBuffer, - bool dcFlush) { + bool dcFlush, + bool forceExecutionOnSingleTile) { WalkerPartition::WalkerPartitionArgs args = {}; args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa; @@ -30,6 +31,7 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar args.tileCount = tileCount; args.staticPartitioning = staticPartitioning; args.preferredStaticPartitioning = preferStaticPartitioning; + args.forceExecutionOnSingleTile = forceExecutionOnSingleTile; args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup(); args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired(); @@ -76,6 +78,7 @@ size_t ImplicitScalingDispatch::getSize(bool apiSelfCleanup, preferStaticPartitioning, staticPartitioning, false, + false, false); return static_cast(WalkerPartition::estimateSpaceRequiredInCommandBuffer(args)); @@ -90,6 +93,7 @@ void ImplicitScalingDispatch::dispatchCommands(LinearStream &commandS bool apiSelfCleanup, bool usesImages, bool dcFlush, + bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo) { uint32_t totalProgrammedSize = 0u; @@ -106,7 +110,8 @@ void ImplicitScalingDispatch::dispatchCommands(LinearStream &commandS preferStaticPartitioning, staticPartitioning, useSecondaryBatchBuffer, - dcFlush); + dcFlush, + forceExecutionOnSingleTile); auto dispatchCommandsSize = getSize(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}); void *commandBuffer = commandStream.getSpace(dispatchCommandsSize); diff --git a/shared/source/command_container/walker_partition_interface.h b/shared/source/command_container/walker_partition_interface.h index 673d969cdf..3de29af39c 100644 --- a/shared/source/command_container/walker_partition_interface.h +++ b/shared/source/command_container/walker_partition_interface.h @@ -31,6 +31,7 @@ struct WalkerPartitionArgs { bool usePostSync = false; bool pipeControlBeforeCleanupCrossTileSync = false; bool dcFlushEnable = false; + bool forceExecutionOnSingleTile = false; }; constexpr uint32_t wparidCCSOffset = 0x221C; diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h index 5a28af787e..e1182571bd 100644 --- a/shared/source/command_container/walker_partition_xehp_and_later.h +++ b/shared/source/command_container/walker_partition_xehp_and_later.h @@ -480,7 +480,8 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) { template void programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed, COMPUTE_WALKER *inputWalker, - uint32_t partitionCount) { + uint32_t partitionCount, + bool forceExecutionOnSingleTile) { auto computeWalker = putCommand>(inputAddress, totalBytesProgrammed); COMPUTE_WALKER cmd = *inputWalker; @@ -503,7 +504,11 @@ void programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramme workgroupCount = inputWalker->getThreadGroupIdZDimension(); } - cmd.setPartitionSize((workgroupCount + partitionCount - 1u) / partitionCount); + if (forceExecutionOnSingleTile) { + cmd.setPartitionSize(workgroupCount); + } else { + cmd.setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount)); + } } *computeWalker = cmd; } @@ -614,7 +619,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer, args.secondaryBatchBuffer); // Walker section - programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount); + programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile); programMiBatchBufferStart(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer); @@ -704,7 +709,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, if (args.initializeWparidRegister) { programMiLoadRegisterMem(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset); } - programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount); + programPartitionedWalker(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile); // Prepare for cleanup section if (args.emitSelfCleanup) { diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp index 8292f6321a..46e770bb43 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp @@ -1026,6 +1026,36 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesImplicitScaling, EXPECT_EQ(eventAddress, postSync.getDestinationAddress()); } +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesImplicitScaling, givenCooperativeKernelWhenEncodingDispatchKernelThenExpectPartitionSizeEqualWorkgroupSize) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + uint32_t dims[] = {16, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + bool requiresUncachedMocs = false; + bool isInternal = false; + bool isCooperative = true; + + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs); + dispatchArgs.isInternal = isInternal; + dispatchArgs.isCooperative = isCooperative; + dispatchArgs.partitionCount = 2; + EncodeDispatchKernel::encode(*cmdContainer.get(), dispatchArgs, nullptr); + + size_t containerUsedAfterBase = cmdContainer->getCommandStream()->getUsed(); + + GenCmdList partitionedWalkerList; + CmdParse::parseCommandBuffer(partitionedWalkerList, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), containerUsedAfterBase); + auto itor = find(partitionedWalkerList.begin(), partitionedWalkerList.end()); + ASSERT_NE(itor, partitionedWalkerList.end()); + + auto partitionWalkerCmd = genCmdCast(*itor); + EXPECT_EQ(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X, partitionWalkerCmd->getPartitionType()); + uint32_t expectedPartitionSize = dims[0]; + EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize()); +} + struct CommandEncodeStatesDynamicImplicitScalingFixture : CommandEncodeStatesImplicitScalingFixture { void setUp() { DebugManager.flags.EnableStaticPartitioning.set(0); diff --git a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp index 1a1300ac56..ec44e063f9 100644 --- a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp @@ -30,7 +30,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCm expectedSize = ImplicitScalingDispatch::getSize(false, false, twoTile, Vec3(0, 0, 0), Vec3(32, 1, 1)); uint32_t partitionCount = 0; - ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, 0u, *defaultHwInfo); + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, + forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(2u, partitionCount); @@ -72,7 +73,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndNoPartiti expectedSize = ImplicitScalingDispatch::getSize(false, false, twoTile, Vec3(0, 0, 0), Vec3(1, 1, 1)); uint32_t partitionCount = 0; - ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, false, false, false, dcFlushFlag, 0u, *defaultHwInfo); + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, false, false, false, dcFlushFlag, + forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(1u, partitionCount); @@ -115,7 +117,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndPartition expectedSize = ImplicitScalingDispatch::getSize(false, false, twoTile, Vec3(0, 0, 0), Vec3(1, 1, 1)); uint32_t partitionCount = 0; - ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, 0u, *defaultHwInfo); + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, + forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(1u, partitionCount); @@ -162,7 +165,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(2u, partitionCount); @@ -214,7 +217,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenPa uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(2u, partitionCount); @@ -268,7 +271,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -319,7 +322,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -356,7 +359,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -393,7 +396,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -441,7 +444,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -509,7 +512,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -569,7 +572,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -629,7 +632,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -696,7 +699,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -759,7 +762,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -824,7 +827,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -892,7 +895,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); @@ -959,7 +962,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, uint32_t partitionCount = 0; ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, - workPartitionAllocationAddress, *defaultHwInfo); + forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo); totalBytesProgrammed = commandStream.getUsed(); EXPECT_EQ(expectedSize, totalBytesProgrammed); EXPECT_EQ(twoTile.count(), partitionCount); diff --git a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp index ec8b89fb13..8212b02f14 100644 --- a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp +++ b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp @@ -400,7 +400,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); void *walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u); + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, false); auto walkerCommand = genCmdCast *>(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -411,7 +411,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y); walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u); + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, false); walkerCommand = genCmdCast *>(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -420,7 +420,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z); walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u); + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, false); walkerCommand = genCmdCast *>(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -430,7 +430,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen //if we program with partition Count == 1 then do not trigger partition stuff walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); walkerCommandAddress = cmdBufferAddress; - programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 1u); + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, false); walkerCommand = genCmdCast *>(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); @@ -506,7 +506,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithDifferentWorkg EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); } -HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDisalbedMinimalPartitionSizeWhenCoomputePartitionSizeThenProperValueIsReturned) { +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDisabledMinimalPartitionSizeWhenComputePartitionSizeThenProperValueIsReturned) { WalkerPartition::COMPUTE_WALKER walker; walker = FamilyType::cmdInitGpgpuWalker; walker.setThreadGroupIdXDimension(64u); @@ -1672,3 +1672,32 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenBarrierProgrammingWhenEm EXPECT_EQ(parsedOffset, expectedCommandUsedSize); } + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTileWhenProgramComputeWalkerThenWalkerIsProperlyProgrammed) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(32u); + walker.setThreadGroupIdYDimension(1u); + walker.setThreadGroupIdZDimension(1u); + + bool forceExecutionOnSingleTile = false; + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); + void *walkerCommandAddress = cmdBufferAddress; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, forceExecutionOnSingleTile); + auto walkerCommand = genCmdCast *>(walkerCommandAddress); + + ASSERT_NE(nullptr, walkerCommand); + EXPECT_TRUE(walkerCommand->getWorkloadPartitionEnable()); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walkerCommand->getPartitionType()); + EXPECT_EQ(16u, walkerCommand->getPartitionSize()); + + forceExecutionOnSingleTile = true; + walkerCommandAddress = cmdBufferAddress; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, forceExecutionOnSingleTile); + walkerCommand = genCmdCast *>(walkerCommandAddress); + + ASSERT_NE(nullptr, walkerCommand); + EXPECT_TRUE(walkerCommand->getWorkloadPartitionEnable()); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walkerCommand->getPartitionType()); + EXPECT_EQ(32u, walkerCommand->getPartitionSize()); +} diff --git a/shared/test/unit_test/fixtures/implicit_scaling_fixture.h b/shared/test/unit_test/fixtures/implicit_scaling_fixture.h index 420c364794..2a70023087 100644 --- a/shared/test/unit_test/fixtures/implicit_scaling_fixture.h +++ b/shared/test/unit_test/fixtures/implicit_scaling_fixture.h @@ -34,6 +34,7 @@ struct ImplicitScalingFixture : public CommandEncodeStatesFixture { DeviceBitfield twoTile; void *alignedMemory = nullptr; bool dcFlushFlag = false; + bool forceExecutionOnSingleTileFlag = false; }; using ImplicitScalingTests = Test;