From 7c16278beed80cb93760768a0aaca51867dd409b Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 16 Apr 2024 22:23:53 +0000 Subject: [PATCH] feature: add option to store walker command content in cpu memory Related-To: NEO-10066 Signed-off-by: Zbigniew Zdanowicz --- .../cmdlist/cmdlist_hw_skl_to_tgllp.inl | 1 + .../cmdlist/cmdlist_hw_xehp_and_later.inl | 1 + .../test_cmdlist_append_launch_kernel_1.cpp | 1 + .../test_cmdlist_append_launch_kernel_3.cpp | 1 + .../command_container/command_encoder.h | 1 + .../command_encoder_xehp_and_later.inl | 4 +++ .../walker_partition_xehp_and_later.h | 11 ++++---- ..._encode_dispatch_kernel_xehp_and_later.cpp | 27 +++++++++++++++++++ ...alker_partition_tests_xehp_and_later_2.cpp | 4 +-- .../fixtures/command_container_fixture.cpp | 1 + 10 files changed, 44 insertions(+), 8 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index d86eaa94fc..0b0242119d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -200,6 +200,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K dsh, // dynamicStateHeap reinterpret_cast(&threadGroupDimensions), // threadGroupDimensions nullptr, // outWalkerPtr + nullptr, // cpuWalkerBuffer &additionalCommands, // additionalCommands commandListPreemptionMode, // preemptionMode launchParams.requiredPartitionDim, // requiredPartitionDim diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 484f50ceaa..ee4fae58f1 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -327,6 +327,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K dsh, // dynamicStateHeap reinterpret_cast(&threadGroupDimensions), // threadGroupDimensions nullptr, // outWalkerPtr + nullptr, // cpuWalkerBuffer &additionalCommands, // additionalCommands kernelPreemptionMode, // preemptionMode launchParams.requiredPartitionDim, // requiredPartitionDim diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp index 8ab7fda263..735577c2fe 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp @@ -195,6 +195,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA nullptr, // dynamicStateHeap threadGroupDimensions, // threadGroupDimensions nullptr, // outWalkerPtr + nullptr, // cpuWalkerBuffer nullptr, // additionalCommands PreemptionMode::MidBatch, // preemptionMode NEO::RequiredPartitionDim::none, // requiredPartitionDim diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index c46bf168b5..b98eb1641d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -692,6 +692,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA nullptr, // dynamicStateHeap threadGroupDimensions, // threadGroupDimensions nullptr, // outWalkerPtr + nullptr, // cpuWalkerBuffer nullptr, // additionalCommands PreemptionMode::MidBatch, // preemptionMode NEO::RequiredPartitionDim::none, // requiredPartitionDim diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index ace8cb6093..f7f29c2665 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -54,6 +54,7 @@ struct EncodeDispatchKernelArgs { IndirectHeap *dynamicStateHeap = nullptr; const void *threadGroupDimensions = nullptr; void *outWalkerPtr = nullptr; + void *cpuWalkerBuffer = nullptr; std::list *additionalCommands = nullptr; PreemptionMode preemptionMode = PreemptionMode::Initial; NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none; diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index c807df8ee2..321a51b65a 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -413,6 +413,10 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis *buffer = walkerCmd; } + if (args.cpuWalkerBuffer) { + *reinterpret_cast(args.cpuWalkerBuffer) = walkerCmd; + } + PreemptionHelper::applyPreemptionWaCmdsEnd(listCmdBufferStream, *args.device); if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) { diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h index 3473817508..9474c533b5 100644 --- a/shared/source/command_container/walker_partition_xehp_and_later.h +++ b/shared/source/command_container/walker_partition_xehp_and_later.h @@ -498,7 +498,6 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm uint32_t tileCount, bool forceExecutionOnSingleTile) { auto computeWalker = putCommand(inputAddress, totalBytesProgrammed); - WalkerType cmd = *inputWalker; if (partitionCount > 1) { auto partitionType = inputWalker->getPartitionType(); @@ -508,7 +507,7 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm assert(inputWalker->getThreadGroupIdStartingZ() == 0u); assert(partitionType != WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED); - cmd.setWorkloadPartitionEnable(true); + inputWalker->setWorkloadPartitionEnable(true); auto workgroupCount = 0u; if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_X) { @@ -520,15 +519,15 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm } if (forceExecutionOnSingleTile) { - cmd.setPartitionSize(workgroupCount); + inputWalker->setPartitionSize(workgroupCount); } else { - cmd.setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount)); + inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount)); } } - appendWalkerFields(cmd, tileCount); + appendWalkerFields(*inputWalker, tileCount); - *computeWalker = cmd; + *computeWalker = *inputWalker; return computeWalker; } diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp index 98402181de..7c32e4032d 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp @@ -1567,3 +1567,30 @@ HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenGettingInlineDat EXPECT_EQ(expectedOffset, EncodeDispatchKernel::getInlineDataOffset(dispatchArgs)); } + +HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenCpuWalkerPointerIsSetThenProvideWalkerContentInCpuBuffer, IsAtLeastXeHpCore) { + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + + uint32_t dims[] = {1, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + auto walkerPtr = std::make_unique(); + DefaultWalkerType *cpuWalkerPointer = walkerPtr.get(); + + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, false); + dispatchArgs.cpuWalkerBuffer = cpuWalkerPointer; + + EncodeDispatchKernel::template encode(*cmdContainer.get(), dispatchArgs); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, + cmdContainer->getCommandStream()->getCpuBase(), + cmdContainer->getCommandStream()->getUsed()); + + auto itor = find(commands.begin(), commands.end()); + ASSERT_NE(itor, commands.end()); + + auto cmdWalkerGfxMemory = genCmdCast(*itor); + + EXPECT_EQ(0, memcmp(cmdWalkerGfxMemory, cpuWalkerPointer, sizeof(DefaultWalkerType))); +} diff --git a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp index 3d686d19d3..2b418a139e 100644 --- a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp +++ b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp @@ -460,8 +460,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen walkerCommand = genCmdCast(walkerCommandAddress); ASSERT_NE(nullptr, walkerCommand); - EXPECT_EQ(0u, walkerCommand->getPartitionSize()); - EXPECT_FALSE(walkerCommand->getWorkloadPartitionEnable()); + EXPECT_EQ(6u, walkerCommand->getPartitionSize()); + EXPECT_TRUE(walkerCommand->getWorkloadPartitionEnable()); EXPECT_EQ(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walkerCommand->getPartitionType()); } HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWhenComputePartitionCountIsCalledThenDefaultSizeAndTypeIsReturned) { diff --git a/shared/test/unit_test/fixtures/command_container_fixture.cpp b/shared/test/unit_test/fixtures/command_container_fixture.cpp index d105f46ddc..24e227a21e 100644 --- a/shared/test/unit_test/fixtures/command_container_fixture.cpp +++ b/shared/test/unit_test/fixtures/command_container_fixture.cpp @@ -49,6 +49,7 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel nullptr, // dynamicStateHeap threadGroupDimensions, // threadGroupDimensions nullptr, // outWalkerPtr + nullptr, // cpuWalkerBuffer nullptr, // additionalCommands PreemptionMode::Disabled, // preemptionMode NEO::RequiredPartitionDim::none, // requiredPartitionDim