From edd230b6cbaa59db356f6a1512d5c6d6bd81b5ab Mon Sep 17 00:00:00 2001 From: Compute-Runtime-Validation Date: Fri, 2 May 2025 12:39:48 +0200 Subject: [PATCH] Revert "performance: Remove global fence allocation from BMG" This reverts commit c9a21c158e69f4e90f1a032dec1c6b26a0e3db70. Signed-off-by: Compute-Runtime-Validation --- .../test_cmdqueue_xe2_hpg_core.cpp | 35 +++++ ..._stream_receiver_hw_tests_xe2_hpg_core.cpp | 127 ++++++++++++++++++ .../helpers/gfx_core_helper_pvc_and_later.inl | 4 +- .../command_stream_receiver_tests.cpp | 2 +- .../direct_submission_tests_2.cpp | 8 +- .../gfx_core_helper_tests_xe2_hpg_core.cpp | 2 +- 6 files changed, 166 insertions(+), 12 deletions(-) diff --git a/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp b/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp index a2e1c40697..adf3252b70 100644 --- a/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp +++ b/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp @@ -21,6 +21,41 @@ namespace ult { using CommandQueueCommandsXe2HpgCore = Test; +HWTEST2_F(CommandQueueCommandsXe2HpgCore, givenCommandQueueWhenExecutingCommandListsThenStateSystemMemFenceAddressCmdIsGenerated, IsXe2HpgCore) { + if (neoDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { + GTEST_SKIP(); + } + + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + ze_command_queue_desc_t desc = {}; + auto csr = neoDevice->getDefaultEngine().commandStreamReceiver; + + auto commandQueue = new MockCommandQueueHw(device, csr, &desc); + commandQueue->initialize(false, false, false); + + ze_result_t returnValue; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false)); + auto commandListHandle = commandList->toHandle(); + commandList->close(); + + commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr); + + auto globalFence = csr->getGlobalFenceAllocation(); + + auto used = commandQueue->commandStream.getUsed(); + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( + cmdList, commandQueue->commandStream.getCpuBase(), used)); + + auto itor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itor); + + auto systemMemFenceAddressCmd = genCmdCast(*itor); + EXPECT_EQ(globalFence->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + commandQueue->destroy(); +} + HWTEST2_F(CommandQueueCommandsXe2HpgCore, givenCommandQueueWhenExecutingCommandListsForTheSecondTimeThenStateSystemMemFenceAddressCmdIsNotGenerated, IsXe2HpgCore) { using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; ze_command_queue_desc_t desc = {}; diff --git a/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp b/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp index b005893462..db48ca4f57 100644 --- a/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp +++ b/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp @@ -375,6 +375,133 @@ struct SystemMemoryFenceInDefaultConfigurationTest : public UltCommandStreamRece DebugManagerStateRestore restore; }; +using SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore = SystemMemoryFenceInDefaultConfigurationTest; + +XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore, + givenNoEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenNoPostSyncFenceRequestDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { + GTEST_SKIP(); + } + + MockKernelWithInternals kernel(*pClDevice); + MockContext context(pClDevice); + MockCommandQueueHw commandQueue(&context, pClDevice, nullptr); + auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 1}; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + + if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) { + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType()); + } +} + +XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore, + givenNoEventProvidedWhenEnqueueKernelUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { + GTEST_SKIP(); + } + + MockKernelWithInternals kernel(*pClDevice); + MockContext context(pClDevice); + MockCommandQueueHw commandQueue(&context, pClDevice, nullptr); + auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 1}; + kernel.mockKernel->anyKernelArgumentUsingSystemMemory = true; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + + if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) { + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType()); + } +} + +XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore, + givenEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) { + using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; + if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { + GTEST_SKIP(); + } + + MockKernelWithInternals kernel(*pClDevice); + MockContext context(pClDevice); + MockCommandQueueHw commandQueue(&context, pClDevice, nullptr); + auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver(); + + size_t globalWorkSize[3] = {1, 1, 1}; + cl_event kernelEvent{}; + commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &kernelEvent); + + ClHardwareParse hwParser; + hwParser.parseCommands(commandQueue); + + auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); + auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); + EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); + + auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); + auto walkerCmd = genCmdCast(*itorComputeWalker); + auto &postSyncData = walkerCmd->getPostSync(); + EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); + + if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) { + auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); + auto fenceCmd = genCmdCast(*itorMiMemFence); + ASSERT_NE(nullptr, fenceCmd); + EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType()); + } + + auto event = castToObject(kernelEvent); + event->release(); +} + struct Xe2MidThreadCommandStreamReceiverTest : public UltCommandStreamReceiverTest { void SetUp() override { debugManager.flags.ForcePreemptionMode.set(static_cast(PreemptionMode::MidThread)); diff --git a/shared/source/helpers/gfx_core_helper_pvc_and_later.inl b/shared/source/helpers/gfx_core_helper_pvc_and_later.inl index 15f8153e37..3bb4fdc705 100644 --- a/shared/source/helpers/gfx_core_helper_pvc_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_pvc_and_later.inl @@ -24,9 +24,7 @@ bool GfxCoreHelperHw::isFenceAllocationRequired(const HardwareInfo &hwIn (debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 0)) { return false; } - return productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) || - productHelper.isGlobalFenceInDirectSubmissionRequired(hwInfo) || - productHelper.isGlobalFenceInPostSyncRequired(hwInfo); + return !hwInfo.capabilityTable.isIntegratedDevice; } template diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 6a80c608fa..ba96ac25c2 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -4472,7 +4472,7 @@ HWTEST2_F(CommandStreamReceiverHwTest, givenImmediateFlushTaskWhenOneTimeContextSystemFenceRequiredThenExpectOneTimeSystemFenceCommand, IsHeapfulSupportedAndAtLeastXeHpcCore) { using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; - if (!pDevice->getGfxCoreHelper().isFenceAllocationRequired(pDevice->getHardwareInfo(), pDevice->getProductHelper())) { + if (pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { GTEST_SKIP(); } diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 6bd4fea9a8..ed9843afc5 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -219,14 +219,10 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea DebugManagerStateRestore restorer; debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1); - if (heaplessStateInit) { + if (heaplessStateInit || pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { GTEST_SKIP(); } - if (!pDevice->getDefaultEngine().commandStreamReceiver->getGlobalFenceAllocation()) { - pDevice->getDefaultEngine().commandStreamReceiver->createGlobalFenceAllocation(); - } - MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); EXPECT_TRUE(directSubmission.miMemFenceRequired); @@ -236,8 +232,6 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea EXPECT_EQ(directSubmission.systemMemoryFenceAddressSet, directSubmission.globalFenceAllocation != nullptr); EXPECT_TRUE(directSubmission.miMemFenceRequired); - - reinterpret_cast *>(pDevice->getDefaultEngine().commandStreamReceiver)->cleanupResources(); } HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest, diff --git a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp index 50538ef1b0..84b2f2e25c 100644 --- a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp +++ b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp @@ -486,7 +486,7 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenGfxCoreHelperWhenAskedIfFe debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(-1); debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(-1); - EXPECT_FALSE(gfxCoreHelper.isFenceAllocationRequired(hwInfo, productHelper)); + EXPECT_EQ(gfxCoreHelper.isFenceAllocationRequired(hwInfo, productHelper), !hwInfo.capabilityTable.isIntegratedDevice); debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0); debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);