From c9a21c158e69f4e90f1a032dec1c6b26a0e3db70 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Mon, 28 Apr 2025 15:54:56 +0000 Subject: [PATCH] performance: Remove global fence allocation from BMG Resolves: NEO-14642 Signed-off-by: Lukasz Jobczyk --- .../test_cmdqueue_xe2_hpg_core.cpp | 35 ----- ..._stream_receiver_hw_tests_xe2_hpg_core.cpp | 127 ------------------ .../helpers/gfx_core_helper_pvc_and_later.inl | 4 +- .../command_stream_receiver_tests.cpp | 2 +- .../direct_submission_tests_2.cpp | 8 +- .../gfx_core_helper_tests_xe2_hpg_core.cpp | 2 +- 6 files changed, 12 insertions(+), 166 deletions(-) diff --git a/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp b/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp index adf3252b70..a2e1c40697 100644 --- a/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp +++ b/level_zero/core/test/unit_tests/xe2_hpg_core/test_cmdqueue_xe2_hpg_core.cpp @@ -21,41 +21,6 @@ namespace ult { using CommandQueueCommandsXe2HpgCore = Test; -HWTEST2_F(CommandQueueCommandsXe2HpgCore, givenCommandQueueWhenExecutingCommandListsThenStateSystemMemFenceAddressCmdIsGenerated, IsXe2HpgCore) { - if (neoDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { - GTEST_SKIP(); - } - - using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; - ze_command_queue_desc_t desc = {}; - auto csr = neoDevice->getDefaultEngine().commandStreamReceiver; - - auto commandQueue = new MockCommandQueueHw(device, csr, &desc); - commandQueue->initialize(false, false, false); - - ze_result_t returnValue; - std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false)); - auto commandListHandle = commandList->toHandle(); - commandList->close(); - - commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr); - - auto globalFence = csr->getGlobalFenceAllocation(); - - auto used = commandQueue->commandStream.getUsed(); - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( - cmdList, commandQueue->commandStream.getCpuBase(), used)); - - auto itor = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itor); - - auto systemMemFenceAddressCmd = genCmdCast(*itor); - EXPECT_EQ(globalFence->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); - - commandQueue->destroy(); -} - HWTEST2_F(CommandQueueCommandsXe2HpgCore, givenCommandQueueWhenExecutingCommandListsForTheSecondTimeThenStateSystemMemFenceAddressCmdIsNotGenerated, IsXe2HpgCore) { using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; ze_command_queue_desc_t desc = {}; diff --git a/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp b/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp index db48ca4f57..b005893462 100644 --- a/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp +++ b/opencl/test/unit_test/xe2_hpg_core/command_stream_receiver_hw_tests_xe2_hpg_core.cpp @@ -375,133 +375,6 @@ struct SystemMemoryFenceInDefaultConfigurationTest : public UltCommandStreamRece DebugManagerStateRestore restore; }; -using SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore = SystemMemoryFenceInDefaultConfigurationTest; - -XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore, - givenNoEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenNoPostSyncFenceRequestDispatched) { - using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; - using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; - using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; - if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { - GTEST_SKIP(); - } - - MockKernelWithInternals kernel(*pClDevice); - MockContext context(pClDevice); - MockCommandQueueHw commandQueue(&context, pClDevice, nullptr); - auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver(); - - size_t globalWorkSize[3] = {1, 1, 1}; - commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); - - ClHardwareParse hwParser; - hwParser.parseCommands(commandQueue); - - auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); - auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); - EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); - - auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); - auto walkerCmd = genCmdCast(*itorComputeWalker); - auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); - - if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) { - auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); - auto fenceCmd = genCmdCast(*itorMiMemFence); - ASSERT_NE(nullptr, fenceCmd); - EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType()); - } -} - -XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore, - givenNoEventProvidedWhenEnqueueKernelUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) { - using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; - using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; - using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; - if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { - GTEST_SKIP(); - } - - MockKernelWithInternals kernel(*pClDevice); - MockContext context(pClDevice); - MockCommandQueueHw commandQueue(&context, pClDevice, nullptr); - auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver(); - - size_t globalWorkSize[3] = {1, 1, 1}; - kernel.mockKernel->anyKernelArgumentUsingSystemMemory = true; - commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr); - - ClHardwareParse hwParser; - hwParser.parseCommands(commandQueue); - - auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); - auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); - EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); - - auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); - auto walkerCmd = genCmdCast(*itorComputeWalker); - auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); - - if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) { - auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); - auto fenceCmd = genCmdCast(*itorMiMemFence); - ASSERT_NE(nullptr, fenceCmd); - EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType()); - } -} - -XE2_HPG_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTestXe2HpgCore, - givenEventProvidedWhenEnqueueKernelNotUsingSystemMemoryThenPostSyncFenceRequestNotDispatched) { - using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; - using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; - using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; - if (pClDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { - GTEST_SKIP(); - } - - MockKernelWithInternals kernel(*pClDevice); - MockContext context(pClDevice); - MockCommandQueueHw commandQueue(&context, pClDevice, nullptr); - auto &commandStreamReceiver = pClDevice->getUltCommandStreamReceiver(); - - size_t globalWorkSize[3] = {1, 1, 1}; - cl_event kernelEvent{}; - commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &kernelEvent); - - ClHardwareParse hwParser; - hwParser.parseCommands(commandQueue); - - auto itorSystemMemFenceAddress = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress); - auto systemMemFenceAddressCmd = genCmdCast(*itorSystemMemFenceAddress); - EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress()); - - auto itorComputeWalker = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker); - auto walkerCmd = genCmdCast(*itorComputeWalker); - auto &postSyncData = walkerCmd->getPostSync(); - EXPECT_FALSE(postSyncData.getSystemMemoryFenceRequest()); - - if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(pClDevice->getRootDeviceEnvironment()) > 0) { - auto itorMiMemFence = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence); - auto fenceCmd = genCmdCast(*itorMiMemFence); - ASSERT_NE(nullptr, fenceCmd); - EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE_FENCE, fenceCmd->getFenceType()); - } - - auto event = castToObject(kernelEvent); - event->release(); -} - struct Xe2MidThreadCommandStreamReceiverTest : public UltCommandStreamReceiverTest { void SetUp() override { debugManager.flags.ForcePreemptionMode.set(static_cast(PreemptionMode::MidThread)); diff --git a/shared/source/helpers/gfx_core_helper_pvc_and_later.inl b/shared/source/helpers/gfx_core_helper_pvc_and_later.inl index 3bb4fdc705..15f8153e37 100644 --- a/shared/source/helpers/gfx_core_helper_pvc_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_pvc_and_later.inl @@ -24,7 +24,9 @@ bool GfxCoreHelperHw::isFenceAllocationRequired(const HardwareInfo &hwIn (debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 0)) { return false; } - return !hwInfo.capabilityTable.isIntegratedDevice; + return productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) || + productHelper.isGlobalFenceInDirectSubmissionRequired(hwInfo) || + productHelper.isGlobalFenceInPostSyncRequired(hwInfo); } template diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index ba96ac25c2..6a80c608fa 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -4472,7 +4472,7 @@ HWTEST2_F(CommandStreamReceiverHwTest, givenImmediateFlushTaskWhenOneTimeContextSystemFenceRequiredThenExpectOneTimeSystemFenceCommand, IsHeapfulSupportedAndAtLeastXeHpcCore) { using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; - if (pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { + if (!pDevice->getGfxCoreHelper().isFenceAllocationRequired(pDevice->getHardwareInfo(), pDevice->getProductHelper())) { GTEST_SKIP(); } diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index ed9843afc5..6bd4fea9a8 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -219,10 +219,14 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea DebugManagerStateRestore restorer; debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(1); - if (heaplessStateInit || pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice) { + if (heaplessStateInit) { GTEST_SKIP(); } + if (!pDevice->getDefaultEngine().commandStreamReceiver->getGlobalFenceAllocation()) { + pDevice->getDefaultEngine().commandStreamReceiver->createGlobalFenceAllocation(); + } + MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); EXPECT_TRUE(directSubmission.miMemFenceRequired); @@ -232,6 +236,8 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetToTrueWhenCrea EXPECT_EQ(directSubmission.systemMemoryFenceAddressSet, directSubmission.globalFenceAllocation != nullptr); EXPECT_TRUE(directSubmission.miMemFenceRequired); + + reinterpret_cast *>(pDevice->getDefaultEngine().commandStreamReceiver)->cleanupResources(); } HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest, diff --git a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp index 84b2f2e25c..50538ef1b0 100644 --- a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp +++ b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp @@ -486,7 +486,7 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenGfxCoreHelperWhenAskedIfFe debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(-1); debugManager.flags.ProgramGlobalFenceAsKernelInstructionInEUKernel.set(-1); debugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(-1); - EXPECT_EQ(gfxCoreHelper.isFenceAllocationRequired(hwInfo, productHelper), !hwInfo.capabilityTable.isIntegratedDevice); + EXPECT_FALSE(gfxCoreHelper.isFenceAllocationRequired(hwInfo, productHelper)); debugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0); debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.set(0);