From 9647322cbf665f20518ee86bc34b848add0dbf0b Mon Sep 17 00:00:00 2001 From: Szymon Morek Date: Wed, 17 Sep 2025 11:03:55 +0000 Subject: [PATCH] performance: remove cache flush before stopping ULLS Related-To: NEO-16153 Signed-off-by: Szymon Morek --- .../direct_submission_hw.inl | 2 -- .../dispatchers/blitter_dispatcher.h | 2 -- .../dispatchers/blitter_dispatcher.inl | 10 ------- .../dispatchers/render_dispatcher.h | 2 -- .../dispatchers/render_dispatcher.inl | 11 -------- shared/source/helpers/gfx_core_helper.h | 1 - .../source/helpers/gfx_core_helper_base.inl | 15 ----------- .../direct_submission_tests_1.cpp | 12 --------- .../dispatchers/blitter_dispatcher_tests.cpp | 20 -------------- .../dispatchers/render_dispatcher_tests.cpp | 26 ------------------- .../linux/drm_direct_submission_tests.cpp | 1 - .../windows/wddm_direct_submission_tests.cpp | 1 - .../helpers/gfx_core_helper_tests.cpp | 21 --------------- 13 files changed, 124 deletions(-) diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index c372988234..e65517f17d 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -264,7 +264,6 @@ bool DirectSubmissionHw::stopRingBuffer(bool blocking) { dispatchRelaxedOrderingQueueStall(); } - Dispatcher::dispatchCacheFlush(ringCommandStream, this->rootDeviceEnvironment, gpuVaForMiFlush); dispatchStopRingBufferSection(); Dispatcher::dispatchStopCommandBuffer(ringCommandStream); @@ -360,7 +359,6 @@ inline size_t DirectSubmissionHw::getSizeSwitchRingBuffer template inline size_t DirectSubmissionHw::getSizeEnd(bool relaxedOrderingSchedulerRequired) { size_t size = Dispatcher::getSizeStopCommandBuffer() + - Dispatcher::getSizeCacheFlush(rootDeviceEnvironment) + (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) + MemoryConstants::cacheLineSize + dispatchStopRingBufferSectionSize(); if (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) { diff --git a/shared/source/direct_submission/dispatchers/blitter_dispatcher.h b/shared/source/direct_submission/dispatchers/blitter_dispatcher.h index 7c1ef26bd9..9ec2657036 100644 --- a/shared/source/direct_submission/dispatchers/blitter_dispatcher.h +++ b/shared/source/direct_submission/dispatchers/blitter_dispatcher.h @@ -28,9 +28,7 @@ class BlitterDispatcher : public Dispatcher { bool notifyKmd); static size_t getSizeMonitorFence(const RootDeviceEnvironment &rootDeviceEnvironment); - static void dispatchCacheFlush(LinearStream &cmdBuffer, const RootDeviceEnvironment &rootDeviceEnvironment, uint64_t address); static void dispatchTlbFlush(LinearStream &cmdBuffer, uint64_t address, const RootDeviceEnvironment &rootDeviceEnvironment); - static size_t getSizeCacheFlush(const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getSizeTlbFlush(const RootDeviceEnvironment &rootDeviceEnvironment); static bool isMultiTileSynchronizationSupported() { return false; diff --git a/shared/source/direct_submission/dispatchers/blitter_dispatcher.inl b/shared/source/direct_submission/dispatchers/blitter_dispatcher.inl index 8d9cb04ca5..0e34de3bdf 100644 --- a/shared/source/direct_submission/dispatchers/blitter_dispatcher.inl +++ b/shared/source/direct_submission/dispatchers/blitter_dispatcher.inl @@ -46,11 +46,6 @@ inline size_t BlitterDispatcher::getSizeMonitorFence(const RootDevice return size; } -template -inline void BlitterDispatcher::dispatchCacheFlush(LinearStream &cmdBuffer, const RootDeviceEnvironment &rootDeviceEnvironment, uint64_t address) { - dispatchTlbFlush(cmdBuffer, address, rootDeviceEnvironment); -} - template inline void BlitterDispatcher::dispatchTlbFlush(LinearStream &cmdBuffer, uint64_t address, const RootDeviceEnvironment &rootDeviceEnvironment) { NEO::EncodeDummyBlitWaArgs waArgs{false, const_cast(&rootDeviceEnvironment)}; @@ -60,11 +55,6 @@ inline void BlitterDispatcher::dispatchTlbFlush(LinearStream &cmdBuff EncodeMiFlushDW::programWithWa(cmdBuffer, address, 0ull, args); } -template -inline size_t BlitterDispatcher::getSizeCacheFlush(const RootDeviceEnvironment &rootDeviceEnvironment) { - return getSizeTlbFlush(rootDeviceEnvironment); -} - template inline size_t BlitterDispatcher::getSizeTlbFlush(const RootDeviceEnvironment &rootDeviceEnvironment) { EncodeDummyBlitWaArgs waArgs{false, const_cast(&rootDeviceEnvironment)}; diff --git a/shared/source/direct_submission/dispatchers/render_dispatcher.h b/shared/source/direct_submission/dispatchers/render_dispatcher.h index f874aa5bb0..d2c5899ea1 100644 --- a/shared/source/direct_submission/dispatchers/render_dispatcher.h +++ b/shared/source/direct_submission/dispatchers/render_dispatcher.h @@ -28,9 +28,7 @@ class RenderDispatcher : public Dispatcher { bool notifyKmd); static size_t getSizeMonitorFence(const RootDeviceEnvironment &rootDeviceEnvironment); - static void dispatchCacheFlush(LinearStream &cmdBuffer, const RootDeviceEnvironment &rootDeviceEnvironment, uint64_t address); static void dispatchTlbFlush(LinearStream &cmdBuffer, uint64_t address, const RootDeviceEnvironment &rootDeviceEnvironment); - static size_t getSizeCacheFlush(const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getSizeTlbFlush(const RootDeviceEnvironment &rootDeviceEnvironment); static bool isMultiTileSynchronizationSupported() { return true; diff --git a/shared/source/direct_submission/dispatchers/render_dispatcher.inl b/shared/source/direct_submission/dispatchers/render_dispatcher.inl index b5fb9b7504..da1b8ea823 100644 --- a/shared/source/direct_submission/dispatchers/render_dispatcher.inl +++ b/shared/source/direct_submission/dispatchers/render_dispatcher.inl @@ -52,11 +52,6 @@ inline size_t RenderDispatcher::getSizeMonitorFence(const RootDeviceE return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, NEO::PostSyncMode::immediateData); } -template -inline void RenderDispatcher::dispatchCacheFlush(LinearStream &cmdBuffer, const RootDeviceEnvironment &rootDeviceEnvironment, uint64_t address) { - MemorySynchronizationCommands::addFullCacheFlush(cmdBuffer, rootDeviceEnvironment); -} - template inline void RenderDispatcher::dispatchTlbFlush(LinearStream &cmdBuffer, uint64_t address, const RootDeviceEnvironment &rootDeviceEnvironment) { PipeControlArgs args; @@ -67,12 +62,6 @@ inline void RenderDispatcher::dispatchTlbFlush(LinearStream &cmdBuffe MemorySynchronizationCommands::addSingleBarrier(cmdBuffer, args); } -template -inline size_t RenderDispatcher::getSizeCacheFlush(const RootDeviceEnvironment &rootDeviceEnvironment) { - size_t size = MemorySynchronizationCommands::getSizeForSingleBarrier(); - return size; -} - template inline size_t RenderDispatcher::getSizeTlbFlush(const RootDeviceEnvironment &rootDeviceEnvironment) { return MemorySynchronizationCommands::getSizeForSingleBarrier(); diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index b16adb29f3..b4e5bfc46c 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -549,7 +549,6 @@ struct MemorySynchronizationCommands { static bool getDcFlushEnable(bool isFlushPreferred, const RootDeviceEnvironment &rootDeviceEnvironment); - static void addFullCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment); static void setCacheFlushExtraProperties(PipeControlArgs &args); static void addStateCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment); static void addInstructionCacheFlush(LinearStream &commandStream); diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index a6c235e452..10c64cd353 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -579,21 +579,6 @@ size_t GfxCoreHelperHw::getSingleTimestampPacketSizeHw() { return TimestampPackets::getSinglePacketSize(); } -template -void MemorySynchronizationCommands::addFullCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) { - PipeControlArgs args; - args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, rootDeviceEnvironment); - args.renderTargetCacheFlushEnable = true; - args.instructionCacheInvalidateEnable = true; - args.textureCacheInvalidationEnable = true; - args.pipeControlFlushEnable = true; - args.constantCacheInvalidationEnable = true; - args.stateCacheInvalidationEnable = true; - args.tlbInvalidation = true; - MemorySynchronizationCommands::setCacheFlushExtraProperties(args); - MemorySynchronizationCommands::addSingleBarrier(commandStream, args); -} - template void MemorySynchronizationCommands::addStateCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) { using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp index 7d9a324640..df9afe4201 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp @@ -449,17 +449,6 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWhenDispatchSwitchRingBuffer EXPECT_EQ(directSubmission.getSizeSwitchRingBufferSection(), directSubmission.ringCommandStream.getUsed()); } -HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWhenDispatchFlushSectionThenExpectCorrectSizeUsed) { - using Dispatcher = RenderDispatcher; - MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); - - bool ret = directSubmission.initialize(false); - EXPECT_TRUE(ret); - - Dispatcher::dispatchCacheFlush(directSubmission.ringCommandStream, pDevice->getRootDeviceEnvironment(), 0ull); - EXPECT_EQ(Dispatcher::getSizeCacheFlush(pDevice->getRootDeviceEnvironment()), directSubmission.ringCommandStream.getUsed()); -} - HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWhenDispatchTagUpdateSectionThenExpectCorrectSizeUsed) { using Dispatcher = RenderDispatcher; MockDirectSubmissionHw @@ -513,7 +502,6 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWhenGetEndSizeThenExpectCorr MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); size_t expectedSize = Dispatcher::getSizeStopCommandBuffer() + - Dispatcher::getSizeCacheFlush(directSubmission.rootDeviceEnvironment) + (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) + MemoryConstants::cacheLineSize; size_t actualSize = directSubmission.getSizeEnd(false); diff --git a/shared/test/unit_test/direct_submission/dispatchers/blitter_dispatcher_tests.cpp b/shared/test/unit_test/direct_submission/dispatchers/blitter_dispatcher_tests.cpp index 241339c39c..9050a74caf 100644 --- a/shared/test/unit_test/direct_submission/dispatchers/blitter_dispatcher_tests.cpp +++ b/shared/test/unit_test/direct_submission/dispatchers/blitter_dispatcher_tests.cpp @@ -59,26 +59,6 @@ HWTEST_F(BlitterDispatcheTest, givenBlitterWhenDispatchingMonitorFenceCmdThenDis } EXPECT_TRUE(foundPostSync); } -HWTEST_F(BlitterDispatcheTest, givenBlitterWhenAskingForCacheFlushCmdSizeThenReturnExpetedSize) { - EncodeDummyBlitWaArgs waArgs{false, &(pDevice->getRootDeviceEnvironmentRef())}; - size_t expectedSize = EncodeMiFlushDW::getCommandSizeWithWa(waArgs); - EXPECT_EQ(expectedSize, BlitterDispatcher::getSizeCacheFlush(pDevice->getRootDeviceEnvironment())); -} - -HWTEST_F(BlitterDispatcheTest, givenBlitterWhenDispatchingCacheFlushCmdThenDispatchMiFlushCommand) { - using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; - EncodeDummyBlitWaArgs waArgs{false, &(pDevice->getRootDeviceEnvironmentRef())}; - size_t expectedSize = EncodeMiFlushDW::getCommandSizeWithWa(waArgs); - - BlitterDispatcher::dispatchCacheFlush(cmdBuffer, pDevice->getRootDeviceEnvironment(), 0ull); - - EXPECT_EQ(expectedSize, cmdBuffer.getUsed()); - - HardwareParse hwParse; - hwParse.parseCommands(cmdBuffer); - auto commandsList = hwParse.getCommandsList(); - EXPECT_LE(1u, commandsList.size()); -} HWTEST_F(BlitterDispatcheTest, givenBlitterWhenDispatchingTlbFlushThenDispatchMiFlushCommandWithproperBits) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; diff --git a/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp b/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp index 2443fcc025..bbc2a965cb 100644 --- a/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp +++ b/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp @@ -86,32 +86,6 @@ HWTEST_F(RenderDispatcherTest, givenRenderWhenAddingMonitorFenceCmdThenExpectPip EXPECT_TRUE(foundMonitorFence); } -HWTEST_F(RenderDispatcherTest, givenRenderWhenAddingCacheFlushCmdThenExpectPipeControlWithProperFields) { - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - - RenderDispatcher::dispatchCacheFlush(cmdBuffer, pDevice->getRootDeviceEnvironment(), 0ull); - - HardwareParse hwParse; - hwParse.parseCommands(cmdBuffer); - - bool foundCacheFlush = false; - for (auto &it : hwParse.cmdList) { - PIPE_CONTROL *pipeControl = genCmdCast(it); - if (pipeControl) { - foundCacheFlush = - pipeControl->getRenderTargetCacheFlushEnable() && - pipeControl->getInstructionCacheInvalidateEnable() && - pipeControl->getTextureCacheInvalidationEnable() && - pipeControl->getPipeControlFlushEnable() && - pipeControl->getStateCacheInvalidationEnable(); - if (foundCacheFlush) { - break; - } - } - } - EXPECT_TRUE(foundCacheFlush); -} - HWCMDTEST_F(IGFX_XE_HP_CORE, RenderDispatcherTest, givenRenderDispatcherPartitionedWorkloadFlagTrueWhenAddingMonitorFenceCmdThenExpectPipeControlWithProperAddressAndValueAndPartitionParameter) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; diff --git a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp index e96e3aa903..3b92fd6ef3 100644 --- a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp @@ -1251,7 +1251,6 @@ HWTEST2_F(DrmDirectSubmissionTest, givenRelaxedOrderingSchedulerRequiredWhenAski EXPECT_EQ(expectedBaseSemaphoreSectionSize + EncodeSemaphore::getSizeMiSemaphoreWait(), directSubmission.getSizeSemaphoreSection(false)); size_t expectedBaseEndSize = Dispatcher::getSizeStopCommandBuffer() + - Dispatcher::getSizeCacheFlush(directSubmission.rootDeviceEnvironment) + (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) + MemoryConstants::cacheLineSize + Dispatcher::getSizeMonitorFence(device->getRootDeviceEnvironment()); EXPECT_EQ(expectedBaseEndSize + directSubmission.getSizeDispatchRelaxedOrderingQueueStall(), directSubmission.getSizeEnd(true)); diff --git a/shared/test/unit_test/direct_submission/windows/wddm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/windows/wddm_direct_submission_tests.cpp index be45697c74..b20fb120be 100644 --- a/shared/test/unit_test/direct_submission/windows/wddm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/windows/wddm_direct_submission_tests.cpp @@ -1060,7 +1060,6 @@ HWTEST2_F(WddmDirectSubmissionTest, givenRelaxedOrderingSchedulerRequiredWhenAsk EXPECT_EQ(expectedBaseSemaphoreSectionSize + EncodeSemaphore::getSizeMiSemaphoreWait(), directSubmission.getSizeSemaphoreSection(false)); size_t expectedBaseEndSize = Dispatcher::getSizeStopCommandBuffer() + - Dispatcher::getSizeCacheFlush(directSubmission.rootDeviceEnvironment) + (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) + MemoryConstants::cacheLineSize + 2 * Dispatcher::getSizeMonitorFence(device->getRootDeviceEnvironment()); EXPECT_EQ(expectedBaseEndSize + directSubmission.getSizeDispatchRelaxedOrderingQueueStall(), directSubmission.getSizeEnd(true)); diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index b43334571b..042330033d 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1078,27 +1078,6 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, GfxCoreHelperTest, WhenIsFusedEuDispatchEnabledIs EXPECT_FALSE(gfxCoreHelper.isFusedEuDispatchEnabled(hardwareInfo, false)); } -HWTEST_F(PipeControlHelperTests, WhenProgrammingCacheFlushThenExpectBasicFieldsSet) { - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - auto buffer = std::make_unique(128); - - LinearStream stream(buffer.get(), 128); - MockExecutionEnvironment mockExecutionEnvironment{}; - MemorySynchronizationCommands::addFullCacheFlush(stream, *mockExecutionEnvironment.rootDeviceEnvironments[0]); - PIPE_CONTROL *pipeControl = genCmdCast(buffer.get()); - ASSERT_NE(nullptr, pipeControl); - - EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); - EXPECT_EQ(MemorySynchronizationCommands::getDcFlushEnable(true, *mockExecutionEnvironment.rootDeviceEnvironments[0]), pipeControl->getDcFlushEnable()); - - EXPECT_TRUE(pipeControl->getRenderTargetCacheFlushEnable()); - EXPECT_TRUE(pipeControl->getInstructionCacheInvalidateEnable()); - EXPECT_TRUE(pipeControl->getTextureCacheInvalidationEnable()); - EXPECT_TRUE(pipeControl->getPipeControlFlushEnable()); - EXPECT_TRUE(pipeControl->getStateCacheInvalidationEnable()); - EXPECT_TRUE(pipeControl->getTlbInvalidate()); -} - HWTEST_F(PipeControlHelperTests, WhenGettingPipeControSizeForInstructionCacheFlushThenReturnCorrectValue) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; size_t actualSize = MemorySynchronizationCommands::getSizeForInstructionCacheFlush();