From 4c795027e3163b7356d93b9685e23251a4a57ece Mon Sep 17 00:00:00 2001 From: Kamil Kopryk Date: Wed, 5 Mar 2025 15:46:03 +0000 Subject: [PATCH] refactor: add check if event L3 flush is needed Signed-off-by: Kamil Kopryk --- level_zero/core/source/cmdlist/cmdlist.h | 1 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 6 ++++-- .../source/cmdlist/cmdlist_hw_skl_to_tgllp.inl | 1 + .../source/cmdlist/cmdlist_hw_xehp_and_later.inl | 10 +++++++--- .../source/gfx_core_helpers/l0_gfx_core_helper.cpp | 8 +++++--- .../source/gfx_core_helpers/l0_gfx_core_helper.h | 4 ++-- .../l0_gfx_core_helper_xehp_and_later.inl | 14 +++++++++++--- .../core/test/unit_tests/mocks/mock_cmdlist.h | 3 +++ .../unit_tests/sources/cmdlist/test_cmdlist_5.cpp | 9 +++++++-- .../cmdlist/test_cmdlist_append_wait_on_events.cpp | 8 +++++--- .../test_cmdlist_copy_event_xehp_and_later.cpp | 4 +++- .../test_cmdlist_fill_event_xehp_and_later.cpp | 4 +++- .../sources/cmdlist/test_in_order_cmdlist_2.cpp | 4 ++++ .../sources/helper/l0_gfx_core_helper_tests.cpp | 13 ++++++++++++- shared/source/command_container/command_encoder.h | 4 ++++ .../command_container/command_encoder_enablers.inl | 1 + ...ommand_encoder_from_xe_hpg_core_to_xe3_core.inl | 4 ++++ .../command_encoder_xehp_and_later.inl | 1 + .../source/debug_settings/debug_variables_base.inl | 1 + shared/source/gen12lp/command_encoder_gen12lp.cpp | 4 ++++ shared/source/os_interface/product_helper.h | 1 + shared/source/os_interface/product_helper.inl | 5 +++++ shared/source/os_interface/product_helper_hw.h | 1 + shared/test/common/test_files/igdrcl.config | 1 + .../unit_test/fixtures/command_container_fixture.h | 1 + 25 files changed, 92 insertions(+), 21 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index e2f7a0180b..ad41c8b1cd 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -505,6 +505,7 @@ struct CommandList : _ze_command_list_handle_t { bool statelessBuiltinsEnabled = false; bool localDispatchSupport = false; bool copyOperationOffloadEnabled = false; + bool l3FlushAfterPostSyncRequired = false; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index cbe57a3c23..09cb6b6afd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -239,7 +239,6 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->pipelineSelectStateTracking = L0GfxCoreHelper::enablePipelineSelectStateTracking(rootDeviceEnvironment); this->stateBaseAddressTracking = L0GfxCoreHelper::enableStateBaseAddressTracking(rootDeviceEnvironment); this->pipeControlMultiKernelEventSync = L0GfxCoreHelper::usePipeControlMultiKernelEventSync(hwInfo); - this->compactL3FlushEventPacket = L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo); this->signalAllEventPackets = L0GfxCoreHelper::useSignalAllEventPackets(hwInfo); this->dynamicHeapRequired = NEO::EncodeDispatchKernel::isDshNeeded(device->getDeviceInfo()); this->doubleSbaWa = productHelper.isAdditionalStateBaseAddressWARequired(hwInfo); @@ -261,6 +260,9 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->defaultPipelinedThreadArbitrationPolicy = gfxCoreHelper.getDefaultThreadArbitrationPolicy(); this->implicitSynchronizedDispatchForCooperativeKernelsAllowed = l0GfxCoreHelper.implicitSynchronizedDispatchForCooperativeKernelsAllowed(); this->maxLocalSubRegionSize = productHelper.getMaxLocalSubRegionSize(hwInfo); + this->l3FlushAfterPostSyncRequired = productHelper.isL3FlushAfterPostSyncRequired(heaplessModeEnabled); + this->compactL3FlushEventPacket = L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo, this->l3FlushAfterPostSyncRequired); + if (NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get() != -1) { this->defaultPipelinedThreadArbitrationPolicy = NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get(); } @@ -2771,7 +2773,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu if (isCopyOnly(copyOffloadOperation)) { NEO::MiFlushArgs args{this->dummyBlitWa}; encodeMiFlush(0, 0, args); - } else { + } else if (!this->l3FlushAfterPostSyncRequired) { NEO::PipeControlArgs args; args.dcFlushEnable = true; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index 33bbe1e5f1..ba2263d98f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -229,6 +229,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K .interruptEvent = false, .immediateScratchAddressPatching = !this->scratchAddressPatchingEnabled, .makeCommandView = false, + .isFlushL3AfterPostSyncEnabled = false, }; NEO::EncodeDispatchKernel::encodeCommon(commandContainer, dispatchKernelArgs); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index b58112695f..17d9513e6c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -178,7 +178,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K uint64_t eventAddress = 0; bool isTimestampEvent = false; - bool l3FlushEnable = false; + bool l3FlushInPipeControlEnable = false; bool isHostSignalScopeEvent = launchParams.isHostSignalScopeEvent; bool interruptEvent = false; Event *compactEvent = nullptr; @@ -206,7 +206,9 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K bool flushRequired = event->isSignalScope() && !launchParams.isKernelSplitOperation; - l3FlushEnable = getDcFlushRequired(flushRequired); + + l3FlushInPipeControlEnable = getDcFlushRequired(flushRequired) && + !l3FlushAfterPostSyncRequired; interruptEvent = event->isInterruptModeEnabled(); } } @@ -388,6 +390,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K .interruptEvent = interruptEvent, .immediateScratchAddressPatching = !this->scratchAddressPatchingEnabled, .makeCommandView = launchParams.makeKernelCommandView, + .isFlushL3AfterPostSyncEnabled = this->l3FlushAfterPostSyncRequired, }; setAdditionalDispatchKernelArgsFromLaunchParams(dispatchKernelArgs, launchParams); @@ -428,7 +431,8 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K } } else if (event) { event->setPacketsInUse(partitionCount); - if (l3FlushEnable) { + + if (l3FlushInPipeControlEnable) { programEventL3Flush(event); } if (!launchParams.isKernelSplitOperation) { diff --git a/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.cpp b/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.cpp index 36756c2133..60a0f862e3 100644 --- a/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.cpp +++ b/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -74,11 +74,13 @@ bool L0GfxCoreHelper::usePipeControlMultiKernelEventSync(const NEO::HardwareInfo return true; } -bool L0GfxCoreHelper::useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo) { +bool L0GfxCoreHelper::useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo, bool flushL3AfterPostSync) { + if (NEO::debugManager.flags.CompactL3FlushEventPacket.get() != -1) { return !!NEO::debugManager.flags.CompactL3FlushEventPacket.get(); } - return true; + + return !flushL3AfterPostSync; } bool L0GfxCoreHelper::useDynamicEventPacketsCount(const NEO::HardwareInfo &hwInfo) { diff --git a/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h b/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h index 1b30c3d0cc..983c10db4d 100644 --- a/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h +++ b/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -70,7 +70,7 @@ class L0GfxCoreHelper : public NEO::ApiGfxCoreHelper { static bool enableStateBaseAddressTracking(const NEO::RootDeviceEnvironment &rootDeviceEnvironment); static bool enableImmediateCmdListHeapSharing(const NEO::RootDeviceEnvironment &rootDeviceEnvironment, bool cmdlistSupport); static bool usePipeControlMultiKernelEventSync(const NEO::HardwareInfo &hwInfo); - static bool useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo); + static bool useCompactL3FlushEventPacket(const NEO::HardwareInfo &hwInfo, bool flushL3AfterPostSync); static bool useDynamicEventPacketsCount(const NEO::HardwareInfo &hwInfo); static bool useSignalAllEventPackets(const NEO::HardwareInfo &hwInfo); static NEO::HeapAddressModel getHeapAddressModel(const NEO::RootDeviceEnvironment &rootDeviceEnvironment); diff --git a/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper_xehp_and_later.inl b/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper_xehp_and_later.inl index 448a27525b..075ca6823f 100644 --- a/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper_xehp_and_later.inl +++ b/level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper_xehp_and_later.inl @@ -1,12 +1,14 @@ /* - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "shared/source/execution_environment/root_device_environment.h" +#include "shared/source/helpers/compiler_product_helper.h" #include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/os_interface/product_helper.h" #include "level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h" @@ -48,10 +50,16 @@ uint32_t L0GfxCoreHelperHw::getEventMaxKernelCount(const NEO::HardwareIn template uint32_t L0GfxCoreHelperHw::getEventBaseMaxPacketCount(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const { + + auto &compilerProductHelper = rootDeviceEnvironment.getHelper(); + auto &productHelper = rootDeviceEnvironment.getProductHelper(); auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo(); + auto heaplessEnabled = compilerProductHelper.isHeaplessModeEnabled(); + bool flushL3AfterPostSync = productHelper.isL3FlushAfterPostSyncRequired(heaplessEnabled); + uint32_t basePackets = getEventMaxKernelCount(hwInfo); - if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, rootDeviceEnvironment)) { - basePackets += L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo) ? 0 : 1; + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, rootDeviceEnvironment) && !flushL3AfterPostSync) { + basePackets += L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo, flushL3AfterPostSync) ? 0 : 1; } return basePackets; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 8e80dd8f91..fe9f3dd4ca 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -100,6 +100,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::isSyncModeQueue; using BaseClass::isTbxMode; using BaseClass::isTimestampEventForMultiTile; + using BaseClass::l3FlushAfterPostSyncRequired; using BaseClass::latestOperationRequiredNonWalkerInOrderCmdsChaining; using BaseClass::obtainKernelPreemptionMode; using BaseClass::partitionCount; @@ -283,6 +284,7 @@ struct WhiteBox<::L0::CommandListImp> : public ::L0::CommandListImp { using BaseClass::commandContainer; using BaseClass::commandListPreemptionMode; using BaseClass::commandsToPatch; + using BaseClass::compactL3FlushEventPacket; using BaseClass::copyOperationOffloadEnabled; using BaseClass::copyThroughLockedPtrEnabled; using BaseClass::currentBindingTablePoolBaseAddress; @@ -304,6 +306,7 @@ struct WhiteBox<::L0::CommandListImp> : public ::L0::CommandListImp { using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isSyncModeQueue; using BaseClass::isTbxMode; + using BaseClass::l3FlushAfterPostSyncRequired; using BaseClass::minimalSizeForBcsSplit; using BaseClass::partitionCount; using BaseClass::pipelineSelectStateTracking; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp index f555b4c9e0..7d79528f3b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp @@ -642,6 +642,9 @@ HWTEST_F(CommandListCreate, givenCommandListyWhenAppendWaitEventsWithDcFlushThen using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + DebugManagerStateRestore restorer; + NEO::debugManager.flags.ForceL3FlushAfterPostSync.set(0); + ze_result_t returnValue; std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false)); auto &commandContainer = commandList->getCmdContainer(); @@ -688,7 +691,9 @@ HWTEST_F(CommandListCreate, givenCommandListWhenAppendWaitEventsWithDcFlushThenP auto itor = find(cmdList.begin(), cmdList.end()); EXPECT_NE(cmdList.end(), itor); - if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment())) { + auto whiteBoxCmdList = static_cast(commandList.get()); + + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment()) && !whiteBoxCmdList->l3FlushAfterPostSyncRequired) { itor--; EXPECT_NE(nullptr, genCmdCast(*itor)); } else { @@ -720,7 +725,7 @@ HWTEST_F(CommandListCreate, givenAsyncCmdQueueAndImmediateCommandListWhenAppendW EXPECT_NE(nullptr, whiteBoxCmdList->cmdQImmediate); size_t expectedUsed = 2 * NEO::EncodeSemaphore::getSizeMiSemaphoreWait() + sizeof(MI_BATCH_BUFFER_END); - if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment())) { + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment()) && !whiteBoxCmdList->l3FlushAfterPostSyncRequired) { expectedUsed += sizeof(PIPE_CONTROL); } expectedUsed = alignUp(expectedUsed, 64); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp index 78ed0bb215..0ae3937a86 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp @@ -562,7 +562,9 @@ HWTEST_F(CommandListAppendWaitOnEvent, givenEventWithWaitScopeFlagDeviceWhenAppe auto itor = find(cmdList.begin(), cmdList.end()); EXPECT_NE(cmdList.end(), itor); - if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment())) { + auto whiteBoxCmdList = static_cast(commandList.get()); + + if (NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment()) && !whiteBoxCmdList->l3FlushAfterPostSyncRequired) { itor--; auto cmd = genCmdCast(*itor); @@ -759,7 +761,7 @@ HWTEST_F(CommandListAppendWaitOnSecondaryBatchBufferEvent, givenCommandBufferIsE commandList->getCmdContainer().getCommandStream()->getSpace(consumeSpace); size_t expectedConsumedSpace = NEO::EncodeSemaphore::getSizeMiSemaphoreWait(); - if (MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment())) { + if (MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment()) && !commandList->l3FlushAfterPostSyncRequired) { expectedConsumedSpace += sizeof(PIPE_CONTROL); } @@ -791,7 +793,7 @@ HWTEST_F(CommandListAppendWaitOnSecondaryBatchBufferEvent, givenCommandBufferIsE usedSpaceAfter)); auto itorPC = find(cmdList.begin(), cmdList.end()); - if (MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment())) { + if (MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment()) && !commandList->l3FlushAfterPostSyncRequired) { ASSERT_NE(cmdList.end(), itorPC); { auto cmd = genCmdCast(*itorPC); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp index a1a8539f33..ff79e02cdc 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -45,6 +45,8 @@ struct AppendMemoryCopyMultiPacketEventFixture : public DeviceFixture { void setUp() { debugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); debugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); + debugManager.flags.ForceL3FlushAfterPostSync.set(0); + if constexpr (multiTile == 1) { debugManager.flags.CreateMultipleSubDevices.set(2); debugManager.flags.EnableImplicitScaling.set(1); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp index 5ed00969be..5796ebb5be 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill_event_xehp_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -45,6 +45,8 @@ struct AppendFillMultiPacketEventFixture : public AppendFillFixture { void setUp() { debugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); debugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); + debugManager.flags.ForceL3FlushAfterPostSync.set(0); + if constexpr (multiTile == 1) { debugManager.flags.CreateMultipleSubDevices.set(2); debugManager.flags.EnableImplicitScaling.set(1); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index 1a8d9f4a9f..37973897e8 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -1863,6 +1863,10 @@ HWTEST2_F(StandaloneInOrderTimestampAllocationTests, givenDebugFlagSetToZeroWhen } HWTEST2_F(StandaloneInOrderTimestampAllocationTests, givenNonWalkerCounterSignalingWhenPassedNonProfilingEventThenNotAssignAllocation, IsAtLeastXeHpCore) { + + DebugManagerStateRestore restorer; + NEO::debugManager.flags.ForceL3FlushAfterPostSync.set(0); + auto eventPool = createEvents(1, false); auto eventHandle = events[0]->toHandle(); diff --git a/level_zero/core/test/unit_tests/sources/helper/l0_gfx_core_helper_tests.cpp b/level_zero/core/test/unit_tests/sources/helper/l0_gfx_core_helper_tests.cpp index acce7697c5..1d43892cd3 100644 --- a/level_zero/core/test/unit_tests/sources/helper/l0_gfx_core_helper_tests.cpp +++ b/level_zero/core/test/unit_tests/sources/helper/l0_gfx_core_helper_tests.cpp @@ -8,8 +8,10 @@ #include "shared/source/command_container/implicit_scaling.h" #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/compiler_product_helper.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/ptr_math.h" +#include "shared/source/os_interface/product_helper.h" #include "shared/source/release_helper/release_helper.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" @@ -1013,10 +1015,18 @@ TEST_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperWhenGettingDefaultValueForUsePip TEST_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperWhenGettingDefaultValueForCompactL3FlushEventPacketThenReturnTrue) { auto hwInfo = *NEO::defaultHwInfo.get(); - bool defaultValue = L0::L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo); + bool useL3FlushAfterPostSync = false; + bool defaultValue = L0::L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo, useL3FlushAfterPostSync); EXPECT_TRUE(defaultValue); } +TEST_F(L0GfxCoreHelperTest, givenL3FlushAfterPostSyncWhenUseCompactL3FlushEventPacketThenFalseIsReturned) { + auto hwInfo = *NEO::defaultHwInfo.get(); + bool useL3FlushAfterPostSync = true; + bool compactL3FlushEventEnabled = L0::L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo, useL3FlushAfterPostSync); + EXPECT_FALSE(compactL3FlushEventEnabled); +} + TEST_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperWhenGettingDefaultValueForDynamicEventPacketCountThenReturnTrue) { auto hwInfo = *NEO::defaultHwInfo.get(); bool defaultValue = L0::L0GfxCoreHelper::useDynamicEventPacketsCount(hwInfo); @@ -1036,6 +1046,7 @@ struct L0GfxCoreHelperMultiPacketEventFixture { void setUp() { debugManager.flags.UsePipeControlMultiKernelEventSync.set(usePipeControlMultiPacketEventSync); debugManager.flags.CompactL3FlushEventPacket.set(compactL3FlushEventPacket); + debugManager.flags.ForceL3FlushAfterPostSync.set(0); } void tearDown() { diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 97a1d140a9..62637c73a3 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -89,6 +89,7 @@ struct EncodeDispatchKernelArgs { bool interruptEvent = false; bool immediateScratchAddressPatching = false; bool makeCommandView = false; + bool isFlushL3AfterPostSyncEnabled = false; bool requiresSystemMemoryFence() const { return (isHostScopeSignalEvent && isKernelUsingSystemAllocation); @@ -188,6 +189,9 @@ struct EncodeDispatchKernel { template static void adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); + template + static void encodeL3FlushAfterPostSync(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); + template static void setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); diff --git a/shared/source/command_container/command_encoder_enablers.inl b/shared/source/command_container/command_encoder_enablers.inl index 964be1eb76..9e9e81d348 100644 --- a/shared/source/command_container/command_encoder_enablers.inl +++ b/shared/source/command_container/command_encoder_enablers.inl @@ -11,6 +11,7 @@ template struct NEO::EncodeDispatchKernel; template void NEO::EncodeDispatchKernel::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs); template void NEO::EncodeDispatchKernel::adjustTimestampPacket(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void NEO::EncodeDispatchKernel::setupPostSyncForRegularEvent(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); +template void NEO::EncodeDispatchKernel::encodeL3FlushAfterPostSync(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void NEO::EncodeDispatchKernel::setupPostSyncForInOrderExec(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void NEO::EncodeDispatchKernel::setGrfInfo(Family::DefaultWalkerType::InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void NEO::EncodeDispatchKernel::setupPreferredSlmSize(Family::DefaultWalkerType::InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); diff --git a/shared/source/command_container/command_encoder_from_xe_hpg_core_to_xe3_core.inl b/shared/source/command_container/command_encoder_from_xe_hpg_core_to_xe3_core.inl index 9f7b310715..ca6e5237fd 100644 --- a/shared/source/command_container/command_encoder_from_xe_hpg_core_to_xe3_core.inl +++ b/shared/source/command_container/command_encoder_from_xe_hpg_core_to_xe3_core.inl @@ -83,6 +83,10 @@ template template void EncodeDispatchKernel::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} +template +template +void EncodeDispatchKernel::encodeL3FlushAfterPostSync(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} + template template void EncodeDispatchKernel::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {} diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 430cc2f3bd..a39435c365 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -504,6 +504,7 @@ void EncodeDispatchKernel::setupPostSyncForRegularEvent(WalkerType &walk postSync.setImmediateData(immData); postSync.setDestinationAddress(gpuVa); + EncodeDispatchKernel::encodeL3FlushAfterPostSync(walkerCmd, args); EncodeDispatchKernel::setupPostSyncMocs(walkerCmd, args.device->getRootDeviceEnvironment(), args.dcFlushEnable); EncodeDispatchKernel::adjustTimestampPacket(walkerCmd, args); } diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index faebb819b1..0b1eca6b42 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -417,6 +417,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseHighAlignmentForHeapExtended, -1, "-1: defaul DECLARE_DEBUG_VARIABLE(int32_t, DispatchCmdlistCmdBufferPrimary, -1, "-1: default, 0: dispatch command buffers as secondary, 1: dispatch command buffers as primary and chain") DECLARE_DEBUG_VARIABLE(int32_t, UseImmediateFlushTask, -1, "-1: default, 0: use regular flush task, 1: use immediate flush task") DECLARE_DEBUG_VARIABLE(int32_t, SkipDcFlushOnBarrierWithoutEvents, -1, "-1: default (enabled), 0: disabled, 1: enabled") +DECLARE_DEBUG_VARIABLE(int32_t, ForceL3FlushAfterPostSync, -1, "-1: default, 0: disabled, 1: enabled. If enabled flush L3 after post sync operation") DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB") DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB") DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers") diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index 27befec9d6..65d2ee2b5f 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -596,6 +596,10 @@ template template void EncodeDispatchKernel::setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} +template +template +void EncodeDispatchKernel::encodeL3FlushAfterPostSync(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} + template template void EncodeDispatchKernel::setupPostSyncForInOrderExec(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {} diff --git a/shared/source/os_interface/product_helper.h b/shared/source/os_interface/product_helper.h index e86b6f4f20..d3babb3c16 100644 --- a/shared/source/os_interface/product_helper.h +++ b/shared/source/os_interface/product_helper.h @@ -258,6 +258,7 @@ class ProductHelper { virtual uint32_t getNumCacheRegions() const = 0; virtual uint64_t getPatIndex(CacheRegion cacheRegion, CachePolicy cachePolicy) const = 0; virtual bool isSharingWith3dOrMediaAllowed() const = 0; + virtual bool isL3FlushAfterPostSyncRequired(bool heaplessEnabled) const = 0; virtual bool isImageSuitableForCompression() const = 0; virtual ~ProductHelper() = default; diff --git a/shared/source/os_interface/product_helper.inl b/shared/source/os_interface/product_helper.inl index 0f86d87311..e0a08a4866 100644 --- a/shared/source/os_interface/product_helper.inl +++ b/shared/source/os_interface/product_helper.inl @@ -1005,6 +1005,11 @@ bool ProductHelperHw::isEvictionIfNecessaryFlagSupported() const { return true; } +template +bool ProductHelperHw::isL3FlushAfterPostSyncRequired(bool heaplessEnabled) const { + return false; +} + template bool ProductHelperHw::isImageSuitableForCompression() const { if (debugManager.flags.OverrideImageSuitableForRenderCompression.get() != -1) { diff --git a/shared/source/os_interface/product_helper_hw.h b/shared/source/os_interface/product_helper_hw.h index 7a3fee30e8..259d2ef9bf 100644 --- a/shared/source/os_interface/product_helper_hw.h +++ b/shared/source/os_interface/product_helper_hw.h @@ -199,6 +199,7 @@ class ProductHelperHw : public ProductHelper { uint32_t getNumCacheRegions() const override; uint64_t getPatIndex(CacheRegion cacheRegion, CachePolicy cachePolicy) const override; bool isSharingWith3dOrMediaAllowed() const override; + bool isL3FlushAfterPostSyncRequired(bool heaplessEnabled) const override; bool isImageSuitableForCompression() const override; ~ProductHelperHw() override = default; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 3b65bfda60..e63749e4ef 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -604,6 +604,7 @@ ExperimentalEnableHostAllocationCache = -1 OverridePatIndexForUncachedTypes = -1 OverridePatIndexForCachedTypes = -1 FlushTlbBeforeCopy = -1 +ForceL3FlushAfterPostSync = -1 EnableUserFenceUponUnbind = -1 EnableWaitOnUserFenceAfterBindAndUnbind = -1 UseGemCreateExtInAllocateMemoryByKMD = -1 diff --git a/shared/test/unit_test/fixtures/command_container_fixture.h b/shared/test/unit_test/fixtures/command_container_fixture.h index 0fcbfd439c..f18724c4a1 100644 --- a/shared/test/unit_test/fixtures/command_container_fixture.h +++ b/shared/test/unit_test/fixtures/command_container_fixture.h @@ -76,6 +76,7 @@ class CommandEncodeStatesFixture : public DeviceFixture { .interruptEvent = false, .immediateScratchAddressPatching = false, .makeCommandView = false, + .isFlushL3AfterPostSyncEnabled = false, }; return args;