diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xe2_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xe2_and_later.cpp index 83aa8bd2bf..0d0bb6f488 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xe2_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xe2_and_later.cpp @@ -15,6 +15,7 @@ #include "level_zero/core/source/cmdlist/cmdlist_hw.h" #include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h" +#include "level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_event.h" @@ -383,5 +384,32 @@ HWTEST2_F(CommandListXe2AndLaterPreemptionTest, auto result = commandListCore->obtainKernelPreemptionMode(kernel.get()); EXPECT_EQ(NEO::PreemptionMode::MidThread, result); } + +using InOrderCmdListTests = InOrderCmdListFixture; + +HWTEST2_F(InOrderCmdListTests, givenDebugFlagWhenPostSyncWithInOrderExecInfoIsCreateThenL1IsNotFlushed, Platforms) { + DebugManagerStateRestore restorer; + NEO::debugManager.flags.ForcePostSyncL1Flush.set(0); + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + + auto immCmdList = createImmCmdList(); + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); + + auto walkerItor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), walkerItor); + + auto walkerCmd = genCmdCast(*walkerItor); + auto &postSync = walkerCmd->getPostSync(); + + EXPECT_FALSE(postSync.getDataportPipelineFlush()); + EXPECT_FALSE(postSync.getDataportSubsliceCacheFlush()); +} + } // namespace ult } // namespace L0 diff --git a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl index fa5b0f6b98..199d021831 100644 --- a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl +++ b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -108,6 +108,11 @@ void GpgpuWalkerHelper::setupTimestampPacket(LinearStream *cmdStream, postSyncData.setDataportPipelineFlush(true); postSyncData.setDataportSubsliceCacheFlush(true); + if (NEO::debugManager.flags.ForcePostSyncL1Flush.get() != -1) { + postSyncData.setDataportPipelineFlush(!!NEO::debugManager.flags.ForcePostSyncL1Flush.get()); + postSyncData.setDataportSubsliceCacheFlush(!!NEO::debugManager.flags.ForcePostSyncL1Flush.get()); + } + EncodeDispatchKernel::template setupPostSyncMocs(*walkerCmd, rootDeviceEnvironment, MemorySynchronizationCommands::getDcFlushEnable(true, rootDeviceEnvironment)); diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_dg2_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_dg2_and_later.cpp index f4e0bf7193..974e712f72 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_dg2_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_dg2_and_later.cpp @@ -145,6 +145,42 @@ HWTEST2_F(Dg2AndLaterDispatchWalkerBasicTest, givenTimestampPacketWhenDispatchin EXPECT_EQ(contextStartAddress, secondWalker->getPostSync().getDestinationAddress()); } +HWTEST2_F(Dg2AndLaterDispatchWalkerBasicTest, givenDebugFlagToDisableL1FlushInPostSyncWhenKernelIsProgrammedThenL1FlushIsNotEnabled, matcherDG2AndLater) { + DebugManagerStateRestore restore; + NEO::debugManager.flags.ForcePostSyncL1Flush.set(0); + + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + + MockKernelWithInternals kernel1(*device); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + + TimestampPacketContainer timestampPacketContainer; + timestampPacketContainer.add(device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + timestampPacketContainer.add(device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + + MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel1.mockKernel})); + + MockCommandQueue cmdQ(context.get(), device.get(), nullptr, false); + auto &cmdStream = cmdQ.getCS(0); + + HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); + walkerArgs.currentTimestampPacketNodes = ×tampPacketContainer; + HardwareInterface::template dispatchWalker( + cmdQ, + multiDispatchInfo, + CsrDependencies(), + walkerArgs); + + HardwareParse hwParser; + hwParser.parseCommands(cmdStream, 0); + hwParser.findHardwareCommands(); + auto walker = genCmdCast(*hwParser.itorWalker); + + EXPECT_FALSE(walker->getPostSync().getDataportPipelineFlush()); + EXPECT_FALSE(walker->getPostSync().getDataportSubsliceCacheFlush()); +} + HWTEST2_F(Dg2AndLaterDispatchWalkerBasicTest, givenDebugVariableEnabledWhenEnqueueingThenWriteWalkerStamp, matcherDG2AndLater) { using DefaultWalkerType = typename FamilyType::DefaultWalkerType; DebugManagerStateRestore restore; diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 377e22e804..a466cbc8cf 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -438,6 +438,11 @@ void EncodeDispatchKernel::setupPostSyncForRegularEvent(WalkerType &walk postSync.setDataportPipelineFlush(true); postSync.setDataportSubsliceCacheFlush(true); + if (NEO::debugManager.flags.ForcePostSyncL1Flush.get() != -1) { + postSync.setDataportPipelineFlush(!!NEO::debugManager.flags.ForcePostSyncL1Flush.get()); + postSync.setDataportSubsliceCacheFlush(!!NEO::debugManager.flags.ForcePostSyncL1Flush.get()); + } + auto operationType = POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA; uint64_t gpuVa = args.eventAddress; uint64_t immData = args.postSyncImmValue; @@ -468,6 +473,10 @@ void EncodeDispatchKernel::setupPostSyncForInOrderExec(WalkerType &walke postSync.setDataportPipelineFlush(true); postSync.setDataportSubsliceCacheFlush(true); + if (NEO::debugManager.flags.ForcePostSyncL1Flush.get() != -1) { + postSync.setDataportPipelineFlush(!!NEO::debugManager.flags.ForcePostSyncL1Flush.get()); + postSync.setDataportSubsliceCacheFlush(!!NEO::debugManager.flags.ForcePostSyncL1Flush.get()); + } uint64_t gpuVa = args.inOrderExecInfo->getBaseDeviceAddress() + args.inOrderExecInfo->getAllocationOffset(); diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index f889dc8498..9149dd04d0 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -393,6 +393,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (e DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers") DECLARE_DEBUG_VARIABLE(int32_t, EnableCopyWithStagingBuffers, -1, "Enable copy with non-usm memory through staging buffers. -1: default, 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, StagingBufferSize, -1, "Size of single staging buffer. -1: default (2MB), >0: size in KB") +DECLARE_DEBUG_VARIABLE(int32_t, ForcePostSyncL1Flush, -1, "-1: default (do nothing), 0: L1 flush disabled in post sync, 1: L1 flush enabled in post sync") /*DIRECT SUBMISSION FLAGS*/ DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD") diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 70888f0228..56b70900fd 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -616,4 +616,5 @@ EnableCopyWithStagingBuffers = -1 StagingBufferSize = -1 OverrideNumHighPriorityContexts = -1 ForceScratchAndMTPBufferSizeMode = -1 +ForcePostSyncL1Flush = -1 # Please don't edit below this line diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_dg2_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_dg2_and_later.cpp index 201ea0ae3f..c2a9fcc2b4 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_dg2_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_dg2_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -49,6 +49,33 @@ HWTEST2_F(CommandEncodeStatesTestDg2AndLater, givenEventAddressWhenEncodeAndPVCA EXPECT_EQ(true, cmd->getPostSync().getDataportSubsliceCacheFlush()); } +HWTEST2_F(CommandEncodeStatesTestDg2AndLater, givenDebugVariableToForceL1FlushWhenWalkerIsProgramedThenCacheFlushIsDisabled, IsAtLeastXeHpgCore) { + DebugManagerStateRestore restore; + NEO::debugManager.flags.ForcePostSyncL1Flush.set(0); + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + uint32_t dims[] = {2, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + uint64_t eventAddress = MemoryConstants::cacheLineSize * 123; + + bool requiresUncachedMocs = false; + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs); + dispatchArgs.eventAddress = eventAddress; + dispatchArgs.isTimestampEvent = true; + + EncodeDispatchKernel::template encode(*cmdContainer.get(), dispatchArgs); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + auto itor = find(commands.begin(), commands.end()); + ASSERT_NE(itor, commands.end()); + auto cmd = genCmdCast(*itor); + EXPECT_EQ(false, cmd->getPostSync().getDataportPipelineFlush()); + EXPECT_EQ(false, cmd->getPostSync().getDataportSubsliceCacheFlush()); +} + HWTEST2_F(CommandEncodeStatesTestDg2AndLater, givenEventAddressWhenEncodeThenMocsIndex2IsSet, IsXeHpgCore) { using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; using DefaultWalkerType = typename FamilyType::DefaultWalkerType; diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp index 240e0ddad7..493781d49a 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp @@ -1,17 +1,19 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "shared/source/command_stream/stream_properties.h" +#include "shared/source/helpers/in_order_cmd_helpers.h" #include "shared/source/kernel/grf_config.h" #include "shared/source/os_interface/product_helper.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_execution_environment.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/test_macros/hw_test.h" #include "shared/test/common/test_macros/test.h" #include "shared/test/unit_test/fixtures/command_container_fixture.h" @@ -97,3 +99,28 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTestPvcAndLater, givenCommandCon auto cmd = genCmdCast(*itorCmd); EXPECT_EQ(productHelper.isGrfNumReportedWithScm(), cmd->getLargeGrfMode()); } + +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDebugVariableWhenPostSyncIsProgrammedThenL1IsNotFlushed, IsAtLeastXeHpcCore) { + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + DebugManagerStateRestore restorer; + NEO::debugManager.flags.ForcePostSyncL1Flush.set(0); + uint32_t dims[] = {2, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, false); + + DefaultWalkerType walkerCmd = FamilyType::template getInitGpuWalker(); + + MockTagAllocator> deviceTagAllocator(0, pDevice->getMemoryManager()); + + auto inOrderExecInfo = InOrderExecInfo::create(deviceTagAllocator.getTag(), nullptr, *pDevice, 1, false); + + dispatchArgs.inOrderExecInfo = inOrderExecInfo.get(); + + EncodeDispatchKernel::template setupPostSyncForInOrderExec(walkerCmd, dispatchArgs); + + auto &postSyncData = walkerCmd.getPostSync(); + EXPECT_FALSE(postSyncData.getDataportPipelineFlush()); + EXPECT_FALSE(postSyncData.getDataportSubsliceCacheFlush()); +} \ No newline at end of file