diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index e0cf728fd8..65f2bd4879 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -325,6 +325,9 @@ struct CommandList : _ze_command_list_handle_t { bool systolicModeSupport = false; bool pipelineSelectStateTracking = false; bool stateComputeModeTracking = false; + + std::atomic barrierCounter{0u}; + uint32_t latestFlushedBarrierCounter = 0u; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 2c95591f53..a22ee08446 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2495,6 +2495,7 @@ ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_ } appendSignalEventPostWalker(signalEvent, workloadPartition); + this->barrierCounter++; return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 3d8adebe89..92415bcfbb 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -138,9 +138,6 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily barrierCalled{false}; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index c33bcc9441..bdfe65e9c9 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -210,7 +210,6 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier( checkAvailableSpace(); } ret = CommandListCoreFamily::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents); - this->barrierCalled = true; return flushImmediate(ret, true); } @@ -514,7 +513,8 @@ ze_result_t CommandListCoreFamilyImmediate::performCpuMemcpy(void this->appendBarrier(nullptr, numWaitEvents, phWaitEvents); } - if (this->barrierCalled) { + bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter; + if (needsFlushTagUpdate) { this->csr->flushTagUpdate(); } @@ -533,13 +533,13 @@ ze_result_t CommandListCoreFamilyImmediate::performCpuMemcpy(void cpuMemcpyDstPtr = dstptr; } - if (this->barrierCalled) { + if (needsFlushTagUpdate) { auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount()); if (waitStatus == NEO::WaitStatus::GpuHang) { return ZE_RESULT_ERROR_DEVICE_LOST; } - this->barrierCalled = false; + this->latestFlushedBarrierCounter = this->barrierCounter; } if (signalEvent) { diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 1ed01f282f..778056e1c4 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -478,7 +478,6 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm using BaseClass = WhiteBox<::L0::CommandListCoreFamilyImmediate>; MockCommandListImmediateHw() : BaseClass() {} using BaseClass::applyMemoryRangesBarrier; - using BaseClass::barrierCalled; using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isSyncModeQueue; diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp index 6f2b2d6700..4351910a95 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp @@ -886,6 +886,7 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, struct AppendMemoryLockedCopyFixture : public DeviceFixture { void setUp() { + DebugManager.flags.ExperimentalCopyThroughLock.set(1); DeviceFixture::setUp(); nonUsmHostPtr = new char[sz]; @@ -1075,23 +1076,6 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith EXPECT_EQ(waitForFlushTagUpdateCalled, 1u); } -HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetBarrierCalled, IsXeHpcCore) { - MockCommandListImmediateHw cmdList; - cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; - - EXPECT_FALSE(cmdList.barrierCalled); - - cmdList.appendBarrier(nullptr, 0, nullptr); - - EXPECT_TRUE(cmdList.barrierCalled); - - auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr); - EXPECT_EQ(res, ZE_RESULT_SUCCESS); - - EXPECT_FALSE(cmdList.barrierCalled); -} - template class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw { public: diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 61aaa6ea48..10ca92b10f 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -428,10 +428,10 @@ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableDeviceAllocationCache, -1, "Experimentally enable allocation cache.") DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalH2DCpuCopyThreshold, -1, "Override default treshold (in bytes) for H2D CPU copy.") DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalD2HCpuCopyThreshold, -1, "Override default treshold (in bytes) for D2H CPU copy.") -DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalCopyThroughLock, -1, "Experimentally copy memory through locked ptr. -1: default 0: disable 1: enable ") DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableSourceLevelDebugger, false, "Experimentally enable source level debugger.") DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableL0DebuggerForOpenCL, false, "Experimentally enable debugging OCL with L0 Debug API.") DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableTileAttach, true, "Experimentally enable attaching to tiles (subdevices).") +DECLARE_DEBUG_VARIABLE(bool, ExperimentalCopyThroughLock, false, "Experimentally copy memory through locked ptr.") /*DRIVER TOGGLES*/ DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger") diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index 396d01a3f5..6939bbd070 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -723,9 +723,6 @@ bool HwHelperHw::isPatIndexFallbackWaRequired() const { template bool HwHelperHw::copyThroughLockedPtrEnabled() const { - if (DebugManager.flags.ExperimentalCopyThroughLock.get() != -1) { - return DebugManager.flags.ExperimentalCopyThroughLock.get() == 1; - } return false; } diff --git a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp index 5d82593dee..fefb80d436 100644 --- a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp @@ -444,10 +444,7 @@ bool HwHelperHw::isPatIndexFallbackWaRequired() const { template <> bool HwHelperHw::copyThroughLockedPtrEnabled() const { - if (DebugManager.flags.ExperimentalCopyThroughLock.get() != -1) { - return DebugManager.flags.ExperimentalCopyThroughLock.get() == 1; - } - return true; + return DebugManager.flags.ExperimentalCopyThroughLock.get(); } } // namespace NEO diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index c5f2ec1918..f93591506a 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -461,7 +461,7 @@ DirectSubmissionDisablePrefetcher = -1 ForceDefaultGrfCompilationMode = 0 ForceLargeGrfCompilationMode = 0 ForceStatelessMocsEncryptionBit = -1 -ExperimentalCopyThroughLock = -1 +ExperimentalCopyThroughLock = 0 ExperimentalH2DCpuCopyThreshold = -1 ExperimentalD2HCpuCopyThreshold = -1 CopyHostPtrOnCpu = -1 diff --git a/shared/test/unit_test/helpers/test_hw_info_config.cpp b/shared/test/unit_test/helpers/test_hw_info_config.cpp index 4dea5e4be9..bd0112b62f 100644 --- a/shared/test/unit_test/helpers/test_hw_info_config.cpp +++ b/shared/test/unit_test/helpers/test_hw_info_config.cpp @@ -198,17 +198,7 @@ HWTEST2_F(HwInfoConfigTest, givenHwInfoConfigWhenIsPlatformQueryNotSupportedThen EXPECT_FALSE(hwInfoConfig.isPlatformQuerySupported()); } -HWTEST2_F(HwInfoConfigTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnFalse, IsNotXeHpcCore) { +HWTEST_F(HwInfoConfigTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnFalse) { HwHelper &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily); EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled()); } - -HWTEST_F(HwInfoConfigTest, givenHwHelperWhenFlagSetAndCallCopyThroughLockedPtrEnabledThenReturnCorrectValue) { - DebugManagerStateRestore restorer; - HwHelper &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily); - DebugManager.flags.ExperimentalCopyThroughLock.set(0); - EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled()); - - DebugManager.flags.ExperimentalCopyThroughLock.set(1); - EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled()); -} \ No newline at end of file diff --git a/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp b/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp index 594916954a..7e0cf23b68 100644 --- a/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp +++ b/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/helpers/hw_helper.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/helpers/hw_helper_tests.h" #include "shared/test/common/test_macros/header/per_product_test_definitions.h" @@ -74,7 +75,13 @@ XE_HPC_CORETEST_F(HwHelperXeHpcCoreTest, givenXeHPCPlatformWhenCheckAssignEngine EXPECT_EQ(hwHelper.isAssignEngineRoundRobinSupported(hwInfo), HwInfoConfig::get(hwInfo.platform.eProductFamily)->isAssignEngineRoundRobinSupported()); } -XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnTrue) { +XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWithFlagSetWhenCallCopyThroughLockedPtrEnabledThenReturnFalse) { + DebugManagerStateRestore restore; auto &hwHelper = HwHelperHw::get(); + + DebugManager.flags.ExperimentalCopyThroughLock.set(false); + EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled()); + + DebugManager.flags.ExperimentalCopyThroughLock.set(true); EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled()); }