From cfd96980a0cc4258abbb207244020190cd26132c Mon Sep 17 00:00:00 2001 From: Compute-Runtime-Validation Date: Wed, 5 Oct 2022 07:25:38 +0200 Subject: [PATCH] Revert "[L0][XE_HPC]Perform memcpy on CPU by default" This reverts commit 383f33b4827f0febb5be2100c0f5446208725605. Signed-off-by: Compute-Runtime-Validation --- level_zero/core/source/cmdlist/cmdlist.h | 3 +++ level_zero/core/source/cmdlist/cmdlist_hw.inl | 1 + .../core/source/cmdlist/cmdlist_hw_immediate.h | 3 --- .../source/cmdlist/cmdlist_hw_immediate.inl | 9 ++++----- .../core/test/unit_tests/mocks/mock_cmdlist.h | 1 - .../xe_hpc_core/test_cmdlist_xe_hpc_core.cpp | 18 +----------------- .../debug_settings/debug_variables_base.inl | 2 +- shared/source/helpers/hw_helper_base.inl | 3 --- .../xe_hpc_core/hw_helper_xe_hpc_core.cpp | 5 +---- shared/test/common/test_files/igdrcl.config | 2 +- .../unit_test/helpers/test_hw_info_config.cpp | 12 +----------- .../hw_helper_xe_hpc_core_tests.cpp | 9 ++++++++- 12 files changed, 21 insertions(+), 47 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index cabc7f94fb..1785825bda 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -329,6 +329,9 @@ struct CommandList : _ze_command_list_handle_t { bool systolicModeSupport = false; bool pipelineSelectStateTracking = false; bool stateComputeModeTracking = false; + + std::atomic barrierCounter{0u}; + uint32_t latestFlushedBarrierCounter = 0u; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index cfdeee1bab..8bd4432d61 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2517,6 +2517,7 @@ ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_ } appendSignalEventPostWalker(signalEvent, workloadPartition); + this->barrierCounter++; return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index a2a3c7b47b..3d47fd2d2c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -132,9 +132,6 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily barrierCalled{false}; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index de8b82eca2..7e46f1ac04 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -246,8 +246,6 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier( checkAvailableSpace(); } ret = CommandListCoreFamily::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents); - - this->barrierCalled = true; return flushImmediate(ret, true, hSignalEvent); } @@ -546,7 +544,8 @@ ze_result_t CommandListCoreFamilyImmediate::performCpuMemcpy(void this->appendBarrier(nullptr, numWaitEvents, phWaitEvents); } - if (this->barrierCalled) { + bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter; + if (needsFlushTagUpdate) { this->csr->flushTagUpdate(); } @@ -565,13 +564,13 @@ ze_result_t CommandListCoreFamilyImmediate::performCpuMemcpy(void cpuMemcpyDstPtr = dstptr; } - if (this->barrierCalled) { + if (needsFlushTagUpdate) { auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount()); if (waitStatus == NEO::WaitStatus::GpuHang) { return ZE_RESULT_ERROR_DEVICE_LOST; } - this->barrierCalled = false; + this->latestFlushedBarrierCounter = this->barrierCounter; } if (signalEvent) { diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index a4f7688640..1a1c9972ce 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -484,7 +484,6 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm using BaseClass = WhiteBox<::L0::CommandListCoreFamilyImmediate>; MockCommandListImmediateHw() : BaseClass() {} using BaseClass::applyMemoryRangesBarrier; - using BaseClass::barrierCalled; using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isSyncModeQueue; diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp index 6f2b2d6700..4351910a95 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp @@ -886,6 +886,7 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore, struct AppendMemoryLockedCopyFixture : public DeviceFixture { void setUp() { + DebugManager.flags.ExperimentalCopyThroughLock.set(1); DeviceFixture::setUp(); nonUsmHostPtr = new char[sz]; @@ -1075,23 +1076,6 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith EXPECT_EQ(waitForFlushTagUpdateCalled, 1u); } -HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetBarrierCalled, IsXeHpcCore) { - MockCommandListImmediateHw cmdList; - cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; - - EXPECT_FALSE(cmdList.barrierCalled); - - cmdList.appendBarrier(nullptr, 0, nullptr); - - EXPECT_TRUE(cmdList.barrierCalled); - - auto res = cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr); - EXPECT_EQ(res, ZE_RESULT_SUCCESS); - - EXPECT_FALSE(cmdList.barrierCalled); -} - template class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw { public: diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index d420b54b41..132b419cf6 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -428,10 +428,10 @@ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableDeviceAllocationCache, -1, "Experimentally enable allocation cache.") DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalH2DCpuCopyThreshold, -1, "Override default treshold (in bytes) for H2D CPU copy.") DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalD2HCpuCopyThreshold, -1, "Override default treshold (in bytes) for D2H CPU copy.") -DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalCopyThroughLock, -1, "Experimentally copy memory through locked ptr. -1: default 0: disable 1: enable ") DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableSourceLevelDebugger, false, "Experimentally enable source level debugger.") DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableL0DebuggerForOpenCL, false, "Experimentally enable debugging OCL with L0 Debug API.") DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableTileAttach, true, "Experimentally enable attaching to tiles (subdevices).") +DECLARE_DEBUG_VARIABLE(bool, ExperimentalCopyThroughLock, false, "Experimentally copy memory through locked ptr.") /*DRIVER TOGGLES*/ DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger") diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index c704a929ca..84acc42e32 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -720,9 +720,6 @@ bool HwHelperHw::isPatIndexFallbackWaRequired() const { template bool HwHelperHw::copyThroughLockedPtrEnabled() const { - if (DebugManager.flags.ExperimentalCopyThroughLock.get() != -1) { - return DebugManager.flags.ExperimentalCopyThroughLock.get() == 1; - } return false; } diff --git a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp index 5d82593dee..fefb80d436 100644 --- a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp @@ -444,10 +444,7 @@ bool HwHelperHw::isPatIndexFallbackWaRequired() const { template <> bool HwHelperHw::copyThroughLockedPtrEnabled() const { - if (DebugManager.flags.ExperimentalCopyThroughLock.get() != -1) { - return DebugManager.flags.ExperimentalCopyThroughLock.get() == 1; - } - return true; + return DebugManager.flags.ExperimentalCopyThroughLock.get(); } } // namespace NEO diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 695005c88f..a08b80a180 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -460,7 +460,7 @@ DirectSubmissionDisablePrefetcher = -1 ForceDefaultGrfCompilationMode = 0 ForceLargeGrfCompilationMode = 0 ForceStatelessMocsEncryptionBit = -1 -ExperimentalCopyThroughLock = -1 +ExperimentalCopyThroughLock = 0 ExperimentalH2DCpuCopyThreshold = -1 ExperimentalD2HCpuCopyThreshold = -1 CopyHostPtrOnCpu = -1 diff --git a/shared/test/unit_test/helpers/test_hw_info_config.cpp b/shared/test/unit_test/helpers/test_hw_info_config.cpp index 4dea5e4be9..bd0112b62f 100644 --- a/shared/test/unit_test/helpers/test_hw_info_config.cpp +++ b/shared/test/unit_test/helpers/test_hw_info_config.cpp @@ -198,17 +198,7 @@ HWTEST2_F(HwInfoConfigTest, givenHwInfoConfigWhenIsPlatformQueryNotSupportedThen EXPECT_FALSE(hwInfoConfig.isPlatformQuerySupported()); } -HWTEST2_F(HwInfoConfigTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnFalse, IsNotXeHpcCore) { +HWTEST_F(HwInfoConfigTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnFalse) { HwHelper &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily); EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled()); } - -HWTEST_F(HwInfoConfigTest, givenHwHelperWhenFlagSetAndCallCopyThroughLockedPtrEnabledThenReturnCorrectValue) { - DebugManagerStateRestore restorer; - HwHelper &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily); - DebugManager.flags.ExperimentalCopyThroughLock.set(0); - EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled()); - - DebugManager.flags.ExperimentalCopyThroughLock.set(1); - EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled()); -} \ No newline at end of file diff --git a/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp b/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp index 594916954a..7e0cf23b68 100644 --- a/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp +++ b/shared/test/unit_test/xe_hpc_core/hw_helper_xe_hpc_core_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/helpers/hw_helper.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/helpers/hw_helper_tests.h" #include "shared/test/common/test_macros/header/per_product_test_definitions.h" @@ -74,7 +75,13 @@ XE_HPC_CORETEST_F(HwHelperXeHpcCoreTest, givenXeHPCPlatformWhenCheckAssignEngine EXPECT_EQ(hwHelper.isAssignEngineRoundRobinSupported(hwInfo), HwInfoConfig::get(hwInfo.platform.eProductFamily)->isAssignEngineRoundRobinSupported()); } -XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabledThenReturnTrue) { +XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWithFlagSetWhenCallCopyThroughLockedPtrEnabledThenReturnFalse) { + DebugManagerStateRestore restore; auto &hwHelper = HwHelperHw::get(); + + DebugManager.flags.ExperimentalCopyThroughLock.set(false); + EXPECT_FALSE(hwHelper.copyThroughLockedPtrEnabled()); + + DebugManager.flags.ExperimentalCopyThroughLock.set(true); EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled()); }