From 2a262b22e107461919fb9704bbcf6c9133c3ca3e Mon Sep 17 00:00:00 2001 From: Dominik Dabek Date: Wed, 26 Apr 2023 09:30:37 +0000 Subject: [PATCH] performance: initialize cpu copy enabled bool once For immediate command lists, initialize copyThroughLockedPtrEnabled once at creation instead of querying the GfxCoreHelper on every memory copy. Related-To: NEO-7796 Signed-off-by: Dominik Dabek --- level_zero/core/source/cmdlist/cmdlist.h | 1 + .../source/cmdlist/cmdlist_hw_immediate.inl | 3 +- .../core/source/cmdlist/cmdlist_imp.cpp | 2 + .../core/test/unit_tests/mocks/mock_cmdlist.h | 2 + .../sources/cmdlist/test_cmdlist_7.cpp | 51 +++++++++++++++---- .../unit_tests/sources/event/test_event.cpp | 1 + 6 files changed, 47 insertions(+), 13 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 2da66ab2fc..40706809d9 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -421,6 +421,7 @@ struct CommandList : _ze_command_list_handle_t { bool dynamicHeapRequired = false; bool kernelWithAssertAppended = false; bool dispatchCmdListBatchBufferAsPrimary = false; + bool copyThroughLockedPtrEnabled = false; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index fc7d06930f..32919bbeed 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -717,8 +717,7 @@ bool CommandListCoreFamilyImmediate::preferCopyThroughLockedPtr(C return true; } - auto &gfxCoreHelper = this->device->getGfxCoreHelper(); - if (!gfxCoreHelper.copyThroughLockedPtrEnabled(this->device->getHwInfo(), this->device->getProductHelper())) { + if (!this->copyThroughLockedPtrEnabled) { return false; } diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index 
55e50c5d66..7203f011ce 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -209,6 +209,8 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device commandList->numThreads = NEO::SysCalls::getNumThreads(); } + commandList->copyThroughLockedPtrEnabled = gfxCoreHelper.copyThroughLockedPtrEnabled(hwInfo, device->getProductHelper()); + return commandList; } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 2b78d6b41f..a3fbef8e15 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -208,6 +208,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp { using BaseClass::commandContainer; using BaseClass::commandListPreemptionMode; using BaseClass::commandsToPatch; + using BaseClass::copyThroughLockedPtrEnabled; using BaseClass::csr; using BaseClass::currentBindingTablePoolBaseAddress; using BaseClass::currentDynamicStateBaseAddress; @@ -560,6 +561,7 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm MockCommandListImmediateHw() : BaseClass() {} using BaseClass::applyMemoryRangesBarrier; using BaseClass::cmdListType; + using BaseClass::copyThroughLockedPtrEnabled; using BaseClass::dcFlushSupport; using BaseClass::dependenciesPresent; using BaseClass::eventWaitlistSyncRequired; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index e433aafe3a..16ef25e50d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -1982,6 +1982,7 @@ using AppendMemoryLockedCopyTest = Test; HWTEST2_F(AppendMemoryLockedCopyTest, 
givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024); auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData); @@ -1993,6 +1994,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForD2HThenReturnTrue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); CpuMemCopyInfo cpuMemCopyInfo(nonUsmHostPtr, devicePtr, 1024); auto srcFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.srcAllocData); @@ -2004,6 +2006,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024); auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData); @@ -2015,6 +2018,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhen HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DWhenCopyCantBePerformedImmediatelyThenReturnFalse, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; 
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024); auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData); @@ -2054,6 +2058,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhen HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024); auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData); @@ -2066,6 +2071,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSM HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMHostAllocThenReturnCorrectValue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); NEO::SvmAllocationData *srcAllocData; NEO::SvmAllocationData *dstAllocData; @@ -2079,6 +2085,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSM HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMSharedAllocThenReturnCorrectValue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); NEO::SvmAllocationData *hostAllocData; NEO::SvmAllocationData *deviceAllocData; @@ -2107,6 +2114,7 @@ using LocalMemoryMultiSubDeviceTest = Test; HWTEST2_F(LocalMemoryMultiSubDeviceTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocWithColouredBufferThenReturnFalse, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + 
cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *devicePtr; @@ -2120,21 +2128,25 @@ HWTEST2_F(LocalMemoryMultiSubDeviceTest, givenImmediateCommandListWhenIsSuitable context->freeMem(devicePtr); } -HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrAndFlagDisabledWhenPreferCopyThroughLockedPtrCalledThenReturnFalse, IsAtLeastSkl) { - DebugManager.flags.ExperimentalCopyThroughLock.set(0); - MockCommandListImmediateHw cmdList; - cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); - CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024); - auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData); - ASSERT_FALSE(srcFound); - auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData); - ASSERT_TRUE(dstFound); - EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr)); +HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCreatingThenCopyThroughLockedPtrEnabledIsSetCorrectly, IsAtLeastSkl) { + const ze_command_queue_desc_t desc = {}; + ze_result_t returnValue; + std::unique_ptr commandList0(CommandList::createImmediate(productFamily, + device, + &desc, + false, + NEO::EngineGroupType::RenderCompute, + returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + ASSERT_NE(nullptr, commandList0); + auto whiteBoxCmdList = static_cast(commandList0.get()); + EXPECT_EQ(whiteBoxCmdList->copyThroughLockedPtrEnabled, device->getGfxCoreHelper().copyThroughLockedPtrEnabled(device->getHwInfo(), device->getProductHelper())); } HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtrViaEnvVariableWhenPreferCopyThroughLockPointerCalledThenTrueIsReturned, IsAtLeastSkl) { DebugManager.flags.ExperimentalForceCopyThroughLock.set(1); MockCommandListImmediateHw cmdList; + 
cmdList.copyThroughLockedPtrEnabled = false; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024); auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData); @@ -2146,6 +2158,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtr HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTypeThenReturnCorrectValue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *hostPtr2; @@ -2197,6 +2210,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTy HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferThresholdThenReturnCorrectValue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); EXPECT_EQ(0u, cmdList.getTransferThreshold(TRANSFER_TYPE_UNKNOWN)); @@ -2223,6 +2237,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTh HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndThresholdDebugFlagSetWhenGetTransferThresholdThenReturnCorrectValue, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); EXPECT_EQ(4 * MemoryConstants::megaByte, cmdList.getTransferThreshold(HOST_NON_USM_TO_DEVICE_USM)); EXPECT_EQ(1 * MemoryConstants::kiloByte, cmdList.getTransferThreshold(DEVICE_USM_TO_HOST_NON_USM)); @@ -2236,6 +2251,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndThresholdDebug HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenLockPtr, IsAtLeastSkl) { MockCommandListImmediateHw 
cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2251,6 +2267,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyD2HThenLockPtr, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2269,6 +2286,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenForceModeWhenCopyIsCalledThenBothAllo DebugManager.flags.ExperimentalForceCopyThroughLock.set(1); MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = false; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2297,6 +2315,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenForceModeWhenCopyIsCalledFromHostUsmT DebugManager.flags.ExperimentalForceCopyThroughLock.set(1); MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = false; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2322,6 +2341,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenForceModeWhenCopyIsCalledFromHostUsmT HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DAndDstPtrLockedThenDontLockAgain, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2338,6 +2358,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, 
givenImmediateCommandListAndNonUsmHostPtrW HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenUseMemcpyAndReturnSuccess, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2356,6 +2377,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndNonUsmHostPtrWhenCopyH2DThenSignalEvent, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2379,6 +2401,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAnd HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndCpuMemcpyWhenGpuHangThenDontSynchronizeEvent, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; reinterpret_cast *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false; @@ -2405,6 +2428,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAnd HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithoutBarrierThenDontWaitForTagUpdate, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2417,6 +2441,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, 
givenImmediateCommandListWhenCpuMemcpyWith HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithBarrierThenWaitForTagUpdate, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2430,6 +2455,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetDependenciesPresent, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2447,6 +2473,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrier HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendWaitOnEventsThenSetDependenciesPresent, IsAtLeastSkl) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; @@ -2474,7 +2501,9 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendWaitOnE template class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw { public: - MockAppendMemoryLockedCopyTestImmediateCmdList() : MockCommandListImmediateHw() {} + MockAppendMemoryLockedCopyTestImmediateCmdList() : MockCommandListImmediateHw() { + this->copyThroughLockedPtrEnabled = true; + } ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, uint64_t dstOffset, void *srcPtr, NEO::GraphicsAllocation *srcPtrAlloc, diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp 
b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index a6351e95b7..13eafe9a05 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -3494,6 +3494,7 @@ struct LocalMemoryEnabledDeviceFixture : public DeviceFixture { using EventTimestampTest = Test; HWTEST2_F(EventTimestampTest, givenAppendMemoryCopyIsCalledWhenCpuCopyIsUsedAndCopyTimeIsLessThanDeviceTimestampResolutionThenReturnTimstampDifferenceAsOne, IsXeHpcCore) { MockCommandListImmediateHw cmdList; + cmdList.copyThroughLockedPtrEnabled = true; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver; neoDevice->setOSTime(new NEO::MockOSTimeWithConstTimestamp());