performance: initialize cpu copy enabled bool once

In immediate cmdlist, initialize copyThroughLockedPtrEnabled at creation
once, instead of querying helper each mem copy.

Related-To: NEO-7796

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek 2023-04-26 09:30:37 +00:00 committed by Compute-Runtime-Automation
parent bbf23df6eb
commit 2a262b22e1
6 changed files with 47 additions and 13 deletions

View File

@ -421,6 +421,7 @@ struct CommandList : _ze_command_list_handle_t {
bool dynamicHeapRequired = false;
bool kernelWithAssertAppended = false;
bool dispatchCmdListBatchBufferAsPrimary = false;
bool copyThroughLockedPtrEnabled = false;
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

View File

@ -717,8 +717,7 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(C
return true;
}
auto &gfxCoreHelper = this->device->getGfxCoreHelper();
if (!gfxCoreHelper.copyThroughLockedPtrEnabled(this->device->getHwInfo(), this->device->getProductHelper())) {
if (!this->copyThroughLockedPtrEnabled) {
return false;
}

View File

@ -209,6 +209,8 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
commandList->numThreads = NEO::SysCalls::getNumThreads();
}
commandList->copyThroughLockedPtrEnabled = gfxCoreHelper.copyThroughLockedPtrEnabled(hwInfo, device->getProductHelper());
return commandList;
}

View File

@ -208,6 +208,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp {
using BaseClass::commandContainer;
using BaseClass::commandListPreemptionMode;
using BaseClass::commandsToPatch;
using BaseClass::copyThroughLockedPtrEnabled;
using BaseClass::csr;
using BaseClass::currentBindingTablePoolBaseAddress;
using BaseClass::currentDynamicStateBaseAddress;
@ -560,6 +561,7 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm
MockCommandListImmediateHw() : BaseClass() {}
using BaseClass::applyMemoryRangesBarrier;
using BaseClass::cmdListType;
using BaseClass::copyThroughLockedPtrEnabled;
using BaseClass::dcFlushSupport;
using BaseClass::dependenciesPresent;
using BaseClass::eventWaitlistSyncRequired;

View File

@ -1982,6 +1982,7 @@ using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
@ -1993,6 +1994,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForD2HThenReturnTrue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(nonUsmHostPtr, devicePtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.srcAllocData);
@ -2004,6 +2006,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
@ -2015,6 +2018,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhen
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DWhenCopyCantBePerformedImmediatelyThenReturnFalse, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
@ -2054,6 +2058,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhen
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
@ -2066,6 +2071,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSM
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMHostAllocThenReturnCorrectValue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
NEO::SvmAllocationData *srcAllocData;
NEO::SvmAllocationData *dstAllocData;
@ -2079,6 +2085,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSM
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMSharedAllocThenReturnCorrectValue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
NEO::SvmAllocationData *hostAllocData;
NEO::SvmAllocationData *deviceAllocData;
@ -2107,6 +2114,7 @@ using LocalMemoryMultiSubDeviceTest = Test<LocalMemoryMultiSubDeviceFixture>;
HWTEST2_F(LocalMemoryMultiSubDeviceTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocWithColouredBufferThenReturnFalse, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *devicePtr;
@ -2120,21 +2128,25 @@ HWTEST2_F(LocalMemoryMultiSubDeviceTest, givenImmediateCommandListWhenIsSuitable
context->freeMem(devicePtr);
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrAndFlagDisabledWhenPreferCopyThroughLockedPtrCalledThenReturnFalse, IsAtLeastSkl) {
DebugManager.flags.ExperimentalCopyThroughLock.set(0);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
ASSERT_FALSE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_TRUE(dstFound);
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCreatingThenCopyThroughLockedPtrEnabledIsSetCorrectly, IsAtLeastSkl) {
const ze_command_queue_desc_t desc = {};
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
device,
&desc,
false,
NEO::EngineGroupType::RenderCompute,
returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, commandList0);
auto whiteBoxCmdList = static_cast<CommandList *>(commandList0.get());
EXPECT_EQ(whiteBoxCmdList->copyThroughLockedPtrEnabled, device->getGfxCoreHelper().copyThroughLockedPtrEnabled(device->getHwInfo(), device->getProductHelper()));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtrViaEnvVariableWhenPreferCopyThroughLockPointerCalledThenTrueIsReturned, IsAtLeastSkl) {
DebugManager.flags.ExperimentalForceCopyThroughLock.set(1);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = false;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
@ -2146,6 +2158,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtr
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTypeThenReturnCorrectValue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *hostPtr2;
@ -2197,6 +2210,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTy
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferThresholdThenReturnCorrectValue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
EXPECT_EQ(0u, cmdList.getTransferThreshold(TRANSFER_TYPE_UNKNOWN));
@ -2223,6 +2237,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTh
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndThresholdDebugFlagSetWhenGetTransferThresholdThenReturnCorrectValue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
EXPECT_EQ(4 * MemoryConstants::megaByte, cmdList.getTransferThreshold(HOST_NON_USM_TO_DEVICE_USM));
EXPECT_EQ(1 * MemoryConstants::kiloByte, cmdList.getTransferThreshold(DEVICE_USM_TO_HOST_NON_USM));
@ -2236,6 +2251,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndThresholdDebug
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenLockPtr, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2251,6 +2267,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyD2HThenLockPtr, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2269,6 +2286,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenForceModeWhenCopyIsCalledThenBothAllo
DebugManager.flags.ExperimentalForceCopyThroughLock.set(1);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = false;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2297,6 +2315,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenForceModeWhenCopyIsCalledFromHostUsmT
DebugManager.flags.ExperimentalForceCopyThroughLock.set(1);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = false;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2322,6 +2341,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenForceModeWhenCopyIsCalledFromHostUsmT
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DAndDstPtrLockedThenDontLockAgain, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2338,6 +2358,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenCopyH2DThenUseMemcpyAndReturnSuccess, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2356,6 +2377,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndNonUsmHostPtrWhenCopyH2DThenSignalEvent, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2379,6 +2401,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAnd
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndCpuMemcpyWhenGpuHangThenDontSynchronizeEvent, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false;
@ -2405,6 +2428,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAnd
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithoutBarrierThenDontWaitForTagUpdate, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2417,6 +2441,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithBarrierThenWaitForTagUpdate, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2430,6 +2455,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetDependenciesPresent, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2447,6 +2473,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrier
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendWaitOnEventsThenSetDependenciesPresent, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@ -2474,7 +2501,9 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendWaitOnE
template <GFXCORE_FAMILY gfxCoreFamily>
class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw<gfxCoreFamily> {
public:
MockAppendMemoryLockedCopyTestImmediateCmdList() : MockCommandListImmediateHw<gfxCoreFamily>() {}
MockAppendMemoryLockedCopyTestImmediateCmdList() : MockCommandListImmediateHw<gfxCoreFamily>() {
this->copyThroughLockedPtrEnabled = true;
}
ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
uint64_t dstOffset, void *srcPtr,
NEO::GraphicsAllocation *srcPtrAlloc,

View File

@ -3494,6 +3494,7 @@ struct LocalMemoryEnabledDeviceFixture : public DeviceFixture {
using EventTimestampTest = Test<LocalMemoryEnabledDeviceFixture>;
HWTEST2_F(EventTimestampTest, givenAppendMemoryCopyIsCalledWhenCpuCopyIsUsedAndCopyTimeIsLessThanDeviceTimestampResolutionThenReturnTimstampDifferenceAsOne, IsXeHpcCore) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
neoDevice->setOSTime(new NEO::MockOSTimeWithConstTimestamp());