Enable CpuMemcpy for USM Host to Device transfer

This commit enables HostUSM -> DeviceUSM transfer to be performed
through CpuMemcpy with 50KB threshold.
USM copy is done on CPU only when it can be performed immediately -
no dependencies present, no events to wait for.

Related-To: NEO-7564
Signed-off-by: Fabian Zwolinski <fabian.zwolinski@intel.com>
This commit is contained in:
Fabian Zwolinski
2023-02-10 14:38:53 +00:00
committed by Compute-Runtime-Automation
parent 3cf5850172
commit 5eeea0dee8
3 changed files with 85 additions and 9 deletions

View File

@@ -155,7 +155,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void createLogicalStateHelper() override {}
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo);
bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
bool isSuitableUSMHostAlloc(NEO::SvmAllocationData *alloc);
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc);
bool isSuitableUSMSharedAlloc(NEO::SvmAllocationData *alloc);

View File

@@ -395,7 +395,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &cpuMemCopyInfo.dstAllocData);
if (preferCopyThroughLockedPtr(cpuMemCopyInfo)) {
if (preferCopyThroughLockedPtr(cpuMemCopyInfo, numWaitEvents, phWaitEvents)) {
ret = performCpuMemcpy(cpuMemCopyInfo, hSignalEvent, numWaitEvents, phWaitEvents);
if (ret == ZE_RESULT_SUCCESS || ret == ZE_RESULT_ERROR_DEVICE_LOST) {
return ret;
@@ -707,7 +707,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo) {
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
if (NEO::DebugManager.flags.ExperimentalForceCopyThroughLock.get() == 1) {
return true;
}
@@ -723,6 +723,21 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(C
bool cpuMemCopyEnabled = false;
switch (transferType) {
case HOST_USM_TO_DEVICE_USM: {
if (this->dependenciesPresent) {
cpuMemCopyEnabled = false;
break;
}
bool allEventsCompleted = true;
for (uint32_t i = 0; i < numWaitEvents; i++) {
if (!Event::fromHandle(phWaitEvents[i])->isAlreadyCompleted()) {
allEventsCompleted = false;
break;
}
}
cpuMemCopyEnabled = allEventsCompleted;
break;
}
case HOST_NON_USM_TO_DEVICE_USM:
case DEVICE_USM_TO_HOST_NON_USM:
cpuMemCopyEnabled = true;

View File

@@ -1976,7 +1976,7 @@ struct AppendMemoryLockedCopyFixture : public DeviceFixture {
using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledThenReturnTrue, IsAtLeastSkl) {
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
@@ -1984,7 +1984,68 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
ASSERT_FALSE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_TRUE(dstFound);
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForD2HThenReturnTrue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(nonUsmHostPtr, devicePtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.srcAllocData);
ASSERT_TRUE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_FALSE(dstFound);
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
ASSERT_TRUE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_TRUE(dstFound);
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DWhenCopyCantBePerformedImmediatelyThenReturnFalse, IsAtLeastSkl) {
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
ASSERT_TRUE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_TRUE(dstFound);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
eventPoolDesc.count = 1;
ze_result_t returnValue;
std::unique_ptr<L0::EventPool> eventPool = std::unique_ptr<L0::EventPool>(EventPool::create(device->getDriverHandle(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ze_event_handle_t event = nullptr;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.wait = 0;
eventDesc.signal = 0;
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(event));
cmdList.dependenciesPresent = false;
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
cmdList.dependenciesPresent = true;
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
cmdList.dependenciesPresent = true;
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
eventObject->setIsCompleted();
cmdList.dependenciesPresent = false;
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsAtLeastSkl) {
@@ -2064,7 +2125,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrA
ASSERT_FALSE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_TRUE(dstFound);
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtrViaEnvVariableWhenPreferCopyThroughLockPointerCalledThenTrueIsReturned, IsAtLeastSkl) {
@@ -2076,7 +2137,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtr
ASSERT_FALSE(srcFound);
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
ASSERT_TRUE(dstFound);
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTypeThenReturnCorrectValue, IsAtLeastSkl) {
@@ -2430,7 +2491,7 @@ class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImm
uint32_t appendMemoryCopyKernelWithGACalled = 0;
};
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseGpuMemcpy, IsAtLeastSkl) {
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseCpuMemcpy, IsAtLeastSkl) {
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
@@ -2439,7 +2500,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrW
context->allocHostMem(&hostDesc, 1024, 1u, &usmSrcPtr);
cmdList.appendMemoryCopy(devicePtr, usmSrcPtr, 1024, nullptr, 0, nullptr, false);
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
context->freeMem(usmSrcPtr);
}