mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 18:06:32 +08:00
Enable CpuMemcpy for USM Host to Device transfer
This commit enables host USM -> device USM transfers to be performed through CpuMemcpy, subject to a 50KB threshold. The USM copy is done on the CPU only when it can be performed immediately, i.e. when no dependencies are present and every wait event (if any) has already completed. Related-To: NEO-7564 Signed-off-by: Fabian Zwolinski <fabian.zwolinski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
3cf5850172
commit
5eeea0dee8
@@ -155,7 +155,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
|||||||
void createLogicalStateHelper() override {}
|
void createLogicalStateHelper() override {}
|
||||||
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
|
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
|
||||||
|
|
||||||
bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo);
|
bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||||
bool isSuitableUSMHostAlloc(NEO::SvmAllocationData *alloc);
|
bool isSuitableUSMHostAlloc(NEO::SvmAllocationData *alloc);
|
||||||
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc);
|
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc);
|
||||||
bool isSuitableUSMSharedAlloc(NEO::SvmAllocationData *alloc);
|
bool isSuitableUSMSharedAlloc(NEO::SvmAllocationData *alloc);
|
||||||
|
|||||||
@@ -395,7 +395,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
|||||||
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
|
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
|
||||||
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
|
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
|
||||||
this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &cpuMemCopyInfo.dstAllocData);
|
this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &cpuMemCopyInfo.dstAllocData);
|
||||||
if (preferCopyThroughLockedPtr(cpuMemCopyInfo)) {
|
if (preferCopyThroughLockedPtr(cpuMemCopyInfo, numWaitEvents, phWaitEvents)) {
|
||||||
ret = performCpuMemcpy(cpuMemCopyInfo, hSignalEvent, numWaitEvents, phWaitEvents);
|
ret = performCpuMemcpy(cpuMemCopyInfo, hSignalEvent, numWaitEvents, phWaitEvents);
|
||||||
if (ret == ZE_RESULT_SUCCESS || ret == ZE_RESULT_ERROR_DEVICE_LOST) {
|
if (ret == ZE_RESULT_SUCCESS || ret == ZE_RESULT_ERROR_DEVICE_LOST) {
|
||||||
return ret;
|
return ret;
|
||||||
@@ -707,7 +707,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo) {
|
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
|
||||||
if (NEO::DebugManager.flags.ExperimentalForceCopyThroughLock.get() == 1) {
|
if (NEO::DebugManager.flags.ExperimentalForceCopyThroughLock.get() == 1) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -723,6 +723,21 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(C
|
|||||||
bool cpuMemCopyEnabled = false;
|
bool cpuMemCopyEnabled = false;
|
||||||
|
|
||||||
switch (transferType) {
|
switch (transferType) {
|
||||||
|
case HOST_USM_TO_DEVICE_USM: {
|
||||||
|
if (this->dependenciesPresent) {
|
||||||
|
cpuMemCopyEnabled = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bool allEventsCompleted = true;
|
||||||
|
for (uint32_t i = 0; i < numWaitEvents; i++) {
|
||||||
|
if (!Event::fromHandle(phWaitEvents[i])->isAlreadyCompleted()) {
|
||||||
|
allEventsCompleted = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cpuMemCopyEnabled = allEventsCompleted;
|
||||||
|
break;
|
||||||
|
}
|
||||||
case HOST_NON_USM_TO_DEVICE_USM:
|
case HOST_NON_USM_TO_DEVICE_USM:
|
||||||
case DEVICE_USM_TO_HOST_NON_USM:
|
case DEVICE_USM_TO_HOST_NON_USM:
|
||||||
cpuMemCopyEnabled = true;
|
cpuMemCopyEnabled = true;
|
||||||
|
|||||||
@@ -1976,7 +1976,7 @@ struct AppendMemoryLockedCopyFixture : public DeviceFixture {
|
|||||||
|
|
||||||
using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
|
using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
|
||||||
|
|
||||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledThenReturnTrue, IsAtLeastSkl) {
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
|
||||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||||
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
|
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
|
||||||
@@ -1984,7 +1984,68 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
|
|||||||
ASSERT_FALSE(srcFound);
|
ASSERT_FALSE(srcFound);
|
||||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||||
ASSERT_TRUE(dstFound);
|
ASSERT_TRUE(dstFound);
|
||||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
|
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForD2HThenReturnTrue, IsAtLeastSkl) {
|
||||||
|
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||||
|
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||||
|
CpuMemCopyInfo cpuMemCopyInfo(nonUsmHostPtr, devicePtr, 1024);
|
||||||
|
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||||
|
ASSERT_TRUE(srcFound);
|
||||||
|
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||||
|
ASSERT_FALSE(dstFound);
|
||||||
|
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
|
||||||
|
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||||
|
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||||
|
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
|
||||||
|
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||||
|
ASSERT_TRUE(srcFound);
|
||||||
|
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||||
|
ASSERT_TRUE(dstFound);
|
||||||
|
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DWhenCopyCantBePerformedImmediatelyThenReturnFalse, IsAtLeastSkl) {
|
||||||
|
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||||
|
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||||
|
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
|
||||||
|
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||||
|
ASSERT_TRUE(srcFound);
|
||||||
|
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||||
|
ASSERT_TRUE(dstFound);
|
||||||
|
|
||||||
|
ze_event_pool_desc_t eventPoolDesc = {};
|
||||||
|
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
|
||||||
|
eventPoolDesc.count = 1;
|
||||||
|
ze_result_t returnValue;
|
||||||
|
std::unique_ptr<L0::EventPool> eventPool = std::unique_ptr<L0::EventPool>(EventPool::create(device->getDriverHandle(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||||
|
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||||
|
|
||||||
|
ze_event_handle_t event = nullptr;
|
||||||
|
ze_event_desc_t eventDesc = {};
|
||||||
|
eventDesc.index = 0;
|
||||||
|
eventDesc.wait = 0;
|
||||||
|
eventDesc.signal = 0;
|
||||||
|
|
||||||
|
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
|
||||||
|
std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(event));
|
||||||
|
|
||||||
|
cmdList.dependenciesPresent = false;
|
||||||
|
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
|
||||||
|
|
||||||
|
cmdList.dependenciesPresent = true;
|
||||||
|
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||||
|
|
||||||
|
cmdList.dependenciesPresent = true;
|
||||||
|
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
|
||||||
|
|
||||||
|
eventObject->setIsCompleted();
|
||||||
|
cmdList.dependenciesPresent = false;
|
||||||
|
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsAtLeastSkl) {
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsAtLeastSkl) {
|
||||||
@@ -2064,7 +2125,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrA
|
|||||||
ASSERT_FALSE(srcFound);
|
ASSERT_FALSE(srcFound);
|
||||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||||
ASSERT_TRUE(dstFound);
|
ASSERT_TRUE(dstFound);
|
||||||
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
|
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtrViaEnvVariableWhenPreferCopyThroughLockPointerCalledThenTrueIsReturned, IsAtLeastSkl) {
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtrViaEnvVariableWhenPreferCopyThroughLockPointerCalledThenTrueIsReturned, IsAtLeastSkl) {
|
||||||
@@ -2076,7 +2137,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtr
|
|||||||
ASSERT_FALSE(srcFound);
|
ASSERT_FALSE(srcFound);
|
||||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||||
ASSERT_TRUE(dstFound);
|
ASSERT_TRUE(dstFound);
|
||||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
|
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTypeThenReturnCorrectValue, IsAtLeastSkl) {
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTypeThenReturnCorrectValue, IsAtLeastSkl) {
|
||||||
@@ -2430,7 +2491,7 @@ class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImm
|
|||||||
uint32_t appendMemoryCopyKernelWithGACalled = 0;
|
uint32_t appendMemoryCopyKernelWithGACalled = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseGpuMemcpy, IsAtLeastSkl) {
|
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseCpuMemcpy, IsAtLeastSkl) {
|
||||||
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
||||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||||
@@ -2439,7 +2500,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrW
|
|||||||
context->allocHostMem(&hostDesc, 1024, 1u, &usmSrcPtr);
|
context->allocHostMem(&hostDesc, 1024, 1u, &usmSrcPtr);
|
||||||
|
|
||||||
cmdList.appendMemoryCopy(devicePtr, usmSrcPtr, 1024, nullptr, 0, nullptr, false);
|
cmdList.appendMemoryCopy(devicePtr, usmSrcPtr, 1024, nullptr, 0, nullptr, false);
|
||||||
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
|
EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
|
||||||
context->freeMem(usmSrcPtr);
|
context->freeMem(usmSrcPtr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user