mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-01 04:23:00 +08:00
Enable CpuMemcpy for USM Host to Device transfer
This commit enables HostUSM -> DeviceUSM transfer to be performed through CpuMemcpy with 50KB threshold. USM copy is done on CPU only when it can be performed immediately - no dependencies present, no events to wait for. Related-To: NEO-7564 Signed-off-by: Fabian Zwolinski <fabian.zwolinski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
3cf5850172
commit
5eeea0dee8
@@ -155,7 +155,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
||||
void createLogicalStateHelper() override {}
|
||||
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
|
||||
|
||||
bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo);
|
||||
bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||
bool isSuitableUSMHostAlloc(NEO::SvmAllocationData *alloc);
|
||||
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc);
|
||||
bool isSuitableUSMSharedAlloc(NEO::SvmAllocationData *alloc);
|
||||
|
||||
@@ -395,7 +395,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
|
||||
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
|
||||
this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &cpuMemCopyInfo.dstAllocData);
|
||||
if (preferCopyThroughLockedPtr(cpuMemCopyInfo)) {
|
||||
if (preferCopyThroughLockedPtr(cpuMemCopyInfo, numWaitEvents, phWaitEvents)) {
|
||||
ret = performCpuMemcpy(cpuMemCopyInfo, hSignalEvent, numWaitEvents, phWaitEvents);
|
||||
if (ret == ZE_RESULT_SUCCESS || ret == ZE_RESULT_ERROR_DEVICE_LOST) {
|
||||
return ret;
|
||||
@@ -707,7 +707,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo) {
|
||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
|
||||
if (NEO::DebugManager.flags.ExperimentalForceCopyThroughLock.get() == 1) {
|
||||
return true;
|
||||
}
|
||||
@@ -723,6 +723,21 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(C
|
||||
bool cpuMemCopyEnabled = false;
|
||||
|
||||
switch (transferType) {
|
||||
case HOST_USM_TO_DEVICE_USM: {
|
||||
if (this->dependenciesPresent) {
|
||||
cpuMemCopyEnabled = false;
|
||||
break;
|
||||
}
|
||||
bool allEventsCompleted = true;
|
||||
for (uint32_t i = 0; i < numWaitEvents; i++) {
|
||||
if (!Event::fromHandle(phWaitEvents[i])->isAlreadyCompleted()) {
|
||||
allEventsCompleted = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
cpuMemCopyEnabled = allEventsCompleted;
|
||||
break;
|
||||
}
|
||||
case HOST_NON_USM_TO_DEVICE_USM:
|
||||
case DEVICE_USM_TO_HOST_NON_USM:
|
||||
cpuMemCopyEnabled = true;
|
||||
|
||||
@@ -1976,7 +1976,7 @@ struct AppendMemoryLockedCopyFixture : public DeviceFixture {
|
||||
|
||||
using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledThenReturnTrue, IsAtLeastSkl) {
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, nonUsmHostPtr, 1024);
|
||||
@@ -1984,7 +1984,68 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
|
||||
ASSERT_FALSE(srcFound);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
ASSERT_TRUE(dstFound);
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForD2HThenReturnTrue, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
CpuMemCopyInfo cpuMemCopyInfo(nonUsmHostPtr, devicePtr, 1024);
|
||||
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||
ASSERT_TRUE(srcFound);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
ASSERT_FALSE(dstFound);
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DThenReturnTrue, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
|
||||
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||
ASSERT_TRUE(srcFound);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
ASSERT_TRUE(dstFound);
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmHostPtrWhenPreferCopyThroughLockedPtrCalledForH2DWhenCopyCantBePerformedImmediatelyThenReturnFalse, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
CpuMemCopyInfo cpuMemCopyInfo(devicePtr, hostPtr, 1024);
|
||||
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(hostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||
ASSERT_TRUE(srcFound);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
ASSERT_TRUE(dstFound);
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
|
||||
eventPoolDesc.count = 1;
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::EventPool> eventPool = std::unique_ptr<L0::EventPool>(EventPool::create(device->getDriverHandle(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
ze_event_handle_t event = nullptr;
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.wait = 0;
|
||||
eventDesc.signal = 0;
|
||||
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
|
||||
std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(event));
|
||||
|
||||
cmdList.dependenciesPresent = false;
|
||||
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
|
||||
|
||||
cmdList.dependenciesPresent = true;
|
||||
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||
|
||||
cmdList.dependenciesPresent = true;
|
||||
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
|
||||
|
||||
eventObject->setIsCompleted();
|
||||
cmdList.dependenciesPresent = false;
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 1, &event));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsAtLeastSkl) {
|
||||
@@ -2064,7 +2125,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrA
|
||||
ASSERT_FALSE(srcFound);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
ASSERT_TRUE(dstFound);
|
||||
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
|
||||
EXPECT_FALSE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtrViaEnvVariableWhenPreferCopyThroughLockPointerCalledThenTrueIsReturned, IsAtLeastSkl) {
|
||||
@@ -2076,7 +2137,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndForcingLockPtr
|
||||
ASSERT_FALSE(srcFound);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
ASSERT_TRUE(dstFound);
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo));
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(cpuMemCopyInfo, 0, nullptr));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenGetTransferTypeThenReturnCorrectValue, IsAtLeastSkl) {
|
||||
@@ -2430,7 +2491,7 @@ class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImm
|
||||
uint32_t appendMemoryCopyKernelWithGACalled = 0;
|
||||
};
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseGpuMemcpy, IsAtLeastSkl) {
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrWhenCopyH2DThenUseCpuMemcpy, IsAtLeastSkl) {
|
||||
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
@@ -2439,7 +2500,7 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndUsmSrcHostPtrW
|
||||
context->allocHostMem(&hostDesc, 1024, 1u, &usmSrcPtr);
|
||||
|
||||
cmdList.appendMemoryCopy(devicePtr, usmSrcPtr, 1024, nullptr, 0, nullptr, false);
|
||||
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
|
||||
EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
|
||||
context->freeMem(usmSrcPtr);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user