mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 09:58:55 +08:00
fix(l0): add fallback to cpu memory copy
Fall back to GPU copy if obtaining the needed locked pointer fails. Related-To: NEO-7553 Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
e44e6e9a31
commit
86f63875bf
@@ -158,7 +158,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
||||
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc);
|
||||
bool isSuitableUSMSharedAlloc(NEO::SvmAllocationData *alloc);
|
||||
ze_result_t performCpuMemcpy(const CpuMemCopyInfo &cpuMemCopyInfo, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||
void *obtainLockedPtrFromDevice(NEO::SvmAllocationData *alloc, void *ptr);
|
||||
void *obtainLockedPtrFromDevice(NEO::SvmAllocationData *alloc, void *ptr, bool &lockingFailed);
|
||||
bool waitForEventsFromHost();
|
||||
void checkWaitEventsState(uint32_t numWaitEvents, ze_event_handle_t *waitEventList);
|
||||
TransferType getTransferType(NEO::SvmAllocationData *dstAlloc, NEO::SvmAllocationData *srcAlloc);
|
||||
|
||||
@@ -357,7 +357,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
|
||||
this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &cpuMemCopyInfo.dstAllocData);
|
||||
if (preferCopyThroughLockedPtr(cpuMemCopyInfo)) {
|
||||
return performCpuMemcpy(cpuMemCopyInfo, hSignalEvent, numWaitEvents, phWaitEvents);
|
||||
ret = performCpuMemcpy(cpuMemCopyInfo, hSignalEvent, numWaitEvents, phWaitEvents);
|
||||
if (ret == ZE_RESULT_SUCCESS || ret == ZE_RESULT_ERROR_DEVICE_LOST) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size);
|
||||
@@ -677,6 +680,17 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isSuitableUSMSharedAlloc(NEO
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(const CpuMemCopyInfo &cpuMemCopyInfo, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
|
||||
bool lockingFailed = false;
|
||||
auto srcLockPointer = obtainLockedPtrFromDevice(cpuMemCopyInfo.srcAllocData, const_cast<void *>(cpuMemCopyInfo.srcPtr), lockingFailed);
|
||||
if (lockingFailed) {
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
auto dstLockPointer = obtainLockedPtrFromDevice(cpuMemCopyInfo.dstAllocData, const_cast<void *>(cpuMemCopyInfo.dstPtr), lockingFailed);
|
||||
if (lockingFailed) {
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
bool needsBarrier = (numWaitEvents > 0);
|
||||
if (needsBarrier) {
|
||||
this->appendBarrier(nullptr, numWaitEvents, phWaitEvents);
|
||||
@@ -694,9 +708,6 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
|
||||
signalEvent = Event::fromHandle(hSignalEvent);
|
||||
}
|
||||
|
||||
auto srcLockPointer = obtainLockedPtrFromDevice(cpuMemCopyInfo.srcAllocData, const_cast<void *>(cpuMemCopyInfo.srcPtr));
|
||||
auto dstLockPointer = obtainLockedPtrFromDevice(cpuMemCopyInfo.dstAllocData, cpuMemCopyInfo.dstPtr);
|
||||
|
||||
const void *cpuMemcpySrcPtr = srcLockPointer ? srcLockPointer : cpuMemCopyInfo.srcPtr;
|
||||
void *cpuMemcpyDstPtr = dstLockPointer ? dstLockPointer : cpuMemCopyInfo.dstPtr;
|
||||
|
||||
@@ -724,7 +735,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(NEO::SvmAllocationData *allocData, void *ptr) {
|
||||
void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(NEO::SvmAllocationData *allocData, void *ptr, bool &lockingFailed) {
|
||||
if (!allocData) {
|
||||
return nullptr;
|
||||
}
|
||||
@@ -736,6 +747,10 @@ void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(N
|
||||
|
||||
if (!alloc->isLocked()) {
|
||||
this->device->getDriverHandle()->getMemoryManager()->lockResource(alloc);
|
||||
if (!alloc->isLocked()) {
|
||||
lockingFailed = true;
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto gpuAddress = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex())->getGpuAddress();
|
||||
|
||||
@@ -2373,6 +2373,27 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmDstHostP
|
||||
EXPECT_GE(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
|
||||
}
|
||||
|
||||
// Checks that appendMemoryCopy on an immediate command list goes through the
// kernel-based copy path (counted by appendMemoryCopyKernelWithGACalled) once
// the mock memory manager is told to fail lockResource for the destination
// device allocation.
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndFailedToLockPtrThenUseGpuMemcpy, IsAtLeastSkl) {
|
||||
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
|
||||
// Baseline copy with locking still allowed: the kernel-copy counter must stay at 0.
// NOTE(review): presumably this copy is served by the CPU (locked pointer) path - confirm.
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1 * MemoryConstants::megaByte, nullptr, 0, nullptr);
|
||||
ASSERT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);
|
||||
|
||||
NEO::SvmAllocationData *dstAllocData;
|
||||
ASSERT_TRUE(device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1 * MemoryConstants::megaByte, &dstAllocData));
|
||||
ASSERT_NE(dstAllocData, nullptr);
|
||||
auto mockMemoryManager = static_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager());
|
||||
auto graphicsAllocation = dstAllocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
|
||||
// Drop the existing lock and make every further lock attempt fail, so the
// next copy has to try (and fail) to lock the destination allocation again.
mockMemoryManager->unlockResource(graphicsAllocation);
|
||||
mockMemoryManager->failLockResource = true;
|
||||
ASSERT_FALSE(graphicsAllocation->isLocked());
|
||||
|
||||
// Same copy again: this time exactly one kernel-based copy is expected.
cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1 * MemoryConstants::megaByte, nullptr, 0, nullptr);
|
||||
EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 1u);
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndD2HCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsAtLeastSkl) {
|
||||
DebugManager.flags.ExperimentalD2HCpuCopyThreshold.set(2048);
|
||||
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
||||
@@ -2478,5 +2499,52 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndTimestampFlagN
|
||||
EXPECT_EQ(0u, reinterpret_cast<MockGpuTimestampEvent *>(event.get())->gpuEndTimestamp);
|
||||
}
|
||||
|
||||
// Checks obtainLockedPtrFromDevice when the memory manager cannot lock the
// allocation: it must return nullptr, set the lockingFailed out-flag and leave
// the graphics allocation unlocked.
HWTEST2_F(AppendMemoryLockedCopyTest, givenAllocationDataWhenFailingToObtainLockedPtrFromDeviceThenNullptrIsReturned, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
|
||||
NEO::SvmAllocationData *dstAllocData = nullptr;
|
||||
EXPECT_TRUE(device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData));
|
||||
ASSERT_NE(dstAllocData, nullptr);
|
||||
auto graphicsAllocation = dstAllocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
|
||||
// Precondition: the allocation starts unlocked, so a lock attempt is actually made.
ASSERT_FALSE(graphicsAllocation->isLocked());
|
||||
|
||||
auto mockMemoryManager = static_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager());
|
||||
// Force the mock memory manager to fail the upcoming lock request.
mockMemoryManager->failLockResource = true;
|
||||
|
||||
bool lockingFailed = false;
|
||||
void *lockedPtr = cmdList.obtainLockedPtrFromDevice(dstAllocData, devicePtr, lockingFailed);
|
||||
EXPECT_FALSE(graphicsAllocation->isLocked());
|
||||
EXPECT_TRUE(lockingFailed);
|
||||
EXPECT_EQ(lockedPtr, nullptr);
|
||||
}
|
||||
|
||||
// Checks that obtainLockedPtrFromDevice called without allocation data returns
// nullptr and does not report a locking failure (lockingFailed stays false).
HWTEST2_F(AppendMemoryLockedCopyTest, givenNullAllocationDataWhenObtainLockedPtrFromDeviceCalledThenNullptrIsReturned, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
bool lockingFailed = false;
|
||||
EXPECT_EQ(cmdList.obtainLockedPtrFromDevice(nullptr, devicePtr, lockingFailed), nullptr);
|
||||
EXPECT_FALSE(lockingFailed);
|
||||
}
|
||||
|
||||
// Checks that performCpuMemcpy returns ZE_RESULT_ERROR_UNKNOWN when locking
// fails, both when the allocation that has to be locked is on the destination
// side and, after swapping the allocation data, on the source side.
HWTEST2_F(AppendMemoryLockedCopyTest, givenFailedToObtainLockedPtrWhenPerformingCpuMemoryCopyThenErrorIsReturned, IsAtLeastSkl) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
CpuMemCopyInfo cpuMemCopyInfo(nullptr, nullptr, 1024);
|
||||
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &cpuMemCopyInfo.srcAllocData);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &cpuMemCopyInfo.dstAllocData);
|
||||
// Exactly one of the two lookups must have succeeded.
// NOTE(review): presumably only devicePtr has allocation data - confirm against the fixture.
ASSERT_TRUE(srcFound != dstFound);
|
||||
ze_result_t returnValue = ZE_RESULT_SUCCESS;
|
||||
|
||||
auto mockMemoryManager = static_cast<MockMemoryManager *>(device->getDriverHandle()->getMemoryManager());
|
||||
// Make every lock attempt fail from here on.
mockMemoryManager->failLockResource = true;
|
||||
|
||||
// numWaitEvents is 1 with a null wait list; the error is expected to be
// returned regardless of that argument combination.
returnValue = cmdList.performCpuMemcpy(cpuMemCopyInfo, nullptr, 1, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, returnValue);
|
||||
|
||||
// Swap the roles so the allocation with data is now the source, then repeat.
std::swap(cpuMemCopyInfo.srcAllocData, cpuMemCopyInfo.dstAllocData);
|
||||
returnValue = cmdList.performCpuMemcpy(cpuMemCopyInfo, nullptr, 1, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, returnValue);
|
||||
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
||||
@@ -3131,8 +3131,19 @@ TEST_F(ImmediateEventAllPacketSignalSinglePacketUseTest, givenSignalAllEventPack
|
||||
testQueryAllPackets(event.get(), true);
|
||||
}
|
||||
|
||||
using EventTimestampTest = Test<DeviceFixture>;
|
||||
HWTEST2_F(EventTimestampTest, givenAppendMemoryCopyRegionsIsCalledWhenCopyTimeIsLessThanDeviceTimestampResolutionThenReturnTimstampDifferenceAsOne, IsXeHpcCore) {
|
||||
// Device fixture that sets the EnableLocalMemory debug flag to 1 before
// running the regular DeviceFixture set-up.
struct LocalMemoryEnabledDeviceFixture : public DeviceFixture {
|
||||
void setUp() {
|
||||
// The flag is set before the base set-up runs.
DebugManager.flags.EnableLocalMemory.set(1);
|
||||
DeviceFixture::setUp();
|
||||
}
|
||||
void tearDown() {
|
||||
DeviceFixture::tearDown();
|
||||
}
|
||||
// NOTE(review): presumably restores the debug flags when the fixture is
// destroyed - confirm against DebugManagerStateRestore.
DebugManagerStateRestore restore;
|
||||
};
|
||||
|
||||
using EventTimestampTest = Test<LocalMemoryEnabledDeviceFixture>;
|
||||
HWTEST2_F(EventTimestampTest, givenAppendMemoryCopyIsCalledWhenCpuCopyIsUsedAndCopyTimeIsLessThanDeviceTimestampResolutionThenReturnTimstampDifferenceAsOne, IsXeHpcCore) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
|
||||
@@ -91,7 +91,10 @@ class MockMemoryManager : public MemoryManagerCreate<OsAgnosticMemoryManager> {
|
||||
|
||||
// Mock override of lockResourceImpl: counts the call, optionally simulates a
// failed lock via failLockResource, and records the returned pointer.
// Returns the pointer from OsAgnosticMemoryManager::lockResourceImpl, or
// nullptr when failLockResource is set.
void *lockResourceImpl(GraphicsAllocation &gfxAllocation) override {
|
||||
lockResourceCalled++;
|
||||
// NOTE(review): the next line declares pLockedMemory a second time. Judging by
// the hunk header (7 old lines, 10 new lines) it appears to be the removed
// pre-change line shown without its '-' marker - confirm against the repository.
auto pLockedMemory = OsAgnosticMemoryManager::lockResourceImpl(gfxAllocation);
|
||||
void *pLockedMemory = nullptr;
|
||||
// Only delegate to the base implementation when failure is not being simulated.
if (!failLockResource) {
|
||||
pLockedMemory = OsAgnosticMemoryManager::lockResourceImpl(gfxAllocation);
|
||||
}
|
||||
// The result is recorded even when it is nullptr.
lockResourcePointers.push_back(pLockedMemory);
|
||||
return pLockedMemory;
|
||||
}
|
||||
@@ -239,6 +242,7 @@ class MockMemoryManager : public MemoryManagerCreate<OsAgnosticMemoryManager> {
|
||||
bool failReserveAddress = false;
|
||||
bool failAllocateSystemMemory = false;
|
||||
bool failAllocate32Bit = false;
|
||||
bool failLockResource = false;
|
||||
bool failSetMemAdvise = false;
|
||||
bool setMemPrefetchCalled = false;
|
||||
bool cpuCopyRequired = false;
|
||||
|
||||
Reference in New Issue
Block a user