mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-07 12:42:54 +08:00
[L0][XE_HPC]Perform memcpy on CPU by default
Related-To: NEO-7237 Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
edca8aa6de
commit
17655e3ed3
@@ -329,9 +329,6 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
bool systolicModeSupport = false;
|
||||
bool pipelineSelectStateTracking = false;
|
||||
bool stateComputeModeTracking = false;
|
||||
|
||||
std::atomic<uint32_t> barrierCounter{0u};
|
||||
uint32_t latestFlushedBarrierCounter = 0u;
|
||||
};
|
||||
|
||||
using CommandListAllocatorFn = CommandList *(*)(uint32_t);
|
||||
|
||||
@@ -2517,7 +2517,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
|
||||
}
|
||||
|
||||
appendSignalEventPostWalker(signalEvent, workloadPartition);
|
||||
this->barrierCounter++;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -129,9 +129,12 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
||||
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
|
||||
|
||||
bool preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size);
|
||||
bool isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound);
|
||||
bool isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc, bool allocFound);
|
||||
ze_result_t performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||
void *obtainLockedPtrFromDevice(void *ptr, size_t size);
|
||||
|
||||
protected:
|
||||
std::atomic<bool> barrierCalled{false};
|
||||
};
|
||||
|
||||
template <PRODUCT_FAMILY gfxProductFamily>
|
||||
|
||||
@@ -246,6 +246,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(
|
||||
checkAvailableSpace();
|
||||
}
|
||||
ret = CommandListCoreFamily<gfxCoreFamily>::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents);
|
||||
|
||||
this->barrierCalled = true;
|
||||
return flushImmediate(ret, true, hSignalEvent);
|
||||
}
|
||||
|
||||
@@ -525,15 +527,16 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(N
|
||||
d2HThreshold = NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get();
|
||||
}
|
||||
if (NEO::HwHelper::get(this->device->getHwInfo().platform.eRenderCoreFamily).copyThroughLockedPtrEnabled()) {
|
||||
return (!srcFound && isAllocUSMDeviceMemory(dstAlloc, dstFound) && size <= h2DThreshold) ||
|
||||
(!dstFound && isAllocUSMDeviceMemory(srcAlloc, srcFound) && size <= d2HThreshold);
|
||||
return (!srcFound && isSuitableUSMDeviceAlloc(dstAlloc, dstFound) && size <= h2DThreshold) ||
|
||||
(!dstFound && isSuitableUSMDeviceAlloc(srcAlloc, srcFound) && size <= d2HThreshold);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound) {
|
||||
return allocFound && (alloc->memoryType == InternalMemoryType::DEVICE_UNIFIED_MEMORY);
|
||||
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc, bool allocFound) {
|
||||
return allocFound && (alloc->memoryType == InternalMemoryType::DEVICE_UNIFIED_MEMORY) &&
|
||||
alloc->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex())->storageInfo.getNumBanks() == 1;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -544,8 +547,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(void
|
||||
this->appendBarrier(nullptr, numWaitEvents, phWaitEvents);
|
||||
}
|
||||
|
||||
bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter;
|
||||
if (needsFlushTagUpdate) {
|
||||
if (this->barrierCalled) {
|
||||
this->csr->flushTagUpdate();
|
||||
}
|
||||
|
||||
@@ -564,13 +566,13 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(void
|
||||
cpuMemcpyDstPtr = dstptr;
|
||||
}
|
||||
|
||||
if (needsFlushTagUpdate) {
|
||||
if (this->barrierCalled) {
|
||||
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
|
||||
const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount());
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
this->latestFlushedBarrierCounter = this->barrierCounter;
|
||||
this->barrierCalled = false;
|
||||
}
|
||||
|
||||
if (signalEvent) {
|
||||
|
||||
@@ -484,6 +484,7 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm
|
||||
using BaseClass = WhiteBox<::L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>;
|
||||
MockCommandListImmediateHw() : BaseClass() {}
|
||||
using BaseClass::applyMemoryRangesBarrier;
|
||||
using BaseClass::barrierCalled;
|
||||
using BaseClass::isFlushTaskSubmissionEnabled;
|
||||
using BaseClass::isSyncModeQueue;
|
||||
|
||||
|
||||
@@ -886,7 +886,6 @@ HWTEST2_F(CommandListAppendLaunchKernelXeHpcCore,
|
||||
|
||||
struct AppendMemoryLockedCopyFixture : public DeviceFixture {
|
||||
void setUp() {
|
||||
DebugManager.flags.ExperimentalCopyThroughLock.set(1);
|
||||
DeviceFixture::setUp();
|
||||
|
||||
nonUsmHostPtr = new char[sz];
|
||||
@@ -917,15 +916,40 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrW
|
||||
EXPECT_TRUE(cmdList.preferCopyThroughLockedPtr(dstAllocData, dstFound, srcAllocData, srcFound, 1024));
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsAllocDeviceMemoryThenReturnCorrectValue, IsXeHpcCore) {
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocThenReturnCorrectValue, IsXeHpcCore) {
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
NEO::SvmAllocationData *srcAllocData;
|
||||
NEO::SvmAllocationData *dstAllocData;
|
||||
auto srcFound = device->getDriverHandle()->findAllocationDataForRange(nonUsmHostPtr, 1024, &srcAllocData);
|
||||
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
|
||||
EXPECT_FALSE(cmdList.isAllocUSMDeviceMemory(srcAllocData, srcFound));
|
||||
EXPECT_TRUE(cmdList.isAllocUSMDeviceMemory(dstAllocData, dstFound));
|
||||
EXPECT_FALSE(cmdList.isSuitableUSMDeviceAlloc(srcAllocData, srcFound));
|
||||
EXPECT_TRUE(cmdList.isSuitableUSMDeviceAlloc(dstAllocData, dstFound));
|
||||
}
|
||||
|
||||
struct LocalMemoryMultiSubDeviceFixture : public SingleRootMultiSubDeviceFixture {
|
||||
void setUp() {
|
||||
DebugManager.flags.EnableLocalMemory.set(1);
|
||||
DebugManager.flags.EnableImplicitScaling.set(1);
|
||||
SingleRootMultiSubDeviceFixture::setUp();
|
||||
}
|
||||
DebugManagerStateRestore restore;
|
||||
};
|
||||
|
||||
using LocalMemoryMultiSubDeviceTest = Test<LocalMemoryMultiSubDeviceFixture>;
|
||||
|
||||
// Verifies that isSuitableUSMDeviceAlloc() rejects a USM device allocation
// created under the multi-sub-device fixture. The helper only accepts
// allocations whose storage reports exactly one bank; the 2 MB buffer
// allocated here is expected to report more than one.
HWTEST2_F(LocalMemoryMultiSubDeviceTest, givenImmediateCommandListWhenIsSuitableUSMDeviceAllocWithColouredBufferThenReturnFalse, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    constexpr size_t allocSize = 2 * MemoryConstants::megaByte;

    // Initialised so that a failed allocation can never hand an indeterminate
    // pointer to the lookup or to freeMem().
    void *devicePtr = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    // Stop right away if the allocation fails: there would be nothing to test.
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(), &deviceDesc, allocSize, 1u, &devicePtr));
    ASSERT_NE(nullptr, devicePtr);

    NEO::SvmAllocationData *allocData = nullptr;
    auto allocFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, allocSize, &allocData);
    // isSuitableUSMDeviceAlloc() also returns false when the allocation was not
    // found, so require a successful lookup; otherwise the check below could
    // pass without ever inspecting the allocation. EXPECT (not ASSERT) keeps
    // the clean-up at the end reachable.
    EXPECT_TRUE(allocFound);

    EXPECT_FALSE(cmdList.isSuitableUSMDeviceAlloc(allocData, allocFound));

    context->freeMem(devicePtr);
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmHostPtrAndFlagDisabledWhenPreferCopyThroughLockedPtrCalledThenReturnFalse, IsXeHpcCore) {
|
||||
@@ -1076,6 +1100,23 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
|
||||
EXPECT_EQ(waitForFlushTagUpdateCalled, 1u);
|
||||
}
|
||||
|
||||
// Tracks the barrierCalled flag of an immediate command list through three
// stages: clear after initialisation, set once appendBarrier() has been
// issued, and clear again after a successful appendMemoryCopy() from a
// non-USM host pointer to device memory.
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetBarrierCalled, IsXeHpcCore) {
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

    // Route the command list through the device's internal engine CSR.
    const auto &internalEngine = device->getNEODevice()->getInternalEngine();
    cmdList.csr = internalEngine.commandStreamReceiver;

    // Stage 1: no barrier has been appended yet.
    EXPECT_FALSE(cmdList.barrierCalled.load());

    // Stage 2: appending a barrier raises the flag.
    cmdList.appendBarrier(nullptr, 0, nullptr);
    EXPECT_TRUE(cmdList.barrierCalled.load());

    // Stage 3: the copy succeeds and leaves the flag cleared.
    EXPECT_EQ(cmdList.appendMemoryCopy(devicePtr, nonUsmHostPtr, 1024, nullptr, 0, nullptr), ZE_RESULT_SUCCESS);
    EXPECT_FALSE(cmdList.barrierCalled.load());
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
class MockAppendMemoryLockedCopyTestImmediateCmdList : public MockCommandListImmediateHw<gfxCoreFamily> {
|
||||
public:
|
||||
|
||||
Reference in New Issue
Block a user