Set isLockable if size small enough for cpu memcpy

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2022-12-06 10:19:05 +00:00
committed by Compute-Runtime-Automation
parent 336c8c10d5
commit 41a80072b9
9 changed files with 124 additions and 17 deletions

View File

@@ -623,14 +623,9 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(N
return true;
}
size_t h2DThreshold = 2 * MemoryConstants::megaByte;
size_t d2HThreshold = 1 * MemoryConstants::kiloByte;
if (NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get() != -1) {
h2DThreshold = NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get();
}
if (NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get() != -1) {
d2HThreshold = NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get();
}
size_t h2DThreshold = 0;
size_t d2HThreshold = 0;
NEO::GfxCoreHelper::getCpuCopyThresholds(h2DThreshold, d2HThreshold);
if (NEO::GfxCoreHelper::get(this->device->getHwInfo().platform.eRenderCoreFamily).copyThroughLockedPtrEnabled(this->device->getHwInfo())) {
return (!srcFound && isSuitableUSMDeviceAlloc(dstAlloc, dstFound) && size <= h2DThreshold) ||
(!dstFound && isSuitableUSMDeviceAlloc(srcAlloc, srcFound) && size <= d2HThreshold);
@@ -641,7 +636,8 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(N
/// Decides whether a looked-up USM allocation may be used for the
/// copy-through-locked-pointer path.
///
/// @param alloc      Allocation data for the pointer; only dereferenced when
///                   allocFound is true.
/// @param allocFound Whether the allocation lookup succeeded.
/// @return true only when the allocation was found, is device unified memory,
///         and its graphics allocation for this device's root device index has
///         exactly one bank and is flagged lockable.
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isSuitableUSMDeviceAlloc(NEO::SvmAllocationData *alloc, bool allocFound) {
    if (!allocFound || alloc->memoryType != InternalMemoryType::DEVICE_UNIFIED_MEMORY) {
        return false;
    }
    // Look the graphics allocation up once; both remaining checks read its storageInfo.
    auto *gfxAllocation = alloc->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex());
    // The lockable check must be part of the returned condition, otherwise
    // non-lockable allocations would still be reported as suitable.
    return gfxAllocation->storageInfo.getNumBanks() == 1 &&
           gfxAllocation->storageInfo.isLockable;
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@@ -172,7 +172,6 @@ ze_result_t ContextImp::allocDeviceMem(ze_device_handle_t hDevice,
unifiedMemoryProperties.allocationFlags.flags.shareable = isShareableMemory(deviceDesc->pNext, static_cast<uint32_t>(lookupTable.exportMemory), neoDevice);
unifiedMemoryProperties.device = neoDevice;
unifiedMemoryProperties.allocationFlags.flags.compressedHint = isAllocationSuitableForCompression(lookupTable, *device, size);
if (deviceDesc->flags & ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED) {
unifiedMemoryProperties.allocationFlags.flags.locallyUncachedResource = 1;
}

View File

@@ -1883,8 +1883,9 @@ HWTEST_F(CommandListCreate, givenCommandListWhenRemoveDeallocationContainerDataT
struct AppendMemoryLockedCopyFixture : public DeviceFixture {
void setUp() {
DebugManager.flags.ExperimentalCopyThroughLock.set(1);
DebugManager.flags.EnableLocalMemory.set(1);
DebugManager.flags.ExperimentalCopyThroughLock.set(1);
DebugManager.flags.ForceLocalMemoryAccessMode.set(0);
DeviceFixture::setUp();
nonUsmHostPtr = new char[sz];
@@ -1900,7 +1901,7 @@ struct AppendMemoryLockedCopyFixture : public DeviceFixture {
DebugManagerStateRestore restore;
char *nonUsmHostPtr;
void *devicePtr;
size_t sz = 4 * MemoryConstants::megaByte;
size_t sz = 2 * MemoryConstants::megaByte;
};
using AppendMemoryLockedCopyTest = Test<AppendMemoryLockedCopyFixture>;
@@ -1924,6 +1925,9 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenIsSuitableUSM
auto dstFound = device->getDriverHandle()->findAllocationDataForRange(devicePtr, 1024, &dstAllocData);
EXPECT_FALSE(cmdList.isSuitableUSMDeviceAlloc(srcAllocData, srcFound));
EXPECT_TRUE(cmdList.isSuitableUSMDeviceAlloc(dstAllocData, dstFound));
dstAllocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex())->storageInfo.isLockable = 0;
EXPECT_FALSE(cmdList.isSuitableUSMDeviceAlloc(dstAllocData, dstFound));
}
struct LocalMemoryMultiSubDeviceFixture : public SingleRootMultiSubDeviceFixture {
@@ -2286,22 +2290,50 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndNonUsmDstHostP
}
// Device-to-host copy of largeSize bytes with ExperimentalD2HCpuCopyThreshold
// raised to that same size: the mock's appendMemoryCopyKernelWithGACalled
// counter must stay at zero.
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndD2HCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsAtLeastSkl) {
    constexpr size_t largeSize = 3 * MemoryConstants::megaByte;
    DebugManager.flags.ExperimentalD2HCpuCopyThreshold.set(largeSize);
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    ze_device_mem_alloc_desc_t deviceDesc = {};
    void *deviceAlloc = nullptr;
    // Stop here if the device allocation fails, so an invalid pointer is never
    // handed to appendMemoryCopy. The host buffer is allocated afterwards so
    // this early exit cannot leak it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(), &deviceDesc, largeSize, 1u, &deviceAlloc));
    char *hostAlloc = new char[largeSize];

    cmdList.appendMemoryCopy(hostAlloc, deviceAlloc, largeSize, nullptr, 0, nullptr);
    EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);

    context->freeMem(deviceAlloc);
    delete[] hostAlloc;
}
// Host-to-device copy of largeSize bytes with no threshold debug flag set in
// this test: the copy is expected to go through appendMemoryCopyKernelWithGA
// exactly once.
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndH2DCopyWhenSizeTooLargeThenUseGpuMemcpy, IsAtLeastSkl) {
    constexpr size_t largeSize = 3 * MemoryConstants::megaByte;
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    ze_device_mem_alloc_desc_t deviceDesc = {};
    void *deviceAlloc = nullptr;
    // Stop here if the device allocation fails, so an invalid pointer is never
    // handed to appendMemoryCopy. The host buffer is allocated afterwards so
    // this early exit cannot leak it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(), &deviceDesc, largeSize, 1u, &deviceAlloc));
    // Value-initialized: the host buffer is the copy source, so its contents
    // should be determinate.
    char *hostAlloc = new char[largeSize]();

    cmdList.appendMemoryCopy(deviceAlloc, hostAlloc, largeSize, nullptr, 0, nullptr);
    EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 1u);

    context->freeMem(deviceAlloc);
    delete[] hostAlloc;
}
// Host-to-device copy of largeSize bytes with ExperimentalH2DCpuCopyThreshold
// raised to that same size: the mock's appendMemoryCopyKernelWithGACalled
// counter must stay at zero.
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndH2DCopyWhenSizeTooLargeButFlagSetThenUseCpuMemcpy, IsAtLeastSkl) {
    constexpr size_t largeSize = 3 * MemoryConstants::megaByte;
    DebugManager.flags.ExperimentalH2DCpuCopyThreshold.set(largeSize);
    MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;

    // Dedicated buffers of exactly largeSize are used for this copy; the
    // fixture's devicePtr/nonUsmHostPtr are sized by the fixture's sz, which
    // is smaller than largeSize.
    ze_device_mem_alloc_desc_t deviceDesc = {};
    void *deviceAlloc = nullptr;
    // Stop here if the device allocation fails, so an invalid pointer is never
    // handed to appendMemoryCopy. The host buffer is allocated afterwards so
    // this early exit cannot leak it.
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(), &deviceDesc, largeSize, 1u, &deviceAlloc));
    // Value-initialized: the host buffer is the copy source, so its contents
    // should be determinate.
    char *hostAlloc = new char[largeSize]();

    cmdList.appendMemoryCopy(deviceAlloc, hostAlloc, largeSize, nullptr, 0, nullptr);
    EXPECT_EQ(cmdList.appendMemoryCopyKernelWithGACalled, 0u);

    context->freeMem(deviceAlloc);
    delete[] hostAlloc;
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyThenAppendBarrierCalled, IsAtLeastSkl) {