Reuse graphics allocations in svmMemcpy

Related-To: NEO-6352
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
Maciej Dziuban
2021-11-08 12:06:22 +00:00
committed by Compute-Runtime-Automation
parent 32370473ad
commit 457ef00abf
7 changed files with 238 additions and 46 deletions

View File

@@ -979,15 +979,4 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
}
}
void *CommandQueue::convertAddressWithOffsetToGpuVa(void *ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation) {
    // Device and shared USM pointers are already GPU VAs - return them untouched.
    if ((memoryType == DEVICE_UNIFIED_MEMORY) || (memoryType == SHARED_UNIFIED_MEMORY)) {
        return ptr;
    }
    // Anything else is assumed to be a CPU VA: rebase it onto the allocation's
    // GPU VA while preserving the byte offset from the allocation start.
    const auto offsetFromAllocStart = ptrDiff(ptr, allocation.getUnderlyingBuffer());
    return reinterpret_cast<void *>(allocation.getGpuAddress() + offsetFromAllocStart);
}
} // namespace NEO

View File

@@ -322,7 +322,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
return requiresCacheFlushAfterWalker;
}
static void *convertAddressWithOffsetToGpuVa(void *ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation);
template <typename PtrType>
static PtrType convertAddressWithOffsetToGpuVa(PtrType ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation);
void updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint32_t newBcsTaskCount);
uint32_t peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const;
@@ -404,6 +405,18 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
};
template <typename PtrType>
PtrType CommandQueue::convertAddressWithOffsetToGpuVa(PtrType ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation) {
    // Device and shared USM pointers are already GPU VAs - return them untouched.
    if ((memoryType == DEVICE_UNIFIED_MEMORY) || (memoryType == SHARED_UNIFIED_MEMORY)) {
        return ptr;
    }
    // Anything else is assumed to be a CPU VA: rebase it onto the allocation's
    // GPU VA while preserving the byte offset from the allocation start.
    const auto offsetFromAllocStart = ptrDiff(ptr, allocation.getUnderlyingBuffer());
    return reinterpret_cast<PtrType>(allocation.getGpuAddress() + offsetFromAllocStart);
}
using CommandQueueCreateFunc = CommandQueue *(*)(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage);
} // namespace NEO

View File

@@ -73,8 +73,12 @@ struct CsrSelectionArgs {
}
static void processResource(const MultiGraphicsAllocation &multiGfxAlloc, uint32_t rootDeviceIndex, Resource &outResource) {
    // Resolve the allocation for the given root device and delegate to the
    // single-allocation overload, which fills in outResource. The direct
    // assignments previously duplicated here were redundant: the delegated
    // call overwrote both fields anyway.
    processResource(*multiGfxAlloc.getGraphicsAllocation(rootDeviceIndex), rootDeviceIndex, outResource);
}
static void processResource(const GraphicsAllocation &gfxAlloc, uint32_t rootDeviceIndex, Resource &outResource) {
    // Record the allocation and whether it lives in device-local memory.
    // NOTE(review): rootDeviceIndex is unused here; presumably kept for
    // signature symmetry with the MultiGraphicsAllocation overload - confirm.
    outResource.isLocal = gfxAlloc.isAllocatedInLocalMemoryPool();
    outResource.allocation = &gfxAlloc;
}
static inline TransferDirection createTransferDirection(bool srcLocal, bool dstLocal) {

View File

@@ -274,6 +274,24 @@ inline void setOperationParams(BuiltinOpParams &operationParams, size_t size,
operationParams.dstOffset = {ptrDiff(dstPtr, operationParams.dstPtr), 0, 0};
}
template <typename PtrType>
inline std::tuple<SvmAllocationData *, GraphicsAllocation *, PtrType> getExistingAlloc(Context *context,
                                                                                       PtrType ptr,
                                                                                       size_t size,
                                                                                       uint32_t rootDeviceIndex) {
    // Resolve ptr to an already-existing allocation: first as an SVM/USM
    // allocation, then as a previously mapped host pointer. Returns the SVM
    // entry (if any), the resolved allocation (if any), and ptr - rebased to a
    // GPU VA when it came from a map allocation.
    SvmAllocationData *svmData = nullptr;
    GraphicsAllocation *allocation = nullptr;
    // Guard against contexts without an SVM allocations manager, mirroring the
    // null check done in Context::tryGetExistingSvmAllocation.
    if (auto *svmManager = context->getSVMAllocsManager()) {
        svmData = svmManager->getSVMAlloc(ptr);
    }
    if (svmData != nullptr) {
        allocation = svmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
    } else {
        context->tryGetExistingMapAllocation(ptr, size, allocation);
        if (allocation != nullptr) {
            // Map allocations are keyed by CPU VA; convert to the GPU VA while
            // preserving the offset from the allocation start.
            ptr = CommandQueue::convertAddressWithOffsetToGpuVa(ptr, InternalMemoryType::NOT_SPECIFIED, *allocation);
        }
    }
    return std::make_tuple(svmData, allocation, ptr);
}
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
void *dstPtr,
@@ -287,28 +305,28 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
return CL_INVALID_VALUE;
}
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
auto dstSvmData = context->getSVMAllocsManager()->getSVMAlloc(dstPtr);
auto srcSvmData = context->getSVMAllocsManager()->getSVMAlloc(srcPtr);
auto [dstSvmData, dstAllocation, dstGpuPtr] = getExistingAlloc(context, dstPtr, size, rootDeviceIndex);
auto [srcSvmData, srcAllocation, srcGpuPtr] = getExistingAlloc(context, srcPtr, size, rootDeviceIndex);
enum CopyType { HostToHost,
SvmToHost,
HostToSvm,
SvmToSvm };
CopyType copyType = HostToHost;
if ((srcSvmData != nullptr) && (dstSvmData != nullptr)) {
if ((srcAllocation != nullptr) && (dstAllocation != nullptr)) {
copyType = SvmToSvm;
} else if ((srcSvmData == nullptr) && (dstSvmData != nullptr)) {
} else if ((srcAllocation == nullptr) && (dstAllocation != nullptr)) {
copyType = HostToSvm;
} else if (srcSvmData != nullptr) {
} else if (srcAllocation != nullptr) {
copyType = SvmToHost;
}
auto pageFaultManager = context->getMemoryManager()->getPageFaultManager();
if (dstSvmData && pageFaultManager) {
pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(dstSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()));
pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(dstAllocation->getGpuAddress()));
}
if (srcSvmData && pageFaultManager) {
pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(srcSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()));
pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(srcAllocation->getGpuAddress()));
}
auto isStatelessRequired = false;
@@ -330,20 +348,20 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
cl_command_type cmdType;
if (copyType == SvmToHost) {
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &srcSvmData->gpuAllocations, {}, device->getRootDeviceIndex(), &size};
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, srcAllocation, {}, device->getRootDeviceIndex(), &size};
CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
GeneralSurface srcSvmSurf(srcSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex));
HostPtrSurface dstHostPtrSurf(dstPtr, size);
GeneralSurface srcSvmSurf(srcAllocation);
HostPtrSurface dstHostPtrSurf(dstGpuPtr, size);
if (size != 0) {
bool status = csr.createAllocationForHostSurface(dstHostPtrSurf, true);
if (!status) {
return CL_OUT_OF_RESOURCES;
}
dstPtr = reinterpret_cast<void *>(dstHostPtrSurf.getAllocation()->getGpuAddress());
notifyEnqueueSVMMemcpy(srcSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex), !!blockingCopy, EngineHelpers::isBcs(csr.getOsContext().getEngineType()));
dstGpuPtr = reinterpret_cast<void *>(dstHostPtrSurf.getAllocation()->getGpuAddress());
notifyEnqueueSVMMemcpy(srcAllocation, !!blockingCopy, EngineHelpers::isBcs(csr.getOsContext().getEngineType()));
}
setOperationParams(operationParams, size, srcPtr, srcSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex), dstPtr, dstHostPtrSurf.getAllocation());
setOperationParams(operationParams, size, srcGpuPtr, srcAllocation, dstGpuPtr, dstHostPtrSurf.getAllocation());
surfaces[0] = &srcSvmSurf;
surfaces[1] = &dstHostPtrSurf;
@@ -351,36 +369,33 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
} else if (copyType == HostToSvm) {
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, {}, &dstSvmData->gpuAllocations, device->getRootDeviceIndex(), &size};
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, {}, dstAllocation, device->getRootDeviceIndex(), &size};
CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
HostPtrSurface srcHostPtrSurf(const_cast<void *>(srcPtr), size);
GeneralSurface dstSvmSurf(dstSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex));
HostPtrSurface srcHostPtrSurf(const_cast<void *>(srcGpuPtr), size);
GeneralSurface dstSvmSurf(dstAllocation);
cmdType = CL_COMMAND_WRITE_BUFFER;
if (size != 0) {
bool status = csr.createAllocationForHostSurface(srcHostPtrSurf, false);
if (!status) {
return CL_OUT_OF_RESOURCES;
}
srcPtr = reinterpret_cast<void *>(srcHostPtrSurf.getAllocation()->getGpuAddress());
srcGpuPtr = reinterpret_cast<void *>(srcHostPtrSurf.getAllocation()->getGpuAddress());
}
setOperationParams(operationParams, size, srcPtr, srcHostPtrSurf.getAllocation(),
dstPtr, dstSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex));
setOperationParams(operationParams, size, srcGpuPtr, srcHostPtrSurf.getAllocation(), dstGpuPtr, dstAllocation);
surfaces[0] = &dstSvmSurf;
surfaces[1] = &srcHostPtrSurf;
dispatchInfo.setBuiltinOpParams(operationParams);
dispatchInfo.setBuiltinOpParams(operationParams);
dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
} else if (copyType == SvmToSvm) {
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &srcSvmData->gpuAllocations, &dstSvmData->gpuAllocations, device->getRootDeviceIndex(), &size};
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, srcAllocation, dstAllocation, device->getRootDeviceIndex(), &size};
CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
GeneralSurface srcSvmSurf(srcSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex));
GeneralSurface dstSvmSurf(dstSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex));
setOperationParams(operationParams, size, srcPtr, srcSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex),
dstPtr, dstSvmData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex));
GeneralSurface srcSvmSurf(srcAllocation);
GeneralSurface dstSvmSurf(dstAllocation);
setOperationParams(operationParams, size, srcGpuPtr, srcAllocation, dstGpuPtr, dstAllocation);
surfaces[0] = &srcSvmSurf;
surfaces[1] = &dstSvmSurf;
@@ -391,8 +406,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
HostPtrSurface srcHostPtrSurf(const_cast<void *>(srcPtr), size);
HostPtrSurface dstHostPtrSurf(dstPtr, size);
HostPtrSurface srcHostPtrSurf(const_cast<void *>(srcGpuPtr), size);
HostPtrSurface dstHostPtrSurf(dstGpuPtr, size);
cmdType = CL_COMMAND_WRITE_BUFFER;
if (size != 0) {
bool status = csr.createAllocationForHostSurface(srcHostPtrSurf, false);
@@ -400,10 +415,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
if (!status) {
return CL_OUT_OF_RESOURCES;
}
srcPtr = reinterpret_cast<void *>(srcHostPtrSurf.getAllocation()->getGpuAddress());
dstPtr = reinterpret_cast<void *>(dstHostPtrSurf.getAllocation()->getGpuAddress());
srcGpuPtr = reinterpret_cast<void *>(srcHostPtrSurf.getAllocation()->getGpuAddress());
dstGpuPtr = reinterpret_cast<void *>(dstHostPtrSurf.getAllocation()->getGpuAddress());
}
setOperationParams(operationParams, size, srcPtr, srcHostPtrSurf.getAllocation(), dstPtr, dstHostPtrSurf.getAllocation());
setOperationParams(operationParams, size, srcGpuPtr, srcHostPtrSurf.getAllocation(), dstGpuPtr, dstHostPtrSurf.getAllocation());
surfaces[0] = &srcHostPtrSurf;
surfaces[1] = &dstHostPtrSurf;

View File

@@ -88,6 +88,21 @@ cl_int Context::tryGetExistingHostPtrAllocation(const void *ptr,
GraphicsAllocation *&allocation,
InternalMemoryType &memoryType,
bool &isCpuCopyAllowed) {
cl_int retVal = tryGetExistingSvmAllocation(ptr, size, rootDeviceIndex, allocation, memoryType, isCpuCopyAllowed);
if (retVal != CL_SUCCESS || allocation != nullptr) {
return retVal;
}
retVal = tryGetExistingMapAllocation(ptr, size, allocation);
return retVal;
}
cl_int Context::tryGetExistingSvmAllocation(const void *ptr,
size_t size,
uint32_t rootDeviceIndex,
GraphicsAllocation *&allocation,
InternalMemoryType &memoryType,
bool &isCpuCopyAllowed) {
if (getSVMAllocsManager()) {
SvmAllocationData *svmEntry = getSVMAllocsManager()->getSVMAlloc(ptr);
if (svmEntry) {
@@ -101,16 +116,19 @@ cl_int Context::tryGetExistingHostPtrAllocation(const void *ptr,
isCpuCopyAllowed = false;
}
}
return CL_SUCCESS;
}
}
return CL_SUCCESS;
}
cl_int Context::tryGetExistingMapAllocation(const void *ptr,
                                            size_t size,
                                            GraphicsAllocation *&allocation) {
    // Look up a host-ptr map entry covering [ptr, ptr + size); on a hit that
    // carries a valid graphics allocation, report it through the out-parameter.
    // The out-parameter is left untouched on a miss. Always returns CL_SUCCESS.
    MapInfo mapInfo{};
    const bool entryFound = mapOperationsStorage.getInfoForHostPtr(ptr, size, mapInfo);
    if (entryFound && (mapInfo.graphicsAllocation != nullptr)) {
        allocation = mapInfo.graphicsAllocation;
    }
    return CL_SUCCESS;
}

View File

@@ -103,6 +103,15 @@ class Context : public BaseObject<_cl_context> {
GraphicsAllocation *&allocation,
InternalMemoryType &memoryType,
bool &isCpuCopyAllowed);
cl_int tryGetExistingSvmAllocation(const void *ptr,
size_t size,
uint32_t rootDeviceIndex,
GraphicsAllocation *&allocation,
InternalMemoryType &memoryType,
bool &isCpuCopyAllowed);
cl_int tryGetExistingMapAllocation(const void *ptr,
size_t size,
GraphicsAllocation *&allocation);
const std::set<uint32_t> &getRootDeviceIndices() const;