Reuse graphics allocations in readBufferRect/writeBufferRect

Related-To: NEO-6352
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
Maciej Dziuban 2021-11-02 15:17:05 +00:00 committed by Compute-Runtime-Automation
parent 9c1e7422b1
commit a582f34c04
8 changed files with 152 additions and 30 deletions

View File

@ -959,4 +959,15 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
}
}
// Translates `ptr` into a GPU virtual address relative to `allocation`.
//
// ptr        - pointer supplied by the caller; either already a GPU VA (USM) or a CPU VA.
// memoryType - memory type reported for `ptr`; decides whether translation is needed.
// allocation - allocation that backs `ptr`; its CPU base and GPU base are used for rebasing.
//
// Returns `ptr` untouched for device/shared USM, otherwise the GPU address of `allocation`
// advanced by the same byte offset that `ptr` has from the allocation's CPU base.
void *CommandQueue::convertAddressWithOffsetToGpuVa(void *ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation) {
    // Device and shared USM pointers are already GPU virtual addresses - nothing to translate.
    if (memoryType == DEVICE_UNIFIED_MEMORY || memoryType == SHARED_UNIFIED_MEMORY) {
        return ptr;
    }

    // Every other memory type is treated as a CPU virtual address: keep the offset from the
    // start of the allocation and re-apply it on top of the allocation's GPU address.
    const size_t offsetFromAllocationStart = ptrDiff(ptr, allocation.getUnderlyingBuffer());
    return reinterpret_cast<void *>(allocation.getGpuAddress() + offsetFromAllocationStart);
}
} // namespace NEO

View File

@ -321,6 +321,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
return requiresCacheFlushAfterWalker;
}
// Returns ptr unchanged when memoryType is DEVICE_UNIFIED_MEMORY or SHARED_UNIFIED_MEMORY.
// For any other memoryType, ptr is treated as a CPU address inside `allocation` and the result is
// allocation.getGpuAddress() plus ptr's byte offset from allocation.getUnderlyingBuffer().
static void *convertAddressWithOffsetToGpuVa(void *ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation);
void updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint32_t newBcsTaskCount);
uint32_t peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const;

View File

@ -85,11 +85,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
if (mapAllocation) {
surfaces[1] = &mapSurface;
mapSurface.setGraphicsAllocation(mapAllocation);
//get offset between base cpu ptr of map allocation and dst ptr
if ((memoryType != DEVICE_UNIFIED_MEMORY) && (memoryType != SHARED_UNIFIED_MEMORY)) {
size_t dstOffset = ptrDiff(dstPtr, mapAllocation->getUnderlyingBuffer());
dstPtr = reinterpret_cast<void *>(mapAllocation->getGpuAddress() + dstOffset);
}
dstPtr = convertAddressWithOffsetToGpuVa(dstPtr, memoryType, *mapAllocation);
} else {
surfaces[1] = &hostPtrSurf;
if (size != 0) {

View File

@ -51,26 +51,38 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
numEventsInWaitList, eventWaitList, event);
}
const size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
const uint32_t rootDeviceIndex = getDevice().getRootDeviceIndex();
InternalMemoryType memoryType = InternalMemoryType::NOT_SPECIFIED;
GraphicsAllocation *mapAllocation = nullptr;
bool isCpuCopyAllowed = false;
getContext().tryGetExistingHostPtrAllocation(ptr, hostPtrSize, rootDeviceIndex, mapAllocation, memoryType, isCpuCopyAllowed);
auto eBuiltInOps = EBuiltInOps::CopyBufferRect;
if (forceStateless(buffer->getSize())) {
eBuiltInOps = EBuiltInOps::CopyBufferRectStateless;
}
size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
void *dstPtr = ptr;
MemObjSurface bufferSurf(buffer);
MemObjSurface srcBufferSurf(buffer);
HostPtrSurface hostPtrSurf(dstPtr, hostPtrSize);
Surface *surfaces[] = {&bufferSurf, &hostPtrSurf};
GeneralSurface mapSurface;
Surface *surfaces[] = {&srcBufferSurf, nullptr};
if (region[0] != 0 &&
region[1] != 0 &&
region[2] != 0) {
bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
if (!status) {
return CL_OUT_OF_RESOURCES;
if (region[0] != 0 && region[1] != 0 && region[2] != 0) {
if (mapAllocation) {
surfaces[1] = &mapSurface;
mapSurface.setGraphicsAllocation(mapAllocation);
dstPtr = convertAddressWithOffsetToGpuVa(dstPtr, memoryType, *mapAllocation);
} else {
surfaces[1] = &hostPtrSurf;
bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
if (!status) {
return CL_OUT_OF_RESOURCES;
}
dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
}
dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
}
void *alignedDstPtr = alignDown(dstPtr, 4);

View File

@ -77,11 +77,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
if (mapAllocation) {
surfaces[1] = &mapSurface;
mapSurface.setGraphicsAllocation(mapAllocation);
//get offset between base cpu ptr of map allocation and dst ptr
if ((memoryType != DEVICE_UNIFIED_MEMORY) && (memoryType != SHARED_UNIFIED_MEMORY)) {
size_t srcOffset = ptrDiff(srcPtr, mapAllocation->getUnderlyingBuffer());
srcPtr = reinterpret_cast<void *>(mapAllocation->getGpuAddress() + srcOffset);
}
srcPtr = convertAddressWithOffsetToGpuVa(srcPtr, memoryType, *mapAllocation);
} else {
surfaces[1] = &hostPtrSurf;
if (size != 0) {

View File

@ -51,26 +51,38 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
numEventsInWaitList, eventWaitList, event);
}
const size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
const uint32_t rootDeviceIndex = getDevice().getRootDeviceIndex();
InternalMemoryType memoryType = InternalMemoryType::NOT_SPECIFIED;
GraphicsAllocation *mapAllocation = nullptr;
bool isCpuCopyAllowed = false;
getContext().tryGetExistingHostPtrAllocation(ptr, hostPtrSize, rootDeviceIndex, mapAllocation, memoryType, isCpuCopyAllowed);
auto eBuiltInOps = EBuiltInOps::CopyBufferRect;
if (forceStateless(buffer->getSize())) {
eBuiltInOps = EBuiltInOps::CopyBufferRectStateless;
}
size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
void *srcPtr = const_cast<void *>(ptr);
MemObjSurface dstBufferSurf(buffer);
HostPtrSurface hostPtrSurf(srcPtr, hostPtrSize, true);
Surface *surfaces[] = {&dstBufferSurf, &hostPtrSurf};
GeneralSurface mapSurface;
Surface *surfaces[] = {&dstBufferSurf, nullptr};
if (region[0] != 0 &&
region[1] != 0 &&
region[2] != 0) {
bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
if (!status) {
return CL_OUT_OF_RESOURCES;
if (region[0] != 0 && region[1] != 0 && region[2] != 0) {
if (mapAllocation) {
surfaces[1] = &mapSurface;
mapSurface.setGraphicsAllocation(mapAllocation);
srcPtr = convertAddressWithOffsetToGpuVa(srcPtr, memoryType, *mapAllocation);
} else {
surfaces[1] = &hostPtrSurf;
bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
if (!status) {
return CL_OUT_OF_RESOURCES;
}
srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
}
srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
}
void *alignedSrcPtr = alignDown(srcPtr, 4);

View File

@ -6,7 +6,8 @@
*/
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/helpers/constants.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/unit_test/utilities/base_object_utils.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/event/event.h"
@ -698,3 +699,48 @@ HWTEST_F(EnqueueReadBufferRectStatefulTest, WhenReadingBufferRectStatefulThenSuc
EXPECT_EQ(CL_SUCCESS, retVal);
}
// Checks that enqueueReadBufferRect does not create a new host-surface allocation when the
// host pointer comes from a previously mapped buffer, while a plain heap pointer still does.
HWTEST_F(EnqueueReadBufferRectHw, givenHostPtrIsFromMappedBufferWhenReadBufferRectIsCalledThenReuseGraphicsAllocation) {
    DebugManagerStateRestore restore{};
    // NOTE(review): presumably needed so that mapping yields a dedicated map allocation - confirm.
    DebugManager.flags.DisableZeroCopyForBuffers.set(1);

    MockCommandQueueHw<FamilyType> queue(context.get(), device.get(), nullptr);
    auto &csr = device->getUltCommandStreamReceiver<FamilyType>();

    BufferDefaults::context = context.get();
    auto bufferForMap = clUniquePtr(BufferHelper<>::create());
    auto bufferForRead = clUniquePtr(BufferHelper<>::create());

    cl_int retVal{};
    void *mappedPtr = queue.enqueueMapBuffer(bufferForMap.get(), CL_TRUE, CL_MAP_READ, 0, bufferForMap->getSize(), 0, nullptr, nullptr, retVal);
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Fatal check: mappedPtr is used as the lookup key and as the host pointer further down.
    ASSERT_NE(nullptr, mappedPtr);
    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);

    MapOperationsHandler *mapOperationsHandler = context->getMapOperationsStorage().getHandlerIfExists(bufferForMap.get());
    // Fatal check: the handler is dereferenced right below, a null here must stop the test.
    ASSERT_NE(nullptr, mapOperationsHandler);
    MapInfo mapInfo{};
    EXPECT_TRUE(mapOperationsHandler->find(mappedPtr, mapInfo));
    EXPECT_NE(nullptr, mapInfo.graphicsAllocation);

    // First read targets ordinary heap memory: one host-surface allocation is expected.
    auto unmappedPtr = std::make_unique<char[]>(bufferForRead->getSize());
    retVal = queue.enqueueReadBufferRect(bufferForRead.get(), CL_TRUE,
                                         bufferOrigin, hostOrigin,
                                         region,
                                         bufferRowPitch, bufferSlicePitch,
                                         hostRowPitch, hostSlicePitch,
                                         unmappedPtr.get(),
                                         0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);

    // Second read targets the mapped pointer: the counter must not grow.
    retVal = queue.enqueueReadBufferRect(bufferForRead.get(), CL_TRUE,
                                         bufferOrigin, hostOrigin,
                                         region,
                                         bufferRowPitch, bufferSlicePitch,
                                         hostRowPitch, hostSlicePitch,
                                         mappedPtr,
                                         0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
}

View File

@ -6,6 +6,8 @@
*/
#include "shared/source/built_ins/built_ins.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/unit_test/utilities/base_object_utils.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/event/event.h"
@ -697,3 +699,48 @@ HWTEST_F(EnqueueWriteBufferRectStatefulTest, WhenWritingBufferRectStatefulThenSu
EXPECT_EQ(CL_SUCCESS, retVal);
}
// Checks that enqueueWriteBufferRect does not create a new host-surface allocation when the
// host pointer comes from a previously mapped buffer, while a plain heap pointer still does.
HWTEST_F(EnqueueWriteBufferRectHw, givenHostPtrIsFromMappedBufferWhenWriteBufferRectIsCalledThenReuseGraphicsAllocation) {
    DebugManagerStateRestore restore{};
    // NOTE(review): presumably needed so that mapping yields a dedicated map allocation - confirm.
    DebugManager.flags.DisableZeroCopyForBuffers.set(1);

    MockCommandQueueHw<FamilyType> queue(context.get(), device.get(), nullptr);
    auto &csr = device->getUltCommandStreamReceiver<FamilyType>();

    BufferDefaults::context = context.get();
    auto bufferForMap = clUniquePtr(BufferHelper<>::create());
    auto bufferForWrite = clUniquePtr(BufferHelper<>::create());

    cl_int retVal{};
    void *mappedPtr = queue.enqueueMapBuffer(bufferForMap.get(), CL_TRUE, CL_MAP_READ, 0, bufferForMap->getSize(), 0, nullptr, nullptr, retVal);
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Fatal check: mappedPtr is used as the lookup key and as the host pointer further down.
    ASSERT_NE(nullptr, mappedPtr);
    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);

    MapOperationsHandler *mapOperationsHandler = context->getMapOperationsStorage().getHandlerIfExists(bufferForMap.get());
    // Fatal check: the handler is dereferenced right below, a null here must stop the test.
    ASSERT_NE(nullptr, mapOperationsHandler);
    MapInfo mapInfo{};
    EXPECT_TRUE(mapOperationsHandler->find(mappedPtr, mapInfo));
    EXPECT_NE(nullptr, mapInfo.graphicsAllocation);

    // First write sources ordinary heap memory: one host-surface allocation is expected.
    auto unmappedPtr = std::make_unique<char[]>(bufferForWrite->getSize());
    retVal = queue.enqueueWriteBufferRect(bufferForWrite.get(), CL_TRUE,
                                          bufferOrigin, hostOrigin,
                                          region,
                                          bufferRowPitch, bufferSlicePitch,
                                          hostRowPitch, hostSlicePitch,
                                          unmappedPtr.get(),
                                          0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);

    // Second write sources the mapped pointer: the counter must not grow.
    retVal = queue.enqueueWriteBufferRect(bufferForWrite.get(), CL_TRUE,
                                          bufferOrigin, hostOrigin,
                                          region,
                                          bufferRowPitch, bufferSlicePitch,
                                          hostRowPitch, hostSlicePitch,
                                          mappedPtr,
                                          0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
}