diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp
index 1fc8736135..5d074147bd 100644
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -959,4 +959,15 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
     }
 }
 
+void *CommandQueue::convertAddressWithOffsetToGpuVa(void *ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation) {
+    // If this is device or shared USM pointer, it is already a gpuVA and we don't have to do anything.
+    // Otherwise, we assume this is a cpuVA and we have to convert to gpuVA, while preserving offset from allocation start.
+    const bool isCpuPtr = (memoryType != DEVICE_UNIFIED_MEMORY) && (memoryType != SHARED_UNIFIED_MEMORY);
+    if (isCpuPtr) {
+        size_t offset = ptrDiff(ptr, allocation.getUnderlyingBuffer());
+        ptr = reinterpret_cast<void *>(allocation.getGpuAddress() + offset);
+    }
+    return ptr;
+}
+
 } // namespace NEO
diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h
index f2fdab1995..650afbb7ab 100644
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -321,6 +321,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
         return requiresCacheFlushAfterWalker;
     }
 
+    static void *convertAddressWithOffsetToGpuVa(void *ptr, InternalMemoryType memoryType, GraphicsAllocation &allocation);
+
     void updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint32_t newBcsTaskCount);
     uint32_t peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const;
 
diff --git a/opencl/source/command_queue/enqueue_read_buffer.h b/opencl/source/command_queue/enqueue_read_buffer.h
index f9a9088648..2b21dfbdce 100644
--- a/opencl/source/command_queue/enqueue_read_buffer.h
+++ b/opencl/source/command_queue/enqueue_read_buffer.h
@@ -85,11 +85,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
     if (mapAllocation) {
         surfaces[1] = &mapSurface;
         mapSurface.setGraphicsAllocation(mapAllocation);
-        //get offset between base cpu ptr of map allocation and dst ptr
-        if ((memoryType != DEVICE_UNIFIED_MEMORY) && (memoryType != SHARED_UNIFIED_MEMORY)) {
-            size_t dstOffset = ptrDiff(dstPtr, mapAllocation->getUnderlyingBuffer());
-            dstPtr = reinterpret_cast<void *>(mapAllocation->getGpuAddress() + dstOffset);
-        }
+        dstPtr = convertAddressWithOffsetToGpuVa(dstPtr, memoryType, *mapAllocation);
     } else {
         surfaces[1] = &hostPtrSurf;
         if (size != 0) {
diff --git a/opencl/source/command_queue/enqueue_read_buffer_rect.h b/opencl/source/command_queue/enqueue_read_buffer_rect.h
index c591155301..dd9b292268 100644
--- a/opencl/source/command_queue/enqueue_read_buffer_rect.h
+++ b/opencl/source/command_queue/enqueue_read_buffer_rect.h
@@ -51,26 +51,38 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
                                                   numEventsInWaitList, eventWaitList, event);
     }
 
+    const size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
+    const uint32_t rootDeviceIndex = getDevice().getRootDeviceIndex();
+    InternalMemoryType memoryType = InternalMemoryType::NOT_SPECIFIED;
+    GraphicsAllocation *mapAllocation = nullptr;
+    bool isCpuCopyAllowed = false;
+    getContext().tryGetExistingHostPtrAllocation(ptr, hostPtrSize, rootDeviceIndex, mapAllocation, memoryType, isCpuCopyAllowed);
+
     auto eBuiltInOps = EBuiltInOps::CopyBufferRect;
     if (forceStateless(buffer->getSize())) {
         eBuiltInOps = EBuiltInOps::CopyBufferRectStateless;
     }
 
-    size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
     void *dstPtr = ptr;
 
-    MemObjSurface bufferSurf(buffer);
+    MemObjSurface srcBufferSurf(buffer);
     HostPtrSurface hostPtrSurf(dstPtr, hostPtrSize);
-    Surface *surfaces[] = {&bufferSurf, &hostPtrSurf};
+    GeneralSurface mapSurface;
+    Surface *surfaces[] = {&srcBufferSurf, nullptr};
 
-    if (region[0] != 0 &&
-        region[1] != 0 &&
-        region[2] != 0) {
-        bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
-        if (!status) {
-            return CL_OUT_OF_RESOURCES;
+    if (region[0] != 0 && region[1] != 0 && region[2] != 0) {
+        if (mapAllocation) {
+            surfaces[1] = &mapSurface;
+            mapSurface.setGraphicsAllocation(mapAllocation);
+            dstPtr = convertAddressWithOffsetToGpuVa(dstPtr, memoryType, *mapAllocation);
+        } else {
+            surfaces[1] = &hostPtrSurf;
+            bool status = csr.createAllocationForHostSurface(hostPtrSurf, true);
+            if (!status) {
+                return CL_OUT_OF_RESOURCES;
+            }
+            dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
         }
-        dstPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
     }
 
     void *alignedDstPtr = alignDown(dstPtr, 4);
diff --git a/opencl/source/command_queue/enqueue_write_buffer.h b/opencl/source/command_queue/enqueue_write_buffer.h
index a4cb3da524..8aeeef1c07 100644
--- a/opencl/source/command_queue/enqueue_write_buffer.h
+++ b/opencl/source/command_queue/enqueue_write_buffer.h
@@ -77,11 +77,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
     if (mapAllocation) {
         surfaces[1] = &mapSurface;
         mapSurface.setGraphicsAllocation(mapAllocation);
-        //get offset between base cpu ptr of map allocation and dst ptr
-        if ((memoryType != DEVICE_UNIFIED_MEMORY) && (memoryType != SHARED_UNIFIED_MEMORY)) {
-            size_t srcOffset = ptrDiff(srcPtr, mapAllocation->getUnderlyingBuffer());
-            srcPtr = reinterpret_cast<void *>(mapAllocation->getGpuAddress() + srcOffset);
-        }
+        srcPtr = convertAddressWithOffsetToGpuVa(srcPtr, memoryType, *mapAllocation);
     } else {
         surfaces[1] = &hostPtrSurf;
         if (size != 0) {
diff --git a/opencl/source/command_queue/enqueue_write_buffer_rect.h b/opencl/source/command_queue/enqueue_write_buffer_rect.h
index 5a28418b2f..5c4b31b1f6 100644
--- a/opencl/source/command_queue/enqueue_write_buffer_rect.h
+++ b/opencl/source/command_queue/enqueue_write_buffer_rect.h
@@ -51,26 +51,38 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
                                                   numEventsInWaitList, eventWaitList, event);
     }
 
+    const size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
+    const uint32_t rootDeviceIndex = getDevice().getRootDeviceIndex();
+    InternalMemoryType memoryType = InternalMemoryType::NOT_SPECIFIED;
+    GraphicsAllocation *mapAllocation = nullptr;
+    bool isCpuCopyAllowed = false;
+    getContext().tryGetExistingHostPtrAllocation(ptr, hostPtrSize, rootDeviceIndex, mapAllocation, memoryType, isCpuCopyAllowed);
+
     auto eBuiltInOps = EBuiltInOps::CopyBufferRect;
     if (forceStateless(buffer->getSize())) {
         eBuiltInOps = EBuiltInOps::CopyBufferRectStateless;
     }
 
-    size_t hostPtrSize = Buffer::calculateHostPtrSize(hostOrigin, region, hostRowPitch, hostSlicePitch);
     void *srcPtr = const_cast<void *>(ptr);
 
     MemObjSurface dstBufferSurf(buffer);
     HostPtrSurface hostPtrSurf(srcPtr, hostPtrSize, true);
-    Surface *surfaces[] = {&dstBufferSurf, &hostPtrSurf};
+    GeneralSurface mapSurface;
+    Surface *surfaces[] = {&dstBufferSurf, nullptr};
 
-    if (region[0] != 0 &&
-        region[1] != 0 &&
-        region[2] != 0) {
-        bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
-        if (!status) {
-            return CL_OUT_OF_RESOURCES;
+    if (region[0] != 0 && region[1] != 0 && region[2] != 0) {
+        if (mapAllocation) {
+            surfaces[1] = &mapSurface;
+            mapSurface.setGraphicsAllocation(mapAllocation);
+            srcPtr = convertAddressWithOffsetToGpuVa(srcPtr, memoryType, *mapAllocation);
+        } else {
+            surfaces[1] = &hostPtrSurf;
+            bool status = csr.createAllocationForHostSurface(hostPtrSurf, false);
+            if (!status) {
+                return CL_OUT_OF_RESOURCES;
+            }
+            srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
         }
-        srcPtr = reinterpret_cast<void *>(hostPtrSurf.getAllocation()->getGpuAddress());
     }
 
     void *alignedSrcPtr = alignDown(srcPtr, 4);
diff --git a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
index 0e7e8e08ca..b39b1e33c8 100644
--- a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
@@ -6,7 +6,8 @@
  */
 
 #include "shared/source/built_ins/built_ins.h"
-#include "shared/source/helpers/constants.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/unit_test/utilities/base_object_utils.h"
 
 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
 #include "opencl/source/event/event.h"
@@ -698,3 +699,48 @@ HWTEST_F(EnqueueReadBufferRectStatefulTest, WhenReadingBufferRectStatefulThenSuc
 
     EXPECT_EQ(CL_SUCCESS, retVal);
 }
+
+HWTEST_F(EnqueueReadBufferRectHw, givenHostPtrIsFromMappedBufferWhenReadBufferRectIsCalledThenReuseGraphicsAllocation) {
+    DebugManagerStateRestore restore{};
+    DebugManager.flags.DisableZeroCopyForBuffers.set(1);
+
+    MockCommandQueueHw<FamilyType> queue(context.get(), device.get(), nullptr);
+    auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
+
+    BufferDefaults::context = context.get();
+    auto bufferForMap = clUniquePtr(BufferHelper<>::create());
+    auto bufferForRead = clUniquePtr(BufferHelper<>::create());
+
+    cl_int retVal{};
+    void *mappedPtr = queue.enqueueMapBuffer(bufferForMap.get(), CL_TRUE, CL_MAP_READ, 0, bufferForMap->getSize(), 0, nullptr, nullptr, retVal);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_NE(nullptr, mappedPtr);
+    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
+
+    MapOperationsHandler *mapOperationsHandler = context->getMapOperationsStorage().getHandlerIfExists(bufferForMap.get());
+    ASSERT_NE(nullptr, mapOperationsHandler);
+    MapInfo mapInfo{};
+    EXPECT_TRUE(mapOperationsHandler->find(mappedPtr, mapInfo));
+    EXPECT_NE(nullptr, mapInfo.graphicsAllocation);
+
+    auto unmappedPtr = std::make_unique<char[]>(bufferForRead->getSize());
+    retVal = queue.enqueueReadBufferRect(bufferForRead.get(), CL_TRUE,
+                                         bufferOrigin, hostOrigin,
+                                         region,
+                                         bufferRowPitch, bufferSlicePitch,
+                                         hostRowPitch, hostSlicePitch,
+                                         unmappedPtr.get(),
+                                         0, nullptr, nullptr);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
+
+    retVal = queue.enqueueReadBufferRect(bufferForRead.get(), CL_TRUE,
+                                         bufferOrigin, hostOrigin,
+                                         region,
+                                         bufferRowPitch, bufferSlicePitch,
+                                         hostRowPitch, hostSlicePitch,
+                                         mappedPtr,
+                                         0, nullptr, nullptr);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
+}
diff --git a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
index f60f4c1e15..c8fd61dbe4 100644
--- a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
@@ -6,6 +6,8 @@
  */
 
 #include "shared/source/built_ins/built_ins.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/unit_test/utilities/base_object_utils.h"
 
 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
 #include "opencl/source/event/event.h"
@@ -697,3 +699,48 @@ HWTEST_F(EnqueueWriteBufferRectStatefulTest, WhenWritingBufferRectStatefulThenSu
 
     EXPECT_EQ(CL_SUCCESS, retVal);
 }
+
+HWTEST_F(EnqueueWriteBufferRectHw, givenHostPtrIsFromMappedBufferWhenWriteBufferRectIsCalledThenReuseGraphicsAllocation) {
+    DebugManagerStateRestore restore{};
+    DebugManager.flags.DisableZeroCopyForBuffers.set(1);
+
+    MockCommandQueueHw<FamilyType> queue(context.get(), device.get(), nullptr);
+    auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
+
+    BufferDefaults::context = context.get();
+    auto bufferForMap = clUniquePtr(BufferHelper<>::create());
+    auto bufferForWrite = clUniquePtr(BufferHelper<>::create());
+
+    cl_int retVal{};
+    void *mappedPtr = queue.enqueueMapBuffer(bufferForMap.get(), CL_TRUE, CL_MAP_READ, 0, bufferForMap->getSize(), 0, nullptr, nullptr, retVal);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_NE(nullptr, mappedPtr);
+    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
+
+    MapOperationsHandler *mapOperationsHandler = context->getMapOperationsStorage().getHandlerIfExists(bufferForMap.get());
+    ASSERT_NE(nullptr, mapOperationsHandler);
+    MapInfo mapInfo{};
+    EXPECT_TRUE(mapOperationsHandler->find(mappedPtr, mapInfo));
+    EXPECT_NE(nullptr, mapInfo.graphicsAllocation);
+
+    auto unmappedPtr = std::make_unique<char[]>(bufferForWrite->getSize());
+    retVal = queue.enqueueWriteBufferRect(bufferForWrite.get(), CL_TRUE,
+                                          bufferOrigin, hostOrigin,
+                                          region,
+                                          bufferRowPitch, bufferSlicePitch,
+                                          hostRowPitch, hostSlicePitch,
+                                          unmappedPtr.get(),
+                                          0, nullptr, nullptr);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
+
+    retVal = queue.enqueueWriteBufferRect(bufferForWrite.get(), CL_TRUE,
+                                          bufferOrigin, hostOrigin,
+                                          region,
+                                          bufferRowPitch, bufferSlicePitch,
+                                          hostRowPitch, hostSlicePitch,
+                                          mappedPtr,
+                                          0, nullptr, nullptr);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
+}