diff --git a/runtime/built_ins/built_ins.cpp b/runtime/built_ins/built_ins.cpp index 3667d01f39..12a8bf093c 100644 --- a/runtime/built_ins/built_ins.cpp +++ b/runtime/built_ins/built_ins.cpp @@ -243,14 +243,14 @@ class BuiltInOp : public BuiltinDispa } else if (operationParams.srcMemObj) { kernelSplit1DBuilder.setArg(0, operationParams.srcMemObj); } else { - kernelSplit1DBuilder.setArgSvm(0, operationParams.size.x, operationParams.srcPtr, nullptr, CL_MEM_READ_ONLY); + kernelSplit1DBuilder.setArgSvm(0, operationParams.size.x + operationParams.srcOffset.x, operationParams.srcPtr, nullptr, CL_MEM_READ_ONLY); } if (operationParams.dstSvmAlloc) { kernelSplit1DBuilder.setArgSvmAlloc(1, operationParams.dstPtr, operationParams.dstSvmAlloc); } else if (operationParams.dstMemObj) { kernelSplit1DBuilder.setArg(1, operationParams.dstMemObj); } else { - kernelSplit1DBuilder.setArgSvm(1, operationParams.size.x, operationParams.dstPtr); + kernelSplit1DBuilder.setArgSvm(1, operationParams.size.x + operationParams.dstOffset.x, operationParams.dstPtr); } // Set-up srcOffset diff --git a/runtime/command_queue/enqueue_read_buffer.h b/runtime/command_queue/enqueue_read_buffer.h index 8a7df10d6c..835d5a3612 100644 --- a/runtime/command_queue/enqueue_read_buffer.h +++ b/runtime/command_queue/enqueue_read_buffer.h @@ -88,9 +88,16 @@ cl_int CommandQueueHw::enqueueReadBuffer( BuiltInOwnershipWrapper builtInLock(builder, this->context); void *dstPtr = ptr; + void *alignedDstPtr = dstPtr; + size_t dstPtrOffset = 0; + + if (!isAligned<4>(dstPtr)) { + alignedDstPtr = alignDown(dstPtr, 4); + dstPtrOffset = ptrDiff(dstPtr, alignedDstPtr); + } MemObjSurface bufferSurf(buffer); - HostPtrSurface hostPtrSurf(dstPtr, size); + HostPtrSurface hostPtrSurf(alignedDstPtr, size + dstPtrOffset); Surface *surfaces[] = {&bufferSurf, &hostPtrSurf}; if (size != 0) { @@ -98,11 +105,13 @@ cl_int CommandQueueHw::enqueueReadBuffer( if (!status) { return CL_OUT_OF_RESOURCES; } - dstPtr = reinterpret_cast(hostPtrSurf.getAllocation()->getGpuAddressToPatch()); + + hostPtrSurf.getAllocation()->allocationOffset = dstPtrOffset; } BuiltinDispatchInfoBuilder::BuiltinOpParams dc; - dc.dstPtr = dstPtr; + dc.dstPtr = alignedDstPtr; + dc.dstOffset = {dstPtrOffset, 0, 0}; dc.srcMemObj = buffer; dc.srcOffset = {offset, 0, 0}; dc.size = {size, 0, 0}; diff --git a/runtime/command_queue/enqueue_write_buffer.h b/runtime/command_queue/enqueue_write_buffer.h index 3123304933..a93779517f 100644 --- a/runtime/command_queue/enqueue_write_buffer.h +++ b/runtime/command_queue/enqueue_write_buffer.h @@ -89,8 +89,15 @@ cl_int CommandQueueHw::enqueueWriteBuffer( BuiltInOwnershipWrapper builtInLock(builder, this->context); void *srcPtr = const_cast(ptr); + void *alignedSrcPtr = srcPtr; + size_t srcPtrOffset = 0; - HostPtrSurface hostPtrSurf(srcPtr, size, true); + if (!isAligned<4>(srcPtr)) { + alignedSrcPtr = alignDown(srcPtr, 4); + srcPtrOffset = ptrDiff(srcPtr, alignedSrcPtr); + } + + HostPtrSurface hostPtrSurf(alignedSrcPtr, size + srcPtrOffset, true); MemObjSurface bufferSurf(buffer); Surface *surfaces[] = {&bufferSurf, &hostPtrSurf}; @@ -99,11 +106,13 @@ cl_int CommandQueueHw::enqueueWriteBuffer( if (!status) { return CL_OUT_OF_RESOURCES; } - srcPtr = reinterpret_cast(hostPtrSurf.getAllocation()->getGpuAddressToPatch()); + + hostPtrSurf.getAllocation()->allocationOffset = srcPtrOffset; } BuiltinDispatchInfoBuilder::BuiltinOpParams dc; - dc.srcPtr = srcPtr; + dc.srcPtr = alignedSrcPtr; + dc.srcOffset = {srcPtrOffset, 0, 0}; dc.dstMemObj = buffer; dc.dstOffset = {offset, 0, 0}; dc.size = {size, 0, 0}; diff --git a/runtime/command_stream/aub_command_stream_receiver_hw.inl b/runtime/command_stream/aub_command_stream_receiver_hw.inl index 4e0d9a4e62..8452802d69 100644 --- a/runtime/command_stream/aub_command_stream_receiver_hw.inl +++ b/runtime/command_stream/aub_command_stream_receiver_hw.inl @@ -623,7 +623,7 @@ void AUBCommandStreamReceiverHw::writeMemory(uint64_t gpuAddress, voi template bool AUBCommandStreamReceiverHw::writeMemory(GraphicsAllocation &gfxAllocation) { - auto cpuAddress = gfxAllocation.getUnderlyingBuffer(); + auto cpuAddress = ptrOffset(gfxAllocation.getUnderlyingBuffer(), static_cast(gfxAllocation.allocationOffset)); auto gpuAddress = GmmHelper::decanonize(gfxAllocation.getGpuAddress()); auto size = gfxAllocation.getUnderlyingBufferSize(); if (gfxAllocation.gmm && gfxAllocation.gmm->isRenderCompressed) { diff --git a/unit_tests/aub_tests/command_queue/enqueue_read_buffer_aub_tests.cpp b/unit_tests/aub_tests/command_queue/enqueue_read_buffer_aub_tests.cpp index 3c6c604f22..47b3e65314 100644 --- a/unit_tests/aub_tests/command_queue/enqueue_read_buffer_aub_tests.cpp +++ b/unit_tests/aub_tests/command_queue/enqueue_read_buffer_aub_tests.cpp @@ -143,3 +143,66 @@ HWTEST_F(AUBReadBuffer, reserveCanonicalGpuAddress) { AUBCommandStreamFixture::expectMemory(dstGpuAddress, srcMemory, sizeof(dstMemory)); } + +struct AUBReadBufferUnaligned + : public CommandEnqueueAUBFixture, + public ::testing::Test { + + void SetUp() override { + CommandEnqueueAUBFixture::SetUp(); + } + + void TearDown() override { + CommandEnqueueAUBFixture::TearDown(); + } + + template + void testReadBufferUnaligned(size_t offset, size_t size) { + MockContext context(&pCmdQ->getDevice()); + + char srcMemory[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const auto bufferSize = sizeof(srcMemory); + char dstMemory[bufferSize] = {0}; + + auto retVal = CL_INVALID_VALUE; + + auto buffer = std::unique_ptr(Buffer::create( + &context, + CL_MEM_USE_HOST_PTR, + bufferSize, + srcMemory, + retVal)); + ASSERT_NE(nullptr, buffer); + + buffer->forceDisallowCPUCopy = true; + + // Map destination memory to GPU + GraphicsAllocation *allocation = createResidentAllocationAndStoreItInCsr(dstMemory, bufferSize); + auto dstMemoryGPUPtr = reinterpret_cast(allocation->getGpuAddress()); + + // Do unaligned read + retVal = pCmdQ->enqueueReadBuffer( + buffer.get(), + CL_TRUE, + offset, + size, + ptrOffset(dstMemory, offset), + 0, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Check the memory + AUBCommandStreamFixture::expectMemory(ptrOffset(dstMemoryGPUPtr, offset), ptrOffset(srcMemory, offset), size); + } +}; + +HWTEST_F(AUBReadBufferUnaligned, all) { + const std::vector offsets = {0, 1, 2, 3}; + const std::vector sizes = {4, 3, 2, 1}; + for (auto offset : offsets) { + for (auto size : sizes) { + testReadBufferUnaligned(offset, size); + } + } +} diff --git a/unit_tests/aub_tests/command_queue/enqueue_write_buffer_aub_tests.cpp b/unit_tests/aub_tests/command_queue/enqueue_write_buffer_aub_tests.cpp index b7ef1ad953..5b06ecf432 100644 --- a/unit_tests/aub_tests/command_queue/enqueue_write_buffer_aub_tests.cpp +++ b/unit_tests/aub_tests/command_queue/enqueue_write_buffer_aub_tests.cpp @@ -108,3 +108,63 @@ INSTANTIATE_TEST_CASE_P(AUBWriteBuffer_simple, 1 * sizeof(cl_float), 2 * sizeof(cl_float), 3 * sizeof(cl_float))); + +struct AUBWriteBufferUnaligned + : public CommandEnqueueAUBFixture, + public ::testing::Test { + + void SetUp() override { + CommandEnqueueAUBFixture::SetUp(); + } + + void TearDown() override { + CommandEnqueueAUBFixture::TearDown(); + } + + template + void testWriteBufferUnaligned(size_t offset, size_t size) { + MockContext context(&pCmdQ->getDevice()); + + char srcMemory[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const auto bufferSize = sizeof(srcMemory); + char dstMemory[bufferSize] = {0}; + + auto retVal = CL_INVALID_VALUE; + + auto buffer = std::unique_ptr(Buffer::create( + &context, + CL_MEM_USE_HOST_PTR, + bufferSize, + dstMemory, + retVal)); + ASSERT_NE(nullptr, buffer); + + buffer->forceDisallowCPUCopy = true; + + // Do unaligned write + retVal = pCmdQ->enqueueWriteBuffer( + buffer.get(), + CL_TRUE, + offset, + size, + ptrOffset(srcMemory, offset), + 0, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + // Check the memory + auto bufferGPUPtr = reinterpret_cast((buffer->getGraphicsAllocation()->getGpuAddress())); + AUBCommandStreamFixture::expectMemory(ptrOffset(bufferGPUPtr, offset), ptrOffset(srcMemory, offset), size); + } +}; + +HWTEST_F(AUBWriteBufferUnaligned, all) { + const std::vector offsets = {0, 1, 2, 3}; + const std::vector sizes = {4, 3, 2, 1}; + for (auto offset : offsets) { + for (auto size : sizes) { + testWriteBufferUnaligned(offset, size); + } + } +}