From 53a99f677290879380b3f1322aa03efdee2c62e8 Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Wed, 14 Oct 2020 13:43:38 +0000 Subject: [PATCH] Fixed 2D/3D copy to use aligned pointers and offsets in kernel programming - Fixed the 2d/3d copy to correctly pass the aligned allocation data to the kernel programming of the buffer surface states; otherwise, unaligned host pointers cannot be used. Change-Id: If62e9321d572a786999a5ead8303f11e2e9a3c8d Signed-off-by: Spruit, Neil R --- level_zero/core/source/cmdlist/cmdlist_hw.h | 6 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 34 +++--- .../sources/cmdlist/test_cmdlist_2.cpp | 113 ++++++++++++++++-- .../sources/cmdlist/test_cmdlist_blit.cpp | 2 +- 4 files changed, 125 insertions(+), 30 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 9806c2ced5..ce77893ae5 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -152,20 +152,22 @@ struct CommandListCoreFamily : CommandListImp { MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAlloc, NEO::GraphicsAllocation *dstAlloc, + size_t srcOffset, + size_t dstOffset, ze_copy_region_t srcRegion, ze_copy_region_t dstRegion, Vec3 copySize, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, Vec3 srcSize, Vec3 dstSize, ze_event_handle_t hSignalEvent); - MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernel2d(NEO::GraphicsAllocation *dstAlloc, NEO::GraphicsAllocation *srcAlloc, + MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation, Builtin builtin, const ze_copy_region_t *dstRegion, uint32_t dstPitch, size_t dstOffset, const ze_copy_region_t *srcRegion, uint32_t srcPitch, size_t srcOffset, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); - MOCKABLE_VIRTUAL ze_result_t 
appendMemoryCopyKernel3d(NEO::GraphicsAllocation *dstAlloc, NEO::GraphicsAllocation *srcAlloc, + MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernel3d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation, Builtin builtin, const ze_copy_region_t *dstRegion, uint32_t dstPitch, uint32_t dstSlicePitch, size_t dstOffset, const ze_copy_region_t *srcRegion, uint32_t srcPitch, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 0a3a9f3602..d5a5335502 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -682,6 +682,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlit(uintptr_t template ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAlloc, NEO::GraphicsAllocation *dstAlloc, + size_t srcOffset, + size_t dstOffset, ze_copy_region_t srcRegion, ze_copy_region_t dstRegion, Vec3 copySize, size_t srcRowPitch, size_t srcSlicePitch, @@ -689,6 +691,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(NEO Vec3 srcSize, Vec3 dstSize, ze_event_handle_t hSignalEvent) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + dstRegion.originX += static_cast(dstOffset); + srcRegion.originX += static_cast(srcOffset); uint32_t bytesPerPixel = NEO::BlitCommandsHelper::getAvailableBytesPerPixel(copySize.x, srcRegion.originX, dstRegion.originX, srcSize.x, dstSize.x); bool copyOneCommand = NEO::BlitCommandsHelper::useOneBlitCopyCommand(copySize, bytesPerPixel); Vec3 srcPtrOffset = {(copyOneCommand ? (srcRegion.originX / bytesPerPixel) : srcRegion.originX), srcRegion.originY, srcRegion.originZ}; @@ -907,15 +911,15 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d ze_result_t result = ZE_RESULT_SUCCESS; if (srcRegion->depth > 1) { - result = isCopyOnly() ? 
appendMemoryCopyBlitRegion(srcAllocationStruct.alloc, dstAllocationStruct.alloc, *srcRegion, *dstRegion, {srcRegion->width, srcRegion->height, srcRegion->depth}, + result = isCopyOnly() ? appendMemoryCopyBlitRegion(srcAllocationStruct.alloc, dstAllocationStruct.alloc, srcAllocationStruct.offset, dstAllocationStruct.offset, *srcRegion, *dstRegion, {srcRegion->width, srcRegion->height, srcRegion->depth}, srcPitch, srcSlicePitch, dstPitch, dstSlicePitch, srcSize3, dstSize3, hSignalEvent) - : this->appendMemoryCopyKernel3d(dstAllocationStruct.alloc, srcAllocationStruct.alloc, + : this->appendMemoryCopyKernel3d(&dstAllocationStruct, &srcAllocationStruct, Builtin::CopyBufferRectBytes3d, dstRegion, dstPitch, dstSlicePitch, dstAllocationStruct.offset, srcRegion, srcPitch, srcSlicePitch, srcAllocationStruct.offset, hSignalEvent, 0, nullptr); } else { - result = isCopyOnly() ? appendMemoryCopyBlitRegion(srcAllocationStruct.alloc, dstAllocationStruct.alloc, *srcRegion, *dstRegion, {srcRegion->width, srcRegion->height, srcRegion->depth}, + result = isCopyOnly() ? 
appendMemoryCopyBlitRegion(srcAllocationStruct.alloc, dstAllocationStruct.alloc, srcAllocationStruct.offset, dstAllocationStruct.offset, *srcRegion, *dstRegion, {srcRegion->width, srcRegion->height, srcRegion->depth}, srcPitch, srcSlicePitch, dstPitch, dstSlicePitch, srcSize3, dstSize3, hSignalEvent) - : this->appendMemoryCopyKernel2d(dstAllocationStruct.alloc, srcAllocationStruct.alloc, + : this->appendMemoryCopyKernel2d(&dstAllocationStruct, &srcAllocationStruct, Builtin::CopyBufferRectBytes2d, dstRegion, dstPitch, dstAllocationStruct.offset, srcRegion, srcPitch, srcAllocationStruct.offset, hSignalEvent, 0, nullptr); } @@ -933,8 +937,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d } template -ze_result_t CommandListCoreFamily::appendMemoryCopyKernel3d(NEO::GraphicsAllocation *dstGA, - NEO::GraphicsAllocation *srcGA, +ze_result_t CommandListCoreFamily::appendMemoryCopyKernel3d(AlignedAllocationData *dstAlignedAllocation, + AlignedAllocationData *srcAlignedAllocation, Builtin builtin, const ze_copy_region_t *dstRegion, uint32_t dstPitch, @@ -980,11 +984,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyKernel3d(NEO:: uint32_t srcPitches[2] = {(srcPitch), (srcSlicePitch)}; uint32_t dstPitches[2] = {(dstPitch), (dstSlicePitch)}; - auto dstValPtr = static_cast(dstGA->getGpuAddress()); - auto srcValPtr = static_cast(srcGA->getGpuAddress()); - - builtinFunction->setArgBufferWithAlloc(0, srcValPtr, srcGA); - builtinFunction->setArgBufferWithAlloc(1, dstValPtr, dstGA); + builtinFunction->setArgBufferWithAlloc(0, srcAlignedAllocation->alignedAllocationPtr, srcAlignedAllocation->alloc); + builtinFunction->setArgBufferWithAlloc(1, dstAlignedAllocation->alignedAllocationPtr, dstAlignedAllocation->alloc); builtinFunction->setArgumentValue(2, sizeof(srcOrigin), &srcOrigin); builtinFunction->setArgumentValue(3, sizeof(dstOrigin), &dstOrigin); builtinFunction->setArgumentValue(4, sizeof(srcPitches), &srcPitches); @@ -995,8 +996,8 @@ ze_result_t 
CommandListCoreFamily::appendMemoryCopyKernel3d(NEO:: } template -ze_result_t CommandListCoreFamily::appendMemoryCopyKernel2d(NEO::GraphicsAllocation *dstGA, - NEO::GraphicsAllocation *srcGA, +ze_result_t CommandListCoreFamily::appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, + AlignedAllocationData *srcAlignedAllocation, Builtin builtin, const ze_copy_region_t *dstRegion, uint32_t dstPitch, @@ -1037,11 +1038,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyKernel2d(NEO:: uint32_t srcOrigin[2] = {(srcRegion->originX + static_cast(srcOffset)), (srcRegion->originY)}; uint32_t dstOrigin[2] = {(dstRegion->originX + static_cast(dstOffset)), (dstRegion->originY)}; - auto dstValPtr = static_cast(dstGA->getGpuAddress()); - auto srcValPtr = static_cast(srcGA->getGpuAddress()); - - builtinFunction->setArgBufferWithAlloc(0, srcValPtr, srcGA); - builtinFunction->setArgBufferWithAlloc(1, dstValPtr, dstGA); + builtinFunction->setArgBufferWithAlloc(0, srcAlignedAllocation->alignedAllocationPtr, srcAlignedAllocation->alloc); + builtinFunction->setArgBufferWithAlloc(1, dstAlignedAllocation->alignedAllocationPtr, dstAlignedAllocation->alloc); builtinFunction->setArgumentValue(2, sizeof(srcOrigin), &srcOrigin); builtinFunction->setArgumentValue(3, sizeof(dstOrigin), &dstOrigin); builtinFunction->setArgumentValue(4, sizeof(srcPitch), &srcPitch); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 4cbc2f0f2c..3b9f41d034 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/command_encoder.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/register_offsets.h" @@ -56,8 +57,10 @@ class MockCommandListHw : public 
WhiteBox<::L0::CommandListCoreFamily copySize, size_t srcRowPitch, size_t srcSlicePitch, @@ -67,7 +70,7 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily { AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { return L0::CommandListCoreFamily::getAlignedAllocation(device, buffer, bufferSize); } + ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation, + Builtin builtin, const ze_copy_region_t *dstRegion, + uint32_t dstPitch, size_t dstOffset, + const ze_copy_region_t *srcRegion, uint32_t srcPitch, + size_t srcOffset, ze_event_handle_t hSignalEvent, + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override { + srcAlignedPtr = srcAlignedAllocation->alignedAllocationPtr; + dstAlignedPtr = dstAlignedAllocation->alignedAllocationPtr; + return L0::CommandListCoreFamily::appendMemoryCopyKernel2d(dstAlignedAllocation, srcAlignedAllocation, builtin, dstRegion, dstPitch, dstOffset, srcRegion, srcPitch, srcOffset, hSignalEvent, numWaitEvents, phWaitEvents); + } + + ze_result_t appendMemoryCopyKernel3d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation, + Builtin builtin, const ze_copy_region_t *dstRegion, + uint32_t dstPitch, uint32_t dstSlicePitch, size_t dstOffset, + const ze_copy_region_t *srcRegion, uint32_t srcPitch, + uint32_t srcSlicePitch, size_t srcOffset, + ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, + ze_event_handle_t *phWaitEvents) override { + srcAlignedPtr = srcAlignedAllocation->alignedAllocationPtr; + dstAlignedPtr = dstAlignedAllocation->alignedAllocationPtr; + return L0::CommandListCoreFamily::appendMemoryCopyKernel3d(dstAlignedAllocation, srcAlignedAllocation, builtin, dstRegion, dstPitch, dstSlicePitch, dstOffset, srcRegion, srcPitch, srcSlicePitch, srcOffset, hSignalEvent, numWaitEvents, phWaitEvents); + } + + ze_result_t 
appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAllocation, + NEO::GraphicsAllocation *dstAllocation, + size_t srcOffset, + size_t dstOffset, + ze_copy_region_t srcRegion, + ze_copy_region_t dstRegion, Vec3 copySize, + size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, + Vec3 srcSize, Vec3 dstSize, ze_event_handle_t hSignalEvent) override { + srcBlitCopyRegionOffset = srcOffset; + dstBlitCopyRegionOffset = dstOffset; + return L0::CommandListCoreFamily::appendMemoryCopyBlitRegion(srcAllocation, dstAllocation, srcOffset, dstOffset, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcSize, dstSize, hSignalEvent); + } + uintptr_t srcAlignedPtr; + uintptr_t dstAlignedPtr; + size_t srcBlitCopyRegionOffset = 0; + size_t dstBlitCopyRegionOffset = 0; }; HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenTwoNewAllocationAreAddedToHostMapPtr, Platforms) { @@ -285,6 +328,56 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionC EXPECT_EQ(cmdList.hostPtrMap.size(), 2u); } +HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCopyRegion2DCalledThenSrcDstPointersArePageAligned, Platforms) { + MockAppendMemoryCopy cmdList; + cmdList.initialize(device, NEO::EngineGroupType::RenderCompute); + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + ze_copy_region_t dstRegion = {4, 4, 0, 2, 2, 0}; + ze_copy_region_t srcRegion = {4, 4, 0, 2, 2, 0}; + cmdList.appendMemoryCopyRegion(dstPtr, &dstRegion, 0, 0, srcPtr, &srcRegion, 0, 0, nullptr); + auto sshAlignmentMask = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignmentMask(); + EXPECT_TRUE(cmdList.srcAlignedPtr == (cmdList.srcAlignedPtr & sshAlignmentMask)); + EXPECT_TRUE(cmdList.dstAlignedPtr == (cmdList.dstAlignedPtr & sshAlignmentMask)); +} + +HWTEST2_F(AppendMemoryCopy, 
givenCommandListAndUnalignedHostPointersWhenMemoryCopyRegion3DCalledThenSrcDstPointersArePageAligned, Platforms) { + MockAppendMemoryCopy cmdList; + cmdList.initialize(device, NEO::EngineGroupType::RenderCompute); + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + ze_copy_region_t dstRegion = {4, 4, 4, 2, 2, 2}; + ze_copy_region_t srcRegion = {4, 4, 4, 2, 2, 2}; + cmdList.appendMemoryCopyRegion(dstPtr, &dstRegion, 0, 0, srcPtr, &srcRegion, 0, 0, nullptr); + auto sshAlignmentMask = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignmentMask(); + EXPECT_TRUE(cmdList.srcAlignedPtr == (cmdList.srcAlignedPtr & sshAlignmentMask)); + EXPECT_TRUE(cmdList.dstAlignedPtr == (cmdList.dstAlignedPtr & sshAlignmentMask)); +} + +HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion2DCalledThenSrcDstPointersArePageAligned, Platforms) { + MockAppendMemoryCopy cmdList; + cmdList.initialize(device, NEO::EngineGroupType::Copy); + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + ze_copy_region_t dstRegion = {4, 4, 0, 2, 2, 0}; + ze_copy_region_t srcRegion = {4, 4, 0, 2, 2, 0}; + cmdList.appendMemoryCopyRegion(dstPtr, &dstRegion, 0, 0, srcPtr, &srcRegion, 0, 0, nullptr); + EXPECT_GT(cmdList.srcBlitCopyRegionOffset, 0u); + EXPECT_GT(cmdList.dstBlitCopyRegionOffset, 0u); +} + +HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstPointersArePageAligned, Platforms) { + MockAppendMemoryCopy cmdList; + cmdList.initialize(device, NEO::EngineGroupType::Copy); + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + ze_copy_region_t dstRegion = {4, 4, 4, 2, 2, 2}; + ze_copy_region_t srcRegion = {4, 4, 4, 2, 2, 2}; + cmdList.appendMemoryCopyRegion(dstPtr, &dstRegion, 0, 0, srcPtr, &srcRegion, 0, 0, nullptr); + EXPECT_GT(cmdList.srcBlitCopyRegionOffset, 0u); + 
EXPECT_GT(cmdList.dstBlitCopyRegionOffset, 0u); +} + HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenPipeControlWithDcFlushAdded, Platforms) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; @@ -729,7 +822,7 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenCopyRegionWithinMaxBlitSize MemoryPool::System4KBPages); size_t rowPitch = copySize.x; size_t slicePitch = copySize.x * copySize.y; - commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr); + commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, 0, 0, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( @@ -761,7 +854,7 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenCopyRegionWithinMaxBlitSize MemoryPool::System4KBPages); size_t rowPitch = copySize.x; size_t slicePitch = copySize.x * copySize.y; - commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr); + commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, 0, 0, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr); uint32_t bytesPerPixel = NEO::BlitCommandsHelper::getAvailableBytesPerPixel(copySize.x, srcRegion.originX, dstRegion.originY, srcSize.x, dstSize.x); GenCmdList cmdList; @@ -793,7 +886,7 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenCopyRegionGreaterThanMaxBli MemoryPool::System4KBPages); size_t rowPitch = copySize.x; size_t slicePitch = copySize.x * copySize.y; - commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, 
slicePitch, srcSize, dstSize, nullptr); + commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, 0, 0, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( @@ -812,8 +905,10 @@ class MockCommandListForRegionSize : public WhiteBox<::L0::CommandListCoreFamily AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { return {0, 0, nullptr, true}; } - ze_result_t appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcptr, - NEO::GraphicsAllocation *dstptr, + ze_result_t appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAllocation, + NEO::GraphicsAllocation *dstAllocation, + size_t srcOffset, + size_t dstOffset, ze_copy_region_t srcRegion, ze_copy_region_t dstRegion, Vec3 copySize, size_t srcRowPitch, size_t srcSlicePitch, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index 4cc9db2aee..d35cecc5a4 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -308,7 +308,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyR reinterpret_cast(0x1234), 0x1000, 0, sizeof(uint32_t), MemoryPool::System4KBPages); - commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, srcRegion, dstRegion, {0, 0, 0}, 0, 0, 0, 0, 0, 0, event->toHandle()); + commandList->appendMemoryCopyBlitRegion(&mockAllocationDst, &mockAllocationSrc, 0, 0, srcRegion, dstRegion, {0, 0, 0}, 0, 0, 0, 0, 0, 0, event->toHandle()); GenCmdList cmdList; auto baseAddr = event->getGpuAddress();