From 566a761aaa0dda147d591526da7123084d828d70 Mon Sep 17 00:00:00 2001 From: Maciej Dziuban Date: Tue, 26 Jan 2021 10:38:20 +0000 Subject: [PATCH] Optimize copying buffers with misaligned pointers Resolves: NEO-5476 Signed-off-by: Maciej Dziuban --- .../built_ins/builtins_dispatch_builder.cpp | 22 +++++++++++++----- .../unit_test/built_ins/built_in_tests.cpp | 10 +++++--- .../kernel_binary_helper_hash_value.cpp | 6 ++--- .../copy_buffer_to_buffer.builtin_kernel | 23 ++++++++++++++++++- ..._buffer_to_buffer_stateless.builtin_kernel | 23 ++++++++++++++++++- ...ages.cl => 17473353783988150586_images.cl} | 23 ++++++++++++++++++- ...> 17473353783988150586_images_options.txt} | 2 +- ...254829439342.cl => 2175211656417857798.cl} | 23 ++++++++++++++++++- ...ns.txt => 2175211656417857798_options.txt} | 2 +- 9 files changed, 116 insertions(+), 18 deletions(-) rename shared/test/unit_test/test_files/{5842966610900731025_images.cl => 17473353783988150586_images.cl} (96%) rename shared/test/unit_test/test_files/{17034883254829439342_options.txt => 17473353783988150586_images_options.txt} (60%) rename shared/test/unit_test/test_files/{17034883254829439342.cl => 2175211656417857798.cl} (91%) rename shared/test/unit_test/test_files/{5842966610900731025_images_options.txt => 2175211656417857798_options.txt} (60%) diff --git a/opencl/source/built_ins/builtins_dispatch_builder.cpp b/opencl/source/built_ins/builtins_dispatch_builder.cpp index 4cff48a7f6..ccbf67aa39 100644 --- a/opencl/source/built_ins/builtins_dispatch_builder.cpp +++ b/opencl/source/built_ins/builtins_dispatch_builder.cpp @@ -54,17 +54,20 @@ class BuiltInOp : public BuiltinDispatchInfoBui uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size - if (!isAligned<4>(reinterpret_cast(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize)) { - //corner case - src relative to dst does not have DWORD alignment - leftSize += middleSizeBytes; - middleSizeBytes = 0; - } + // corner case - fully optimized kernel requires DWORD alignment. If we don't have it, run slower, misaligned kernel + const auto srcMiddleStart = reinterpret_cast(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize; + const auto srcMisalignment = srcMiddleStart % sizeof(uint32_t); + const auto isSrcMisaligned = srcMisalignment != 0u; auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker // Set-up ISA kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover); - kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle); + if (isSrcMisaligned) { + kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddleMisaligned); + } else { + kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle); + } kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover); // Set-up common kernel args @@ -95,6 +98,10 @@ class BuiltInOp : public BuiltinDispatchInfoBui kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast(operationParams.dstOffset.x + leftSize)); kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast(operationParams.dstOffset.x + leftSize + middleSizeBytes)); + if (isSrcMisaligned) { + kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 4, static_cast(srcMisalignment * 8)); + } + // Set-up work sizes // Note for split walker, it would be just builder.SetDipatchGeometry(GWS, ELWS, OFFSET) kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3{leftSize, 0, 0}, Vec3{0, 0, 0}, Vec3{0, 0, 0}); @@ -112,6 +119,7 @@ class BuiltInOp : public BuiltinDispatchInfoBui protected: Kernel *kernLeftLeftover = nullptr; Kernel *kernMiddle = nullptr; + Kernel *kernMiddleMisaligned = nullptr; Kernel *kernRightLeftover = nullptr; BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels) : BuiltinDispatchInfoBuilder(kernelsLib, device) { @@ -120,6 +128,7 @@ class BuiltInOp : public BuiltinDispatchInfoBui "", "CopyBufferToBufferLeftLeftover", kernLeftLeftover, "CopyBufferToBufferMiddle", kernMiddle, + "CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned, "CopyBufferToBufferRightLeftover", kernRightLeftover); } } @@ -134,6 +143,7 @@ class BuiltInOp : public BuiltInOpgetKernel(); - EXPECT_EQ(dispatchInfo->getKernel()->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferLeftLeftover"); + EXPECT_EQ(kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferMiddleMisaligned"); + + const auto crossThreadData = kernel->getCrossThreadData(rootDeviceIndex); + const auto crossThreadOffset = kernel->getKernelInfo(rootDeviceIndex).kernelArgInfo[4].kernelArgPatchInfoVector[0].crossthreadOffset; + EXPECT_EQ(8u, *reinterpret_cast(ptrOffset(crossThreadData, crossThreadOffset))); EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), builtinOpsParams)); } diff --git a/opencl/test/unit_test/helpers/kernel_binary_helper_hash_value.cpp b/opencl/test/unit_test/helpers/kernel_binary_helper_hash_value.cpp index 644e414e20..8024b1ce2f 100644 --- a/opencl/test/unit_test/helpers/kernel_binary_helper_hash_value.cpp +++ b/opencl/test/unit_test/helpers/kernel_binary_helper_hash_value.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,5 +7,5 @@ #include "opencl/test/unit_test/helpers/kernel_binary_helper.h" -const std::string KernelBinaryHelper::BUILT_INS("17034883254829439342"); -const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("5842966610900731025_images"); +const std::string KernelBinaryHelper::BUILT_INS("2175211656417857798"); +const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("17473353783988150586_images"); diff --git a/shared/source/built_ins/kernels/copy_buffer_to_buffer.builtin_kernel b/shared/source/built_ins/kernels/copy_buffer_to_buffer.builtin_kernel index 7d47bb10d2..0238e5897f 100644 --- a/shared/source/built_ins/kernels/copy_buffer_to_buffer.builtin_kernel +++ b/shared/source/built_ins/kernels/copy_buffer_to_buffer.builtin_kernel @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -41,6 +41,27 @@ __kernel void CopyBufferToBufferMiddle( vstore4(loaded, gid, pDst); } +__kernel void CopyBufferToBufferMiddleMisaligned( + __global const uint* pSrc, + __global uint* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes, + uint misalignmentInBits) +{ + const size_t gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + const uint4 src0 = vload4(gid, pSrc); + const uint4 src1 = vload4(gid + 1, pSrc); + + uint4 result; + result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits)); + result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits)); + result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits)); + result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits)); + vstore4(result, gid, pDst); +} + __kernel void CopyBufferToBufferRightLeftover( const __global uchar* pSrc, __global uchar* pDst, diff --git a/shared/source/built_ins/kernels/copy_buffer_to_buffer_stateless.builtin_kernel b/shared/source/built_ins/kernels/copy_buffer_to_buffer_stateless.builtin_kernel index ded7b759f4..1321e63798 100644 --- a/shared/source/built_ins/kernels/copy_buffer_to_buffer_stateless.builtin_kernel +++ b/shared/source/built_ins/kernels/copy_buffer_to_buffer_stateless.builtin_kernel @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -41,6 +41,27 @@ __kernel void CopyBufferToBufferMiddle( vstore4(loaded, gid, pDst); } +__kernel void CopyBufferToBufferMiddleMisaligned( + __global const uint* pSrc, + __global uint* pDst, + ulong srcOffsetInBytes, + ulong dstOffsetInBytes, + uint misalignmentInBits) +{ + const size_t gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + const uint4 src0 = vload4(gid, pSrc); + const uint4 src1 = vload4(gid + 1, pSrc); + + uint4 result; + result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits)); + result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits)); + result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits)); + result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits)); + vstore4(result, gid, pDst); +} + __kernel void CopyBufferToBufferRightLeftover( const __global uchar* pSrc, __global uchar* pDst, diff --git a/shared/test/unit_test/test_files/5842966610900731025_images.cl b/shared/test/unit_test/test_files/17473353783988150586_images.cl similarity index 96% rename from shared/test/unit_test/test_files/5842966610900731025_images.cl rename to shared/test/unit_test/test_files/17473353783988150586_images.cl index 29d75b3ccd..92313faab4 100644 --- a/shared/test/unit_test/test_files/5842966610900731025_images.cl +++ b/shared/test/unit_test/test_files/17473353783988150586_images.cl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -46,6 +46,27 @@ __kernel void CopyBufferToBufferMiddle( vstore4(loaded, gid, pDst); } +__kernel void CopyBufferToBufferMiddleMisaligned( + __global const uint* pSrc, + __global uint* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes, + uint misalignmentInBits) +{ + const size_t gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + const uint4 src0 = vload4(gid, pSrc); + const uint4 src1 = vload4(gid + 1, pSrc); + + uint4 result; + result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits)); + result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits)); + result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits)); + result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits)); + vstore4(result, gid, pDst); +} + __kernel void CopyBufferToBufferRightLeftover( const __global uchar* pSrc, __global uchar* pDst, diff --git a/shared/test/unit_test/test_files/17034883254829439342_options.txt b/shared/test/unit_test/test_files/17473353783988150586_images_options.txt similarity index 60% rename from shared/test/unit_test/test_files/17034883254829439342_options.txt rename to shared/test/unit_test/test_files/17473353783988150586_images_options.txt index 60cc68c56e..11457cc5cc 100644 --- a/shared/test/unit_test/test_files/17034883254829439342_options.txt +++ b/shared/test/unit_test/test_files/17473353783988150586_images_options.txt @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * diff --git a/shared/test/unit_test/test_files/17034883254829439342.cl b/shared/test/unit_test/test_files/2175211656417857798.cl similarity index 91% rename from shared/test/unit_test/test_files/17034883254829439342.cl rename to shared/test/unit_test/test_files/2175211656417857798.cl index ca31e6bf52..2d7dff61ac 100644 --- a/shared/test/unit_test/test_files/17034883254829439342.cl +++ b/shared/test/unit_test/test_files/2175211656417857798.cl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -46,6 +46,27 @@ __kernel void CopyBufferToBufferMiddle( vstore4(loaded, gid, pDst); } +__kernel void CopyBufferToBufferMiddleMisaligned( + __global const uint* pSrc, + __global uint* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes, + uint misalignmentInBits) +{ + const size_t gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + const uint4 src0 = vload4(gid, pSrc); + const uint4 src1 = vload4(gid + 1, pSrc); + + uint4 result; + result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits)); + result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits)); + result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits)); + result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits)); + vstore4(result, gid, pDst); +} + __kernel void CopyBufferToBufferRightLeftover( const __global uchar* pSrc, __global uchar* pDst, diff --git a/shared/test/unit_test/test_files/5842966610900731025_images_options.txt b/shared/test/unit_test/test_files/2175211656417857798_options.txt similarity index 60% rename from shared/test/unit_test/test_files/5842966610900731025_images_options.txt rename to shared/test/unit_test/test_files/2175211656417857798_options.txt index 60cc68c56e..11457cc5cc 100644 --- a/shared/test/unit_test/test_files/5842966610900731025_images_options.txt +++ b/shared/test/unit_test/test_files/2175211656417857798_options.txt @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT *