Optimize copying buffers with misaligned pointers

Resolves: NEO-5476
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
Maciej Dziuban
2021-01-26 10:38:20 +00:00
committed by Compute-Runtime-Automation
parent 9290637a8e
commit 566a761aaa
9 changed files with 116 additions and 18 deletions

View File

@@ -54,17 +54,20 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size
if (!isAligned<4>(reinterpret_cast<uintptr_t>(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize)) {
//corner case - src relative to dst does not have DWORD alignment
leftSize += middleSizeBytes;
middleSizeBytes = 0;
}
// corner case - fully optimized kernel requires DWORD alignment. If we don't have it, run slower, misaligned kernel
const auto srcMiddleStart = reinterpret_cast<uintptr_t>(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize;
const auto srcMisalignment = srcMiddleStart % sizeof(uint32_t);
const auto isSrcMisaligned = srcMisalignment != 0u;
auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
// Set-up ISA
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover);
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
if (isSrcMisaligned) {
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddleMisaligned);
} else {
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
}
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover);
// Set-up common kernel args
@@ -95,6 +98,10 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast<OffsetType>(operationParams.dstOffset.x + leftSize));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast<OffsetType>(operationParams.dstOffset.x + leftSize + middleSizeBytes));
if (isSrcMisaligned) {
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 4, static_cast<uint32_t>(srcMisalignment * 8));
}
// Set-up work sizes
// Note for split walker, it would be just builder.SetDipatchGeometry(GWS, ELWS, OFFSET)
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3<size_t>{leftSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
@@ -112,6 +119,7 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
protected:
Kernel *kernLeftLeftover = nullptr;
Kernel *kernMiddle = nullptr;
Kernel *kernMiddleMisaligned = nullptr;
Kernel *kernRightLeftover = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
@@ -120,6 +128,7 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
"",
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
"CopyBufferToBufferMiddle", kernMiddle,
"CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned,
"CopyBufferToBufferRightLeftover", kernRightLeftover);
}
}
@@ -134,6 +143,7 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBufferStateless> : public BuiltInOp<EBu
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
"CopyBufferToBufferMiddle", kernMiddle,
"CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned,
"CopyBufferToBufferRightLeftover", kernRightLeftover);
}

View File

@@ -900,7 +900,7 @@ TEST_F(BuiltInTests, GivenUnalignedCopyBufferToBufferWhenDispatchInfoIsCreatedTh
BuiltinOpParams builtinOpsParams;
builtinOpsParams.srcMemObj = &src;
builtinOpsParams.srcOffset.x = 1;
builtinOpsParams.srcOffset.x = 5; // causes misalignment from 4-byte boundary by 1 byte (8 bits)
builtinOpsParams.dstMemObj = &dst;
builtinOpsParams.size = {src.getSize(), 0, 0};
@@ -909,9 +909,13 @@ TEST_F(BuiltInTests, GivenUnalignedCopyBufferToBufferWhenDispatchInfoIsCreatedTh
EXPECT_EQ(1u, multiDispatchInfo.size());
const DispatchInfo *dispatchInfo = multiDispatchInfo.begin();
const Kernel *kernel = multiDispatchInfo.begin()->getKernel();
EXPECT_EQ(dispatchInfo->getKernel()->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferLeftLeftover");
EXPECT_EQ(kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferMiddleMisaligned");
const auto crossThreadData = kernel->getCrossThreadData(rootDeviceIndex);
const auto crossThreadOffset = kernel->getKernelInfo(rootDeviceIndex).kernelArgInfo[4].kernelArgPatchInfoVector[0].crossthreadOffset;
EXPECT_EQ(8u, *reinterpret_cast<uint32_t *>(ptrOffset(crossThreadData, crossThreadOffset)));
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), builtinOpsParams));
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,5 +7,5 @@
#include "opencl/test/unit_test/helpers/kernel_binary_helper.h"
const std::string KernelBinaryHelper::BUILT_INS("17034883254829439342");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("5842966610900731025_images");
const std::string KernelBinaryHelper::BUILT_INS("2175211656417857798");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("17473353783988150586_images");

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2017-2020 Intel Corporation
* Copyright (C) 2017-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -41,6 +41,27 @@ __kernel void CopyBufferToBufferMiddle(
vstore4(loaded, gid, pDst);
}
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint misalignmentInBits)
{
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
vstore4(result, gid, pDst);
}
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -41,6 +41,27 @@ __kernel void CopyBufferToBufferMiddle(
vstore4(loaded, gid, pDst);
}
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
ulong srcOffsetInBytes,
ulong dstOffsetInBytes,
uint misalignmentInBits)
{
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
vstore4(result, gid, pDst);
}
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,27 @@ __kernel void CopyBufferToBufferMiddle(
vstore4(loaded, gid, pDst);
}
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint misalignmentInBits)
{
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
vstore4(result, gid, pDst);
}
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,27 @@ __kernel void CopyBufferToBufferMiddle(
vstore4(loaded, gid, pDst);
}
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint misalignmentInBits)
{
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
vstore4(result, gid, pDst);
}
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*