mirror of
https://github.com/intel/compute-runtime.git
synced 2025-11-10 05:49:51 +08:00
Optimize copying buffers with misaligned pointers
Resolves: NEO-5476 Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
9290637a8e
commit
566a761aaa
@@ -54,17 +54,20 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
|
||||
|
||||
uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size
|
||||
|
||||
if (!isAligned<4>(reinterpret_cast<uintptr_t>(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize)) {
|
||||
//corner case - src relative to dst does not have DWORD alignment
|
||||
leftSize += middleSizeBytes;
|
||||
middleSizeBytes = 0;
|
||||
}
|
||||
// corner case - fully optimized kernel requires DWORD alignment. If we don't have it, run slower, misaligned kernel
|
||||
const auto srcMiddleStart = reinterpret_cast<uintptr_t>(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize;
|
||||
const auto srcMisalignment = srcMiddleStart % sizeof(uint32_t);
|
||||
const auto isSrcMisaligned = srcMisalignment != 0u;
|
||||
|
||||
auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
|
||||
|
||||
// Set-up ISA
|
||||
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover);
|
||||
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
|
||||
if (isSrcMisaligned) {
|
||||
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddleMisaligned);
|
||||
} else {
|
||||
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
|
||||
}
|
||||
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover);
|
||||
|
||||
// Set-up common kernel args
|
||||
@@ -95,6 +98,10 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
|
||||
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast<OffsetType>(operationParams.dstOffset.x + leftSize));
|
||||
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast<OffsetType>(operationParams.dstOffset.x + leftSize + middleSizeBytes));
|
||||
|
||||
if (isSrcMisaligned) {
|
||||
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 4, static_cast<uint32_t>(srcMisalignment * 8));
|
||||
}
|
||||
|
||||
// Set-up work sizes
|
||||
// Note for split walker, it would be just builder.SetDispatchGeometry(GWS, ELWS, OFFSET)
|
||||
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3<size_t>{leftSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
|
||||
@@ -112,6 +119,7 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
|
||||
protected:
|
||||
Kernel *kernLeftLeftover = nullptr;
|
||||
Kernel *kernMiddle = nullptr;
|
||||
Kernel *kernMiddleMisaligned = nullptr;
|
||||
Kernel *kernRightLeftover = nullptr;
|
||||
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
|
||||
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
|
||||
@@ -120,6 +128,7 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
|
||||
"",
|
||||
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
|
||||
"CopyBufferToBufferMiddle", kernMiddle,
|
||||
"CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned,
|
||||
"CopyBufferToBufferRightLeftover", kernRightLeftover);
|
||||
}
|
||||
}
|
||||
@@ -134,6 +143,7 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBufferStateless> : public BuiltInOp<EBu
|
||||
CompilerOptions::greaterThan4gbBuffersRequired,
|
||||
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
|
||||
"CopyBufferToBufferMiddle", kernMiddle,
|
||||
"CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned,
|
||||
"CopyBufferToBufferRightLeftover", kernRightLeftover);
|
||||
}
|
||||
|
||||
|
||||
@@ -900,7 +900,7 @@ TEST_F(BuiltInTests, GivenUnalignedCopyBufferToBufferWhenDispatchInfoIsCreatedTh
|
||||
BuiltinOpParams builtinOpsParams;
|
||||
|
||||
builtinOpsParams.srcMemObj = &src;
|
||||
builtinOpsParams.srcOffset.x = 1;
|
||||
builtinOpsParams.srcOffset.x = 5; // causes misalignment from 4-byte boundary by 1 byte (8 bits)
|
||||
builtinOpsParams.dstMemObj = &dst;
|
||||
builtinOpsParams.size = {src.getSize(), 0, 0};
|
||||
|
||||
@@ -909,9 +909,13 @@ TEST_F(BuiltInTests, GivenUnalignedCopyBufferToBufferWhenDispatchInfoIsCreatedTh
|
||||
|
||||
EXPECT_EQ(1u, multiDispatchInfo.size());
|
||||
|
||||
const DispatchInfo *dispatchInfo = multiDispatchInfo.begin();
|
||||
const Kernel *kernel = multiDispatchInfo.begin()->getKernel();
|
||||
|
||||
EXPECT_EQ(dispatchInfo->getKernel()->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferLeftLeftover");
|
||||
EXPECT_EQ(kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferMiddleMisaligned");
|
||||
|
||||
const auto crossThreadData = kernel->getCrossThreadData(rootDeviceIndex);
|
||||
const auto crossThreadOffset = kernel->getKernelInfo(rootDeviceIndex).kernelArgInfo[4].kernelArgPatchInfoVector[0].crossthreadOffset;
|
||||
EXPECT_EQ(8u, *reinterpret_cast<uint32_t *>(ptrOffset(crossThreadData, crossThreadOffset)));
|
||||
|
||||
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), builtinOpsParams));
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2020 Intel Corporation
|
||||
* Copyright (C) 2019-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -7,5 +7,5 @@
|
||||
|
||||
#include "opencl/test/unit_test/helpers/kernel_binary_helper.h"
|
||||
|
||||
const std::string KernelBinaryHelper::BUILT_INS("17034883254829439342");
|
||||
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("5842966610900731025_images");
|
||||
const std::string KernelBinaryHelper::BUILT_INS("2175211656417857798");
|
||||
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("17473353783988150586_images");
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2020 Intel Corporation
|
||||
* Copyright (C) 2017-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -41,6 +41,27 @@ __kernel void CopyBufferToBufferMiddle(
|
||||
vstore4(loaded, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferMiddleMisaligned(
|
||||
__global const uint* pSrc,
|
||||
__global uint* pDst,
|
||||
uint srcOffsetInBytes,
|
||||
uint dstOffsetInBytes,
|
||||
uint misalignmentInBits)
|
||||
{
|
||||
const size_t gid = get_global_id(0);
|
||||
pDst += dstOffsetInBytes >> 2;
|
||||
pSrc += srcOffsetInBytes >> 2;
|
||||
const uint4 src0 = vload4(gid, pSrc);
|
||||
const uint4 src1 = vload4(gid + 1, pSrc);
|
||||
|
||||
uint4 result;
|
||||
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
|
||||
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
|
||||
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
|
||||
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
|
||||
vstore4(result, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferRightLeftover(
|
||||
const __global uchar* pSrc,
|
||||
__global uchar* pDst,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2020 Intel Corporation
|
||||
* Copyright (C) 2019-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -41,6 +41,27 @@ __kernel void CopyBufferToBufferMiddle(
|
||||
vstore4(loaded, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferMiddleMisaligned(
|
||||
__global const uint* pSrc,
|
||||
__global uint* pDst,
|
||||
ulong srcOffsetInBytes,
|
||||
ulong dstOffsetInBytes,
|
||||
uint misalignmentInBits)
|
||||
{
|
||||
const size_t gid = get_global_id(0);
|
||||
pDst += dstOffsetInBytes >> 2;
|
||||
pSrc += srcOffsetInBytes >> 2;
|
||||
const uint4 src0 = vload4(gid, pSrc);
|
||||
const uint4 src1 = vload4(gid + 1, pSrc);
|
||||
|
||||
uint4 result;
|
||||
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
|
||||
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
|
||||
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
|
||||
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
|
||||
vstore4(result, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferRightLeftover(
|
||||
const __global uchar* pSrc,
|
||||
__global uchar* pDst,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -46,6 +46,27 @@ __kernel void CopyBufferToBufferMiddle(
|
||||
vstore4(loaded, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferMiddleMisaligned(
|
||||
__global const uint* pSrc,
|
||||
__global uint* pDst,
|
||||
uint srcOffsetInBytes,
|
||||
uint dstOffsetInBytes,
|
||||
uint misalignmentInBits)
|
||||
{
|
||||
const size_t gid = get_global_id(0);
|
||||
pDst += dstOffsetInBytes >> 2;
|
||||
pSrc += srcOffsetInBytes >> 2;
|
||||
const uint4 src0 = vload4(gid, pSrc);
|
||||
const uint4 src1 = vload4(gid + 1, pSrc);
|
||||
|
||||
uint4 result;
|
||||
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
|
||||
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
|
||||
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
|
||||
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
|
||||
vstore4(result, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferRightLeftover(
|
||||
const __global uchar* pSrc,
|
||||
__global uchar* pDst,
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -46,6 +46,27 @@ __kernel void CopyBufferToBufferMiddle(
|
||||
vstore4(loaded, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferMiddleMisaligned(
|
||||
__global const uint* pSrc,
|
||||
__global uint* pDst,
|
||||
uint srcOffsetInBytes,
|
||||
uint dstOffsetInBytes,
|
||||
uint misalignmentInBits)
|
||||
{
|
||||
const size_t gid = get_global_id(0);
|
||||
pDst += dstOffsetInBytes >> 2;
|
||||
pSrc += srcOffsetInBytes >> 2;
|
||||
const uint4 src0 = vload4(gid, pSrc);
|
||||
const uint4 src1 = vload4(gid + 1, pSrc);
|
||||
|
||||
uint4 result;
|
||||
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
|
||||
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
|
||||
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
|
||||
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
|
||||
vstore4(result, gid, pDst);
|
||||
}
|
||||
|
||||
__kernel void CopyBufferToBufferRightLeftover(
|
||||
const __global uchar* pSrc,
|
||||
__global uchar* pDst,
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
Reference in New Issue
Block a user