Append additional fill kernel for left leftover

Related-To: NEO-7287

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2022-09-02 16:16:41 +00:00
committed by Compute-Runtime-Automation
parent 992e8e7c96
commit 403b49e987
16 changed files with 146 additions and 58 deletions

View File

@ -29,8 +29,8 @@ enum class Builtin : uint32_t {
CopyBufferToBufferSideStateless,
FillBufferImmediate,
FillBufferImmediateStateless,
FillBufferImmediateRightLeftOver,
FillBufferImmediateRightLeftOverStateless,
FillBufferImmediateLeftOver,
FillBufferImmediateLeftOverStateless,
FillBufferSSHOffset,
FillBufferSSHOffsetStateless,
FillBufferMiddle,

View File

@ -79,12 +79,12 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtinName = "FillBufferImmediate";
builtin = NEO::EBuiltInOps::FillBufferStateless;
break;
case Builtin::FillBufferImmediateRightLeftOver:
builtinName = "FillBufferImmediateRightLeftOver";
case Builtin::FillBufferImmediateLeftOver:
builtinName = "FillBufferImmediateLeftOver";
builtin = NEO::EBuiltInOps::FillBuffer;
break;
case Builtin::FillBufferImmediateRightLeftOverStateless:
builtinName = "FillBufferImmediateRightLeftOver";
case Builtin::FillBufferImmediateLeftOverStateless:
builtinName = "FillBufferImmediateLeftOver";
builtin = NEO::EBuiltInOps::FillBufferStateless;
break;
case Builtin::FillBufferSSHOffset:

View File

@ -228,6 +228,14 @@ struct CommandListCoreFamily : CommandListImp {
const ze_group_count_t *threadGroupDimensions,
Event *event,
const CmdListKernelLaunchParams &launchParams);
ze_result_t appendUnalignedFillKernel(bool isStateless,
uint32_t unalignedSize,
AlignedAllocationData dstAllocation,
const void *pattern,
Event *signalEvent,
CmdListKernelLaunchParams launchParams);
ze_result_t prepareIndirectParams(const ze_group_count_t *threadGroupDimensions);
void updateStreamProperties(Kernel &kernel, bool isMultiOsContextCapable, bool isCooperative);
void clearCommandsToPatch();
@ -248,7 +256,6 @@ struct CommandListCoreFamily : CommandListImp {
NEO::PipeControlArgs createBarrierFlags();
void appendMultiTileBarrier(NEO::Device &neoDevice);
size_t estimateBufferSizeMultiTileBarrier(const NEO::HardwareInfo &hwInfo);
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);

View File

@ -1462,6 +1462,31 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryPrefetch(const voi
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendUnalignedFillKernel(bool isStateless, uint32_t unalignedSize, AlignedAllocationData dstAllocation, const void *pattern, Event *signalEvent, CmdListKernelLaunchParams launchParams) {
Kernel *builtinFunction = nullptr;
if (isStateless) {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateLeftOverStateless);
} else {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateLeftOver);
}
uint32_t groupSizeY = 1, groupSizeZ = 1;
uint32_t groupSizeX = static_cast<uint32_t>(unalignedSize);
builtinFunction->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);
ze_group_count_t dispatchFuncRemainderArgs{static_cast<uint32_t>(unalignedSize / groupSizeX), 1u, 1u};
uint32_t value = *(reinterpret_cast<const unsigned char *>(pattern));
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinFunction->setArgumentValue(2, sizeof(value), &value);
auto res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncRemainderArgs, signalEvent, launchParams);
if (res) {
return res;
}
return ZE_RESULT_SUCCESS;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
const void *pattern,
@ -1524,8 +1549,17 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
launchParams.isKernelSplitOperation = true;
launchParams.isBuiltInKernel = true;
launchParams.isDestinationAllocationInSystemMemory = hostPointerNeedsFlush;
if (patternSize == 1) {
size_t middleSize = size;
uint32_t leftRemainder = sizeof(uint32_t) - (dstAllocation.offset % sizeof(uint32_t));
if (dstAllocation.offset % sizeof(uint32_t) != 0 && leftRemainder <= size) {
res = appendUnalignedFillKernel(isStateless, leftRemainder, dstAllocation, pattern, signalEvent, launchParams);
if (res) {
return res;
}
middleSize -= leftRemainder;
dstAllocation.offset += leftRemainder;
}
Kernel *builtinFunction = nullptr;
if (isStateless) {
@ -1534,7 +1568,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
}
const auto dataTypeSize = sizeof(uint32_t) * 4;
size_t adjustedSize = size / dataTypeSize;
size_t adjustedSize = middleSize / dataTypeSize;
size_t groupSizeX = device->getDeviceInfo().maxWorkGroupSize;
if (groupSizeX > adjustedSize && adjustedSize > 0) {
groupSizeX = adjustedSize;
@ -1545,8 +1579,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}
size_t groups = adjustedSize / groupSizeX;
size_t remainingBytes = static_cast<size_t>((adjustedSize % groupSizeX) * dataTypeSize +
size % dataTypeSize);
uint32_t remainingBytes = static_cast<uint32_t>((adjustedSize % groupSizeX) * dataTypeSize +
middleSize % dataTypeSize);
ze_group_count_t dispatchFuncArgs{static_cast<uint32_t>(groups), 1u, 1u};
uint32_t value = 0;
@ -1563,23 +1597,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}
if (remainingBytes) {
if (isStateless) {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateRightLeftOverStateless);
} else {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateRightLeftOver);
}
uint32_t groupSizeY = 1, groupSizeZ = 1;
uint32_t groupSizeX = static_cast<uint32_t>(remainingBytes);
builtinFunction->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);
ze_group_count_t dispatchFuncRemainderArgs{static_cast<uint32_t>(remainingBytes / groupSizeX), 1u, 1u};
size_t dstOffset = dstAllocation.offset + (size - remainingBytes);
value = *(reinterpret_cast<const unsigned char *>(pattern));
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);
builtinFunction->setArgumentValue(2, sizeof(value), &value);
res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncRemainderArgs, signalEvent, launchParams);
dstAllocation.offset += (middleSize - remainingBytes);
res = appendUnalignedFillKernel(isStateless, remainingBytes, dstAllocation, pattern, signalEvent, launchParams);
if (res) {
return res;
}
@ -1628,7 +1647,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
patternAllocOffset += patternSizeToCopy;
} while (patternAllocOffset < patternAllocationSize);
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinFunction->setArgBufferWithAlloc(2, reinterpret_cast<uintptr_t>(patternGfxAllocPtr), patternGfxAlloc);

View File

@ -53,7 +53,7 @@ class AppendFillFixture : public DeviceFixture {
if (numberOfCallsToAppendLaunchKernelWithParams == thresholdOfCallsToAppendLaunchKernelWithParamsToFail) {
return ZE_RESULT_ERROR_UNKNOWN;
}
if (numberOfCallsToAppendLaunchKernelWithParams < 2) {
if (numberOfCallsToAppendLaunchKernelWithParams < 3) {
threadGroupDimensions[numberOfCallsToAppendLaunchKernelWithParams] = *pThreadGroupDimensions;
xGroupSizes[numberOfCallsToAppendLaunchKernelWithParams] = kernel->getGroupSize()[0];
}
@ -63,8 +63,8 @@ class AppendFillFixture : public DeviceFixture {
event,
launchParams);
}
ze_group_count_t threadGroupDimensions[2];
uint32_t xGroupSizes[2];
ze_group_count_t threadGroupDimensions[3];
uint32_t xGroupSizes[3];
uint32_t thresholdOfCallsToAppendLaunchKernelWithParamsToFail = std::numeric_limits<uint32_t>::max();
uint32_t numberOfCallsToAppendLaunchKernelWithParams = 0;
};
@ -281,6 +281,62 @@ HWTEST2_F(AppendFillTest,
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWhenPtrWithOffsetAndPatternSizeIsOneThenThreeKernelsDispatched, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
uint32_t offset = 1;
const size_t size = 1024;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr + offset, &pattern, 1, size - offset, nullptr, 0, nullptr);
size_t filledSize = commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX;
filledSize += commandList->xGroupSizes[1] * commandList->threadGroupDimensions[1].groupCountX * 16;
filledSize += commandList->xGroupSizes[2] * commandList->threadGroupDimensions[2].groupCountX;
EXPECT_EQ(sizeof(uint32_t) - offset, commandList->xGroupSizes[0]);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(3u, commandList->numberOfCallsToAppendLaunchKernelWithParams);
EXPECT_EQ(size - offset, filledSize);
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWhenPtrWithOffsetAndSmallSizeAndPatternSizeIsOneThenTwoKernelsDispatched, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
uint32_t offset = 1;
const size_t size = 2;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr + offset, &pattern, 1, size - offset, nullptr, 0, nullptr);
size_t filledSize = commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16;
filledSize += commandList->xGroupSizes[1] * commandList->threadGroupDimensions[1].groupCountX;
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(2u, commandList->numberOfCallsToAppendLaunchKernelWithParams);
EXPECT_EQ(size - offset, filledSize);
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWhenPtrWithOffsetAndFailAppendUnalignedFillKernelThenReturnError, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->thresholdOfCallsToAppendLaunchKernelWithParamsToFail = 0;
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
uint32_t offset = 1;
const size_t size = 1024;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr + offset, &pattern, 1, size - offset, nullptr, 0, nullptr);
EXPECT_NE(ZE_RESULT_SUCCESS, result);
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenCallToAppendMemoryFillWithSizeNotMultipleOfPatternSizeThenSuccessIsReturned, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;

View File

@ -38,7 +38,7 @@ components:
dest_dir: kernels_bin
type: git
branch: kernels_bin
revision: 1941-284
revision: 1941-290
kmdaf:
branch: kmdaf
dest_dir: kmdaf

View File

@ -48,15 +48,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)

View File

@ -48,15 +48,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
size_t gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)

View File

@ -7,5 +7,5 @@
#include "shared/test/common/helpers/kernel_binary_helper.h"
const std::string KernelBinaryHelper::BUILT_INS("11256751929276190336");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("13967618058110882853_images");
const std::string KernelBinaryHelper::BUILT_INS("11060315779005710265");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("7912693229131574095_images");

View File

@ -156,15 +156,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)

View File

@ -156,15 +156,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)

View File

@ -156,15 +156,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)

View File

@ -156,15 +156,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)

View File

@ -156,15 +156,16 @@ __kernel void FillBufferRightLeftover(
}
__kernel void FillBufferImmediate(
__global uint4* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)