Revert "Use uint4 type instead of char in builtin kernel"

This reverts commit a39bc7e7b3.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation
2022-09-01 05:04:43 +02:00
committed by Compute-Runtime-Automation
parent 6e5c0141b5
commit 84f19a1b93
15 changed files with 44 additions and 197 deletions

View File

@ -29,8 +29,6 @@ enum class Builtin : uint32_t {
CopyBufferToBufferSideStateless, CopyBufferToBufferSideStateless,
FillBufferImmediate, FillBufferImmediate,
FillBufferImmediateStateless, FillBufferImmediateStateless,
FillBufferImmediateRightLeftOver,
FillBufferImmediateRightLeftOverStateless,
FillBufferSSHOffset, FillBufferSSHOffset,
FillBufferSSHOffsetStateless, FillBufferSSHOffsetStateless,
FillBufferMiddle, FillBufferMiddle,

View File

@ -79,14 +79,6 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtinName = "FillBufferImmediate"; builtinName = "FillBufferImmediate";
builtin = NEO::EBuiltInOps::FillBufferStateless; builtin = NEO::EBuiltInOps::FillBufferStateless;
break; break;
case Builtin::FillBufferImmediateRightLeftOver:
builtinName = "FillBufferImmediateRightLeftOver";
builtin = NEO::EBuiltInOps::FillBuffer;
break;
case Builtin::FillBufferImmediateRightLeftOverStateless:
builtinName = "FillBufferImmediateRightLeftOver";
builtin = NEO::EBuiltInOps::FillBufferStateless;
break;
case Builtin::FillBufferSSHOffset: case Builtin::FillBufferSSHOffset:
builtinName = "FillBufferSSHOffset"; builtinName = "FillBufferSSHOffset";
builtin = NEO::EBuiltInOps::FillBuffer; builtin = NEO::EBuiltInOps::FillBuffer;

View File

@ -1533,51 +1533,37 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
} else { } else {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate); builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
} }
const auto dataTypeSize = sizeof(uint32_t) * 4; uint32_t groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize;
size_t adjustedSize = std::max(static_cast<size_t>(1u), size / (dataTypeSize)); if (groupSizeX > static_cast<uint32_t>(size)) {
size_t groupSizeX = device->getDeviceInfo().maxWorkGroupSize; groupSizeX = static_cast<uint32_t>(size);
if (groupSizeX > adjustedSize) {
groupSizeX = adjustedSize;
} }
if (builtinFunction->setGroupSize(static_cast<uint32_t>(groupSizeX), 1u, 1u)) { if (builtinFunction->setGroupSize(groupSizeX, 1u, 1u)) {
DEBUG_BREAK_IF(true); DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_UNKNOWN; return ZE_RESULT_ERROR_UNKNOWN;
} }
size_t groups = adjustedSize / groupSizeX; uint32_t value = *(reinterpret_cast<const unsigned char *>(pattern));
size_t remainingBytes = static_cast<size_t>((adjustedSize % groupSizeX) * dataTypeSize +
size % dataTypeSize);
ze_group_count_t dispatchFuncArgs{static_cast<uint32_t>(groups), 1u, 1u};
uint32_t value = 0;
memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset); builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinFunction->setArgumentValue(2, sizeof(value), &value); builtinFunction->setArgumentValue(2, sizeof(value), &value);
appendEventForProfilingAllWalkers(signalEvent, true); appendEventForProfilingAllWalkers(signalEvent, true);
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncArgs, signalEvent, launchParams); res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncArgs, signalEvent, launchParams);
if (res) { if (res) {
return res; return res;
} }
if (remainingBytes) { uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;
if (isStateless) { if (groupRemainderSizeX) {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateRightLeftOverStateless); builtinFunction->setGroupSize(groupRemainderSizeX, 1u, 1u);
} else { ze_group_count_t dispatchFuncRemainderArgs{1u, 1u, 1u};
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateRightLeftOver);
} size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX);
uint32_t groupSizeY = 1, groupSizeZ = 1;
uint32_t groupSizeX = static_cast<uint32_t>(remainingBytes);
builtinFunction->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);
ze_group_count_t dispatchFuncRemainderArgs{static_cast<uint32_t>(remainingBytes / groupSizeX), 1u, 1u};
size_t dstOffset = dstAllocation.offset + (size - remainingBytes);
value = *(reinterpret_cast<const unsigned char *>(pattern));
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset); builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);
builtinFunction->setArgumentValue(2, sizeof(value), &value);
res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncRemainderArgs, signalEvent, launchParams); res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncRemainderArgs, signalEvent, launchParams);
if (res) { if (res) {
@ -1592,6 +1578,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
} else { } else {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle); builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
} }
size_t middleElSize = sizeof(uint32_t); size_t middleElSize = sizeof(uint32_t);
size_t adjustedSize = size / middleElSize; size_t adjustedSize = size / middleElSize;
uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize); uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);

View File

@ -53,18 +53,14 @@ class AppendFillFixture : public DeviceFixture {
if (numberOfCallsToAppendLaunchKernelWithParams == thresholdOfCallsToAppendLaunchKernelWithParamsToFail) { if (numberOfCallsToAppendLaunchKernelWithParams == thresholdOfCallsToAppendLaunchKernelWithParamsToFail) {
return ZE_RESULT_ERROR_UNKNOWN; return ZE_RESULT_ERROR_UNKNOWN;
} }
if (numberOfCallsToAppendLaunchKernelWithParams < 2) {
threadGroupDimensions[numberOfCallsToAppendLaunchKernelWithParams] = *pThreadGroupDimensions;
xGroupSizes[numberOfCallsToAppendLaunchKernelWithParams] = kernel->getGroupSize()[0];
}
numberOfCallsToAppendLaunchKernelWithParams++; numberOfCallsToAppendLaunchKernelWithParams++;
return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(kernel, return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(kernel,
pThreadGroupDimensions, pThreadGroupDimensions,
event, event,
launchParams); launchParams);
} }
ze_group_count_t threadGroupDimensions[2];
uint32_t xGroupSizes[2];
uint32_t thresholdOfCallsToAppendLaunchKernelWithParamsToFail = std::numeric_limits<uint32_t>::max(); uint32_t thresholdOfCallsToAppendLaunchKernelWithParamsToFail = std::numeric_limits<uint32_t>::max();
uint32_t numberOfCallsToAppendLaunchKernelWithParams = 0; uint32_t numberOfCallsToAppendLaunchKernelWithParams = 0;
}; };
@ -81,7 +77,6 @@ class AppendFillFixture : public DeviceFixture {
driverHandle = std::make_unique<Mock<MockDriverFillHandle>>(); driverHandle = std::make_unique<Mock<MockDriverFillHandle>>();
driverHandle->initialize(std::move(devices)); driverHandle->initialize(std::move(devices));
device = driverHandle->devices[0]; device = driverHandle->devices[0];
neoDevice->deviceInfo.maxWorkGroupSize = 256;
} }
void tearDown() { void tearDown() {
@ -194,75 +189,6 @@ HWTEST2_F(AppendFillTest,
EXPECT_EQ(patternAllocationsVectorSize + 1u, newPatternAllocationsVectorSize); EXPECT_EQ(patternAllocationsVectorSize + 1u, newPatternAllocationsVectorSize);
} }
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWhenPatternSizeIsOneThenDispatchOneKernel, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
const size_t size = 1024 * 1024;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(1u, commandList->numberOfCallsToAppendLaunchKernelWithParams);
EXPECT_EQ(size, commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16);
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWithUnalignedSizeWhenPatternSizeIsOneThenDispatchTwoKernels, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
const size_t size = 1025;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr);
size_t filledSize = commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16;
filledSize += commandList->xGroupSizes[1] * commandList->threadGroupDimensions[1].groupCountX;
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(2u, commandList->numberOfCallsToAppendLaunchKernelWithParams);
EXPECT_EQ(size, filledSize);
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWithSizeBelowMaxWorkgroupSizeWhenPatternSizeIsOneThenDispatchOneKernel, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
const size_t size = neoDevice->getDeviceInfo().maxWorkGroupSize / 2;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr);
size_t filledSize = commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16;
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(1u, commandList->numberOfCallsToAppendLaunchKernelWithParams);
EXPECT_EQ(size, filledSize);
delete[] ptr;
}
HWTEST2_F(AppendFillTest,
givenAppendMemoryFillWhenPatternSizeIsOneThenGroupCountIsCorrect, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::Compute, 0u);
int pattern = 0;
const size_t size = 1024 * 1024;
uint8_t *ptr = new uint8_t[size];
ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto groupSize = device->getDeviceInfo().maxWorkGroupSize;
auto dataTypeSize = sizeof(uint32_t) * 4;
auto expectedGroupCount = size / (dataTypeSize * groupSize);
EXPECT_EQ(expectedGroupCount, commandList->threadGroupDimensions[0].groupCountX);
delete[] ptr;
}
HWTEST2_F(AppendFillTest, HWTEST2_F(AppendFillTest,
givenCallToAppendMemoryFillWithSizeNotMultipleOfPatternSizeThenSuccessIsReturned, IsAtLeastSkl) { givenCallToAppendMemoryFillWithSizeNotMultipleOfPatternSizeThenSuccessIsReturned, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily; using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;

View File

@ -38,7 +38,7 @@ components:
dest_dir: kernels_bin dest_dir: kernels_bin
type: git type: git
branch: kernels_bin branch: kernels_bin
revision: 1941-284 revision: 1941-282
kmdaf: kmdaf:
branch: kmdaf branch: kmdaf
dest_dir: kmdaf dest_dir: kmdaf

View File

@ -48,21 +48,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
uint gid = get_global_id(0); uint dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(

View File

@ -48,21 +48,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
size_t gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
size_t gid = get_global_id(0); size_t dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(

View File

@ -7,5 +7,5 @@
#include "shared/test/common/helpers/kernel_binary_helper.h" #include "shared/test/common/helpers/kernel_binary_helper.h"
const std::string KernelBinaryHelper::BUILT_INS("11256751929276190336"); const std::string KernelBinaryHelper::BUILT_INS("7998916142903730155");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("13967618058110882853_images"); const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("16526264370178379440_images");

View File

@ -156,21 +156,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
uint gid = get_global_id(0); uint dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2022 Intel Corporation * Copyright (C) 2020-2022 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@ -156,21 +156,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
uint gid = get_global_id(0); uint dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(

View File

@ -156,21 +156,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
uint gid = get_global_id(0); uint dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(

View File

@ -156,21 +156,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
uint gid = get_global_id(0); size_t dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(

View File

@ -156,21 +156,13 @@ __kernel void FillBufferRightLeftover(
} }
__kernel void FillBufferImmediate( __kernel void FillBufferImmediate(
__global uint4* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferImmediateRightLeftOver(
__global uchar* ptr, __global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value) const uint value)
{ {
uint gid = get_global_id(0); uint dstIndex = get_global_id(0);
(ptr + dstSshOffset)[gid] = value; __global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
pDst[dstIndex] = value;
} }
__kernel void FillBufferSSHOffset( __kernel void FillBufferSSHOffset(