diff --git a/level_zero/core/source/builtin/builtin_functions_lib.h b/level_zero/core/source/builtin/builtin_functions_lib.h index eceb4d6395..4da14a61e9 100644 --- a/level_zero/core/source/builtin/builtin_functions_lib.h +++ b/level_zero/core/source/builtin/builtin_functions_lib.h @@ -29,6 +29,8 @@ enum class Builtin : uint32_t { CopyBufferToBufferSideStateless, FillBufferImmediate, FillBufferImmediateStateless, + FillBufferImmediateRightLeftOver, + FillBufferImmediateRightLeftOverStateless, FillBufferSSHOffset, FillBufferSSHOffsetStateless, FillBufferMiddle, diff --git a/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp b/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp index d300ed1567..c36d958175 100644 --- a/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp +++ b/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp @@ -79,6 +79,14 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) { builtinName = "FillBufferImmediate"; builtin = NEO::EBuiltInOps::FillBufferStateless; break; + case Builtin::FillBufferImmediateRightLeftOver: + builtinName = "FillBufferImmediateRightLeftOver"; + builtin = NEO::EBuiltInOps::FillBuffer; + break; + case Builtin::FillBufferImmediateRightLeftOverStateless: + builtinName = "FillBufferImmediateRightLeftOver"; + builtin = NEO::EBuiltInOps::FillBufferStateless; + break; case Builtin::FillBufferSSHOffset: builtinName = "FillBufferSSHOffset"; builtin = NEO::EBuiltInOps::FillBuffer; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index e879484c9b..8577df736e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1533,37 +1533,51 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } else { builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate); } - uint32_t groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize; - if (groupSizeX > static_cast(size)) { - groupSizeX = static_cast(size); + const auto dataTypeSize = sizeof(uint32_t) * 4; + size_t adjustedSize = std::max(static_cast(1u), size / (dataTypeSize)); + size_t groupSizeX = device->getDeviceInfo().maxWorkGroupSize; + if (groupSizeX > adjustedSize) { + groupSizeX = adjustedSize; } - if (builtinFunction->setGroupSize(groupSizeX, 1u, 1u)) { + if (builtinFunction->setGroupSize(static_cast(groupSizeX), 1u, 1u)) { DEBUG_BREAK_IF(true); return ZE_RESULT_ERROR_UNKNOWN; } - uint32_t value = *(reinterpret_cast(pattern)); + size_t groups = adjustedSize / groupSizeX; + size_t remainingBytes = static_cast((adjustedSize % groupSizeX) * dataTypeSize + + size % dataTypeSize); + ze_group_count_t dispatchFuncArgs{static_cast(groups), 1u, 1u}; + + uint32_t value = 0; + memset(&value, *reinterpret_cast(pattern), 4); builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset); builtinFunction->setArgumentValue(2, sizeof(value), &value); appendEventForProfilingAllWalkers(signalEvent, true); - uint32_t groups = static_cast(size) / groupSizeX; - ze_group_count_t dispatchFuncArgs{groups, 1u, 1u}; res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncArgs, signalEvent, launchParams); if (res) { return res; } - uint32_t groupRemainderSizeX = static_cast(size) % groupSizeX; - if (groupRemainderSizeX) { - builtinFunction->setGroupSize(groupRemainderSizeX, 1u, 1u); - ze_group_count_t dispatchFuncRemainderArgs{1u, 1u, 1u}; - - size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX); + if (remainingBytes) { + if (isStateless) { + builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateRightLeftOverStateless); + } else { + builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediateRightLeftOver); + } + uint32_t groupSizeY = 1, groupSizeZ = 1; + uint32_t groupSizeX = static_cast(remainingBytes); + builtinFunction->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ); + builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ); + ze_group_count_t dispatchFuncRemainderArgs{static_cast(remainingBytes / groupSizeX), 1u, 1u}; + size_t dstOffset = dstAllocation.offset + (size - remainingBytes); + value = *(reinterpret_cast(pattern)); builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset); + builtinFunction->setArgumentValue(2, sizeof(value), &value); res = appendLaunchKernelSplit(builtinFunction, &dispatchFuncRemainderArgs, signalEvent, launchParams); if (res) { @@ -1578,7 +1592,6 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } else { builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle); } - size_t middleElSize = sizeof(uint32_t); size_t adjustedSize = size / middleElSize; uint32_t groupSizeX = static_cast(adjustedSize); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp index ab1246ad33..0c9c03b0f3 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp @@ -53,14 +53,18 @@ class AppendFillFixture : public DeviceFixture { if (numberOfCallsToAppendLaunchKernelWithParams == thresholdOfCallsToAppendLaunchKernelWithParamsToFail) { return ZE_RESULT_ERROR_UNKNOWN; } - + if (numberOfCallsToAppendLaunchKernelWithParams < 2) { + threadGroupDimensions[numberOfCallsToAppendLaunchKernelWithParams] = *pThreadGroupDimensions; + xGroupSizes[numberOfCallsToAppendLaunchKernelWithParams] = kernel->getGroupSize()[0]; + } numberOfCallsToAppendLaunchKernelWithParams++; return CommandListCoreFamily::appendLaunchKernelWithParams(kernel, pThreadGroupDimensions, event, launchParams); } - + ze_group_count_t threadGroupDimensions[2]; + uint32_t xGroupSizes[2]; uint32_t thresholdOfCallsToAppendLaunchKernelWithParamsToFail = std::numeric_limits::max(); uint32_t numberOfCallsToAppendLaunchKernelWithParams = 0; }; @@ -77,6 +81,7 @@ class AppendFillFixture : public DeviceFixture { driverHandle = std::make_unique>(); driverHandle->initialize(std::move(devices)); device = driverHandle->devices[0]; + neoDevice->deviceInfo.maxWorkGroupSize = 256; } void tearDown() { @@ -189,6 +194,75 @@ HWTEST2_F(AppendFillTest, EXPECT_EQ(patternAllocationsVectorSize + 1u, newPatternAllocationsVectorSize); } +HWTEST2_F(AppendFillTest, + givenAppendMemoryFillWhenPatternSizeIsOneThenDispatchOneKernel, IsAtLeastSkl) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + int pattern = 0; + const size_t size = 1024 * 1024; + uint8_t *ptr = new uint8_t[size]; + ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(1u, commandList->numberOfCallsToAppendLaunchKernelWithParams); + EXPECT_EQ(size, commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16); + delete[] ptr; +} + +HWTEST2_F(AppendFillTest, + givenAppendMemoryFillWithUnalignedSizeWhenPatternSizeIsOneThenDispatchTwoKernels, IsAtLeastSkl) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + int pattern = 0; + const size_t size = 1025; + uint8_t *ptr = new uint8_t[size]; + ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr); + size_t filledSize = commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16; + filledSize += commandList->xGroupSizes[1] * commandList->threadGroupDimensions[1].groupCountX; + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(2u, commandList->numberOfCallsToAppendLaunchKernelWithParams); + EXPECT_EQ(size, filledSize); + delete[] ptr; +} + +HWTEST2_F(AppendFillTest, + givenAppendMemoryFillWithSizeBelowMaxWorkgroupSizeWhenPatternSizeIsOneThenDispatchOneKernel, IsAtLeastSkl) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + int pattern = 0; + const size_t size = neoDevice->getDeviceInfo().maxWorkGroupSize / 2; + uint8_t *ptr = new uint8_t[size]; + ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr); + size_t filledSize = commandList->xGroupSizes[0] * commandList->threadGroupDimensions[0].groupCountX * 16; + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(1u, commandList->numberOfCallsToAppendLaunchKernelWithParams); + EXPECT_EQ(size, filledSize); + delete[] ptr; +} + +HWTEST2_F(AppendFillTest, + givenAppendMemoryFillWhenPatternSizeIsOneThenGroupCountIsCorrect, IsAtLeastSkl) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + int pattern = 0; + const size_t size = 1024 * 1024; + uint8_t *ptr = new uint8_t[size]; + ze_result_t result = commandList->appendMemoryFill(ptr, &pattern, 1, size, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto groupSize = device->getDeviceInfo().maxWorkGroupSize; + auto dataTypeSize = sizeof(uint32_t) * 4; + auto expectedGroupCount = size / (dataTypeSize * groupSize); + EXPECT_EQ(expectedGroupCount, commandList->threadGroupDimensions[0].groupCountX); + delete[] ptr; +} + HWTEST2_F(AppendFillTest, givenCallToAppendMemoryFillWithSizeNotMultipleOfPatternSizeThenSuccessIsReturned, IsAtLeastSkl) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; diff --git a/manifests/manifest.yml b/manifests/manifest.yml index e2b6ce2e77..2ddf24c0aa 100644 --- a/manifests/manifest.yml +++ b/manifests/manifest.yml @@ -38,7 +38,7 @@ components: dest_dir: kernels_bin type: git branch: kernels_bin - revision: 1941-282 + revision: 1941-284 kmdaf: branch: kmdaf dest_dir: kmdaf diff --git a/shared/source/built_ins/kernels/fill_buffer.builtin_kernel b/shared/source/built_ins/kernels/fill_buffer.builtin_kernel index e5d0276f98..7690e01b41 100644 --- a/shared/source/built_ins/kernels/fill_buffer.builtin_kernel +++ b/shared/source/built_ins/kernels/fill_buffer.builtin_kernel @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -48,13 +48,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( - __global uchar* ptr, - uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + __global uint4* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - uint dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( + __global uchar* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset( diff --git a/shared/source/built_ins/kernels/fill_buffer_stateless.builtin_kernel b/shared/source/built_ins/kernels/fill_buffer_stateless.builtin_kernel index c12f619112..8b967bf8e5 100644 --- a/shared/source/built_ins/kernels/fill_buffer_stateless.builtin_kernel +++ b/shared/source/built_ins/kernels/fill_buffer_stateless.builtin_kernel @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -48,13 +48,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( + __global uint4* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + size_t gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( __global uchar* ptr, ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - size_t dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + size_t gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset( diff --git a/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp b/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp index 8a5f3971fe..624e690ab0 100644 --- a/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp +++ b/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp @@ -7,5 +7,5 @@ #include "shared/test/common/helpers/kernel_binary_helper.h" -const std::string KernelBinaryHelper::BUILT_INS("7998916142903730155"); -const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("16526264370178379440_images"); +const std::string KernelBinaryHelper::BUILT_INS("11256751929276190336"); +const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("13967618058110882853_images"); diff --git a/shared/test/common/test_files/7998916142903730155.cl b/shared/test/common/test_files/11256751929276190336.cl similarity index 96% rename from shared/test/common/test_files/7998916142903730155.cl rename to shared/test/common/test_files/11256751929276190336.cl index 88541d4f1f..208ba372e6 100644 --- a/shared/test/common/test_files/7998916142903730155.cl +++ b/shared/test/common/test_files/11256751929276190336.cl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -156,13 +156,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( - __global uchar* ptr, - uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + __global uint4* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - uint dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( + __global uchar* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset( diff --git a/shared/test/common/test_files/16526264370178379440_images_options.txt b/shared/test/common/test_files/11256751929276190336_options.txt similarity index 100% rename from shared/test/common/test_files/16526264370178379440_images_options.txt rename to shared/test/common/test_files/11256751929276190336_options.txt diff --git a/shared/test/common/test_files/16526264370178379440_images.cl b/shared/test/common/test_files/13967618058110882853_images.cl similarity index 98% rename from shared/test/common/test_files/16526264370178379440_images.cl rename to shared/test/common/test_files/13967618058110882853_images.cl index ae42cba09f..57025cbfc1 100644 --- a/shared/test/common/test_files/16526264370178379440_images.cl +++ b/shared/test/common/test_files/13967618058110882853_images.cl @@ -156,13 +156,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( - __global uchar* ptr, - uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + __global uint4* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - uint dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( + __global uchar* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset( diff --git a/shared/test/common/test_files/7998916142903730155_options.txt b/shared/test/common/test_files/13967618058110882853_images_options.txt similarity index 100% rename from shared/test/common/test_files/7998916142903730155_options.txt rename to shared/test/common/test_files/13967618058110882853_images_options.txt diff --git a/shared/test/common/test_files/builtin_copyfill.cl b/shared/test/common/test_files/builtin_copyfill.cl index 88541d4f1f..ca4a73808f 100644 --- a/shared/test/common/test_files/builtin_copyfill.cl +++ b/shared/test/common/test_files/builtin_copyfill.cl @@ -156,13 +156,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( + __global uint4* ptr, + uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( __global uchar* ptr, uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - uint dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset( diff --git a/shared/test/common/test_files/builtin_copyfill_stateless.cl b/shared/test/common/test_files/builtin_copyfill_stateless.cl index ab34f32b30..9296207d4f 100644 --- a/shared/test/common/test_files/builtin_copyfill_stateless.cl +++ b/shared/test/common/test_files/builtin_copyfill_stateless.cl @@ -156,13 +156,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( + __global uint4* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( __global uchar* ptr, ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - size_t dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset( diff --git a/shared/test/common/test_files/builtin_images.cl b/shared/test/common/test_files/builtin_images.cl index ca761fa941..213848fb98 100644 --- a/shared/test/common/test_files/builtin_images.cl +++ b/shared/test/common/test_files/builtin_images.cl @@ -156,13 +156,21 @@ __kernel void FillBufferRightLeftover( } __kernel void FillBufferImmediate( + __global uint4* ptr, + uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferImmediateRightLeftOver( __global uchar* ptr, uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment const uint value) { - uint dstIndex = get_global_id(0); - __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; - pDst[dstIndex] = value; + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; } __kernel void FillBufferSSHOffset(