fix: Correct alignment check for immediate fill v2

Resolves: HSD-18042731538

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-07-07 10:36:55 +00:00
committed by Compute-Runtime-Automation
parent f2bd2d3716
commit 67462c4356
2 changed files with 26 additions and 4 deletions

View File

@@ -2308,6 +2308,13 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendUnalignedFillKernel(bool
return ZE_RESULT_SUCCESS;
}
// Decides whether a memory fill can be served by the immediate-fill builtin.
//
// size        - number of bytes to fill
// patternSize - size of the fill pattern in bytes
// offset      - offset of the destination within its allocation
// maxWgSize   - device maximum work-group size
//
// Returns true when patternSize is exactly 1, or when patternSize is at most 4,
// offset is aligned to sizeof(uint32_t), size is aligned to 4 * sizeof(uint32_t),
// and either size does not exceed maxWgSize or the number of
// 4 * sizeof(uint32_t) elements in size passes isAligned against maxWgSize.
inline bool canUseImmediateFill(size_t size, size_t patternSize, size_t offset, size_t maxWgSize) {
    // A single-byte pattern is always eligible, regardless of size or offset.
    if (patternSize == 1) {
        return true;
    }
    // Patterns wider than 4 bytes are never eligible.
    if (patternSize > 4) {
        return false;
    }

    constexpr size_t elementSize = sizeof(uint32_t) * 4;

    if (!isAligned<sizeof(uint32_t)>(offset)) {
        return false;
    }
    if (!isAligned<elementSize>(size)) {
        return false;
    }

    // NOTE(review): the first comparison uses size in bytes, while the alignment
    // check uses the element count (size / elementSize) - confirm this is intended.
    if (size <= maxWgSize) {
        return true;
    }
    return isAligned(size / elementSize, maxWgSize);
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
const void *pattern,
@@ -2392,8 +2399,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
const auto maxWgSize = this->device->getDeviceInfo().maxWorkGroupSize;
bool useImmediateFill = patternSize == 1 || (patternSize <= 4 && isAligned<sizeof(uint32_t)>(dstAllocation.offset) && isAligned<sizeof(uint32_t) * 4>(size) && (size <= maxWgSize || isAligned(size, maxWgSize)));
bool useImmediateFill = canUseImmediateFill(size, patternSize, dstAllocation.offset, this->device->getDeviceInfo().maxWorkGroupSize);
auto builtin = useImmediateFill
? BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferImmediate>(isStateless, isHeapless)
: BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferMiddle>(isStateless, isHeapless);
@@ -4113,9 +4119,10 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
size_t dstSize,
CmdListFillKernelArguments &outArguments,
Kernel *kernel) {
constexpr auto dataTypeSize = sizeof(uint32_t) * 4;
const auto maxWgSize = this->device->getDeviceInfo().maxWorkGroupSize;
if (patternSize == 1 || (patternSize <= 4 && isAligned<sizeof(uint32_t)>(baseOffset) && isAligned<dataTypeSize>(dstSize) && (dstSize <= maxWgSize || isAligned(dstSize, maxWgSize)))) {
if (canUseImmediateFill(dstSize, patternSize, baseOffset, maxWgSize)) {
constexpr auto dataTypeSize = sizeof(uint32_t) * 4;
size_t middleSize = dstSize;
outArguments.mainOffset = baseOffset;
outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t));

View File

@@ -48,6 +48,21 @@ HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithAppendLaunchKernelFailur
EXPECT_NE(ZE_RESULT_SUCCESS, result);
}
// Appends a fill with a 4-byte pattern and a size that is one byte past
// 4 * sizeof(uint32_t) * maxWorkGroupSize. Expects the append to succeed, the
// command list's patternTags container to change size, and no pattern
// allocation to be recorded.
HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithDataSizeNotAlignedToBothSizeOfFillDataAndMaxWgsThenUseFill) {
    auto cmdList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
    cmdList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);

    const auto fillPatternSize = 4;
    const auto maxWgSize = device->getDeviceInfo().maxWorkGroupSize;
    // The trailing +1 makes the size not a multiple of 4 * sizeof(uint32_t).
    const auto fillSize = sizeof(uint32_t) * 4 * maxWgSize + 1;

    const size_t tagsCountBefore = cmdList->patternTags.size();

    CmdListMemoryCopyParams copyParams = {};
    const ze_result_t result = cmdList->appendMemoryFill(dstPtr, pattern, fillPatternSize, fillSize, nullptr, 0, nullptr, copyParams);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    const size_t tagsCountAfter = cmdList->patternTags.size();
    EXPECT_NE(tagsCountAfter, tagsCountBefore);
    EXPECT_EQ(0u, cmdList->patternAllocations.size());
}
HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourButUnalignedSizeThenUseFill) {
auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);