performance: Use immediate fill for pattern sizes <= 4

Related-To: NEO-9729

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-06-26 09:15:22 +00:00
committed by Compute-Runtime-Automation
parent bd9b458add
commit 100bec3fa8
4 changed files with 58 additions and 11 deletions

View File

@@ -2384,7 +2384,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
auto builtin = (patternSize == 1)
bool useImmediateFill = patternSize == 1 || (patternSize <= 4 && (dstAllocation.offset % sizeof(uint32_t) == 0) && (size % (sizeof(uint32_t) * 4) == 0));
auto builtin = useImmediateFill
? BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferImmediate>(isStateless, isHeapless)
: BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferMiddle>(isStateless, isHeapless);
@@ -2413,7 +2414,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
launchParams.numKernelsInSplitLaunch++;
}
if (patternSize == 1) {
if (useImmediateFill) {
launchParams.numKernelsInSplitLaunch++;
if (fillArguments.leftRemainingBytes > 0) {
res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
@@ -2431,8 +2432,23 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};
uint32_t value = 0;
memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
// Build a 32-bit value by replicating the user pattern across all four bytes
// (presumably handed to the immediate fill kernel afterwards — the use is outside this hunk).
uint32_t value = 0u;
switch (patternSize) {
case 1:
// Single-byte pattern: repeat the byte in each of the four bytes of value.
memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
break;
case 2:
// Two-byte pattern: copy it into the first two bytes, shift it up by 16 bits,
// then copy it into the first two bytes again so both halves hold the pattern.
// NOTE(review): this produces pattern|pattern in memory only on a little-endian
// host; on big-endian the shift discards the first copy — confirm host endianness.
memcpy(&value, pattern, 2);
value <<= 16;
memcpy(&value, pattern, 2);
break;
case 4:
// Four-byte pattern already fills the whole value.
memcpy(&value, pattern, 4);
break;
default:
// NOTE(review): the useImmediateFill predicate accepts any patternSize <= 4, so a
// patternSize of 0 or 3 with a 4-byte-aligned offset and a size that is a multiple
// of 16 would reach this branch and hit UNRECOVERABLE_IF — confirm such sizes are
// rejected before this point, or tighten the predicate to sizes 1, 2 and 4.
UNRECOVERABLE_IF(true);
}
builtinSetArgFill(builtinKernel, 0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinKernel->setArgumentValue(1, sizeof(fillArguments.mainOffset), &fillArguments.mainOffset);
@@ -4048,7 +4064,8 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
size_t dstSize,
CmdListFillKernelArguments &outArguments,
Kernel *kernel) {
if (patternSize == 1) {
constexpr auto dataTypeSize = sizeof(uint32_t) * 4;
if (patternSize == 1 || (patternSize <= 4 && (baseOffset % sizeof(uint32_t) == 0) && (dstSize % dataTypeSize == 0))) {
size_t middleSize = dstSize;
outArguments.mainOffset = baseOffset;
outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t));
@@ -4059,7 +4076,6 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
outArguments.leftRemainingBytes = 0;
}
const auto dataTypeSize = sizeof(uint32_t) * 4;
size_t adjustedSize = middleSize / dataTypeSize;
outArguments.mainGroupSize = this->device->getDeviceInfo().maxWorkGroupSize;
if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0) {

View File

@@ -1372,7 +1372,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenUnalignePtrToFillWhenSettingFillPro
MockCommandListImmediateHw<FamilyType::gfxCoreFamily> cmdList;
cmdList.cmdQImmediate = queue.get();
auto unalignedOffset = 2u;
auto patternSize = 4u;
auto patternSize = 8u;
auto sizeToFill = 599u * patternSize;
CmdListFillKernelArguments outArguments;
cmdList.setupFillKernelArguments(unalignedOffset, patternSize, sizeToFill, outArguments, kernel.get());
@@ -1386,7 +1386,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenAlignePtrToFillWhenSettingFillPrope
MockCommandListImmediateHw<FamilyType::gfxCoreFamily> cmdList;
cmdList.cmdQImmediate = queue.get();
auto unalignedOffset = 4u;
auto patternSize = 4u;
auto patternSize = 8u;
auto sizeToFill = 599u * patternSize;
CmdListFillKernelArguments outArguments;
cmdList.setupFillKernelArguments(unalignedOffset, patternSize, sizeToFill, outArguments, kernel.get());

View File

@@ -56,12 +56,43 @@ HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithAppendLaunchKernelFailur
EXPECT_NE(ZE_RESULT_SUCCESS, result);
}
// For pattern sizes 1, 2 and 4 filled over allocSize bytes, checks that the fill succeeds and
// that only the multi-byte patterns add an entry to patternAllocations.
// NOTE(review): allocSize comes from the fixture; per the test name it is presumably not a
// multiple of 16 bytes, which is what keeps sizes 2 and 4 off the immediate path — confirm.
HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourButUnalignedSizeThenUseFill) {
    auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
    commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);

    for (const auto patternSize : {1, 2, 4}) {
        const size_t allocationsBefore = commandList->patternAllocations.size();

        CmdListMemoryCopyParams copyParams = {};
        ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, nullptr, 0, nullptr, copyParams);
        EXPECT_EQ(ZE_RESULT_SUCCESS, result);

        const size_t allocationsAfter = commandList->patternAllocations.size();
        if (patternSize != 1) {
            // Multi-byte pattern: the allocation list must have changed.
            EXPECT_NE(allocationsAfter, allocationsBefore);
        } else {
            // Single-byte pattern: no pattern allocation is added.
            EXPECT_EQ(allocationsAfter, allocationsBefore);
        }
    }
}
// For pattern sizes 1, 2 and 4 filled over 256 bytes (a multiple of 16), checks that the fill
// succeeds and that patternAllocations stays empty after every call.
HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourThenUseImmmediateFill) {
    auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
    commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);

    for (const auto patternSize : {1, 2, 4}) {
        CmdListMemoryCopyParams copyParams = {};
        ze_result_t fillResult = commandList->appendMemoryFill(dstPtr, pattern, patternSize, 256, nullptr, 0, nullptr, copyParams);
        EXPECT_EQ(ZE_RESULT_SUCCESS, fillResult);

        // No pattern allocation may have been created by any call so far.
        size_t allocationCount = commandList->patternAllocations.size();
        EXPECT_EQ(allocationCount, 0u);
    }
}
HWTEST_F(AppendFillTest, givenTwoCallsToAppendMemoryFillWithSamePatternThenAllocationIsCreatedForEachCall) {
auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
CmdListMemoryCopyParams copyParams = {};
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 4, allocSize, nullptr, 0, nullptr, copyParams);
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 8, allocSize, nullptr, 0, nullptr, copyParams);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t patternAllocationsVectorSize = commandList->patternAllocations.size();
EXPECT_EQ(patternAllocationsVectorSize, 1u);
@@ -81,7 +112,7 @@ HWTEST_F(AppendFillTest, givenTwoCallsToAppendMemoryFillWithDifferentPatternsThe
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
CmdListMemoryCopyParams copyParams = {};
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 4, allocSize, nullptr, 0, nullptr, copyParams);
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 8, allocSize, nullptr, 0, nullptr, copyParams);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t patternAllocationsVectorSize = commandList->patternAllocations.size();
EXPECT_EQ(patternAllocationsVectorSize, 1u);

View File

@@ -2853,7 +2853,7 @@ HWTEST_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppendi
EXPECT_TRUE(verifyTokenCheck(1));
offset = cmdStream->getUsed();
immCmdList->appendMemoryFill(alloc, alloc, 2, 2, nullptr, 0, nullptr, copyParams);
immCmdList->appendMemoryFill(alloc, alloc, 8, 8, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(verifyTokenCheck(1));
offset = cmdStream->getUsed();