mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-26 23:33:20 +08:00
performance: Use immediate fill for pattern sizes <= 4
Related-To: NEO-9729
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
bd9b458add
commit
100bec3fa8
@@ -2384,7 +2384,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
|
||||
|
||||
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
|
||||
|
||||
auto builtin = (patternSize == 1)
|
||||
bool useImmediateFill = patternSize == 1 || (patternSize <= 4 && (dstAllocation.offset % sizeof(uint32_t) == 0) && (size % (sizeof(uint32_t) * 4) == 0));
|
||||
auto builtin = useImmediateFill
|
||||
? BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferImmediate>(isStateless, isHeapless)
|
||||
: BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferMiddle>(isStateless, isHeapless);
|
||||
|
||||
@@ -2413,7 +2414,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
|
||||
launchParams.numKernelsInSplitLaunch++;
|
||||
}
|
||||
|
||||
if (patternSize == 1) {
|
||||
if (useImmediateFill) {
|
||||
launchParams.numKernelsInSplitLaunch++;
|
||||
if (fillArguments.leftRemainingBytes > 0) {
|
||||
res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
|
||||
@@ -2431,8 +2432,23 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
|
||||
|
||||
ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};
|
||||
|
||||
uint32_t value = 0;
|
||||
memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
|
||||
uint32_t value = 0u;
|
||||
|
||||
switch (patternSize) {
|
||||
case 1:
|
||||
memset(&value, *reinterpret_cast<const unsigned char *>(pattern), 4);
|
||||
break;
|
||||
case 2:
|
||||
memcpy(&value, pattern, 2);
|
||||
value <<= 16;
|
||||
memcpy(&value, pattern, 2);
|
||||
break;
|
||||
case 4:
|
||||
memcpy(&value, pattern, 4);
|
||||
break;
|
||||
default:
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
||||
builtinSetArgFill(builtinKernel, 0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
|
||||
builtinKernel->setArgumentValue(1, sizeof(fillArguments.mainOffset), &fillArguments.mainOffset);
|
||||
@@ -4048,7 +4064,8 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
|
||||
size_t dstSize,
|
||||
CmdListFillKernelArguments &outArguments,
|
||||
Kernel *kernel) {
|
||||
if (patternSize == 1) {
|
||||
constexpr auto dataTypeSize = sizeof(uint32_t) * 4;
|
||||
if (patternSize == 1 || (patternSize <= 4 && (baseOffset % sizeof(uint32_t) == 0) && (dstSize % dataTypeSize == 0))) {
|
||||
size_t middleSize = dstSize;
|
||||
outArguments.mainOffset = baseOffset;
|
||||
outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t));
|
||||
@@ -4059,7 +4076,6 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
|
||||
outArguments.leftRemainingBytes = 0;
|
||||
}
|
||||
|
||||
const auto dataTypeSize = sizeof(uint32_t) * 4;
|
||||
size_t adjustedSize = middleSize / dataTypeSize;
|
||||
outArguments.mainGroupSize = this->device->getDeviceInfo().maxWorkGroupSize;
|
||||
if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0) {
|
||||
|
||||
@@ -1372,7 +1372,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenUnalignePtrToFillWhenSettingFillPro
|
||||
MockCommandListImmediateHw<FamilyType::gfxCoreFamily> cmdList;
|
||||
cmdList.cmdQImmediate = queue.get();
|
||||
auto unalignedOffset = 2u;
|
||||
auto patternSize = 4u;
|
||||
auto patternSize = 8u;
|
||||
auto sizeToFill = 599u * patternSize;
|
||||
CmdListFillKernelArguments outArguments;
|
||||
cmdList.setupFillKernelArguments(unalignedOffset, patternSize, sizeToFill, outArguments, kernel.get());
|
||||
@@ -1386,7 +1386,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenAlignePtrToFillWhenSettingFillPrope
|
||||
MockCommandListImmediateHw<FamilyType::gfxCoreFamily> cmdList;
|
||||
cmdList.cmdQImmediate = queue.get();
|
||||
auto unalignedOffset = 4u;
|
||||
auto patternSize = 4u;
|
||||
auto patternSize = 8u;
|
||||
auto sizeToFill = 599u * patternSize;
|
||||
CmdListFillKernelArguments outArguments;
|
||||
cmdList.setupFillKernelArguments(unalignedOffset, patternSize, sizeToFill, outArguments, kernel.get());
|
||||
|
||||
@@ -56,12 +56,43 @@ HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithAppendLaunchKernelFailur
|
||||
EXPECT_NE(ZE_RESULT_SUCCESS, result);
|
||||
}
|
||||
|
||||
HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourButUnalignedSizeThenUseFill) {
|
||||
auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
|
||||
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
|
||||
for (const auto patternSize : {1, 2, 4}) {
|
||||
size_t patternAllocationsVectorSizeBefore = commandList->patternAllocations.size();
|
||||
CmdListMemoryCopyParams copyParams = {};
|
||||
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, nullptr, 0, nullptr, copyParams);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
size_t patternAllocationsVectorSize = commandList->patternAllocations.size();
|
||||
if (patternSize == 1) {
|
||||
EXPECT_EQ(patternAllocationsVectorSize, patternAllocationsVectorSizeBefore);
|
||||
} else {
|
||||
EXPECT_NE(patternAllocationsVectorSize, patternAllocationsVectorSizeBefore);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calls appendMemoryFill with pattern sizes 1, 2 and 4 over a 256-byte range and
// expects every call to succeed while commandList->patternAllocations stays empty.
// 256 is a multiple of 16 bytes (sizeof(uint32_t) * 4).
// NOTE(review): this presumably relies on `dstPtr` (from the fixture) resolving to a
// 4-byte-aligned destination offset so the immediate-fill path is taken — confirm.
HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourThenUseImmmediateFill) {
|
||||
auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
|
||||
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
|
||||
for (const auto patternSize : {1, 2, 4}) {
|
||||
CmdListMemoryCopyParams copyParams = {};
|
||||
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, 256, nullptr, 0, nullptr, copyParams);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
size_t patternAllocationsVectorSize = commandList->patternAllocations.size();
|
||||
EXPECT_EQ(patternAllocationsVectorSize, 0u);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AppendFillTest, givenTwoCallsToAppendMemoryFillWithSamePatternThenAllocationIsCreatedForEachCall) {
|
||||
auto commandList = std::make_unique<WhiteBox<MockCommandList<FamilyType::gfxCoreFamily>>>();
|
||||
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
|
||||
CmdListMemoryCopyParams copyParams = {};
|
||||
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 4, allocSize, nullptr, 0, nullptr, copyParams);
|
||||
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 8, allocSize, nullptr, 0, nullptr, copyParams);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
size_t patternAllocationsVectorSize = commandList->patternAllocations.size();
|
||||
EXPECT_EQ(patternAllocationsVectorSize, 1u);
|
||||
@@ -81,7 +112,7 @@ HWTEST_F(AppendFillTest, givenTwoCallsToAppendMemoryFillWithDifferentPatternsThe
|
||||
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
|
||||
CmdListMemoryCopyParams copyParams = {};
|
||||
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 4, allocSize, nullptr, 0, nullptr, copyParams);
|
||||
ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 8, allocSize, nullptr, 0, nullptr, copyParams);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
size_t patternAllocationsVectorSize = commandList->patternAllocations.size();
|
||||
EXPECT_EQ(patternAllocationsVectorSize, 1u);
|
||||
|
||||
@@ -2853,7 +2853,7 @@ HWTEST_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppendi
|
||||
EXPECT_TRUE(verifyTokenCheck(1));
|
||||
|
||||
offset = cmdStream->getUsed();
|
||||
immCmdList->appendMemoryFill(alloc, alloc, 2, 2, nullptr, 0, nullptr, copyParams);
|
||||
immCmdList->appendMemoryFill(alloc, alloc, 8, 8, nullptr, 0, nullptr, copyParams);
|
||||
EXPECT_TRUE(verifyTokenCheck(1));
|
||||
|
||||
offset = cmdStream->getUsed();
|
||||
|
||||
Reference in New Issue
Block a user