From 100bec3fa896b1ba35dfe9c9626217c40edebd3c Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 26 Jun 2025 09:15:22 +0000 Subject: [PATCH] performance: Use immediate fill for pattern sizes <= 4 Related-To: NEO-9729 Signed-off-by: Lukasz Jobczyk --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 28 +++++++++++---- .../sources/cmdlist/test_cmdlist_8.cpp | 4 +-- .../sources/cmdlist/test_cmdlist_fill.cpp | 35 +++++++++++++++++-- .../cmdlist/test_in_order_cmdlist_2.cpp | 2 +- 4 files changed, 58 insertions(+), 11 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index ebd8aab9b8..adc89f93c9 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2384,7 +2384,8 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership(); - auto builtin = (patternSize == 1) + bool useImmediateFill = patternSize == 1 || (patternSize <= 4 && (dstAllocation.offset % sizeof(uint32_t) == 0) && (size % (sizeof(uint32_t) * 4) == 0)); + auto builtin = useImmediateFill ? BuiltinTypeHelper::adjustBuiltinType(isStateless, isHeapless) : BuiltinTypeHelper::adjustBuiltinType(isStateless, isHeapless); @@ -2413,7 +2414,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, launchParams.numKernelsInSplitLaunch++; } - if (patternSize == 1) { + if (useImmediateFill) { launchParams.numKernelsInSplitLaunch++; if (fillArguments.leftRemainingBytes > 0) { res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams); @@ -2431,8 +2432,23 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, ze_group_count_t dispatchKernelArgs{static_cast(fillArguments.groups), 1u, 1u}; - uint32_t value = 0; - memset(&value, *reinterpret_cast(pattern), 4); + uint32_t value = 0u; + + switch (patternSize) { + case 1: + memset(&value, *reinterpret_cast(pattern), 4); + break; + case 2: + memcpy(&value, pattern, 2); + value <<= 16; + memcpy(&value, pattern, 2); + break; + case 4: + memcpy(&value, pattern, 4); + break; + default: + UNRECOVERABLE_IF(true); + } builtinSetArgFill(builtinKernel, 0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinKernel->setArgumentValue(1, sizeof(fillArguments.mainOffset), &fillArguments.mainOffset); @@ -4048,7 +4064,8 @@ void CommandListCoreFamily::setupFillKernelArguments(size_t baseO size_t dstSize, CmdListFillKernelArguments &outArguments, Kernel *kernel) { - if (patternSize == 1) { + constexpr auto dataTypeSize = sizeof(uint32_t) * 4; + if (patternSize == 1 || (patternSize <= 4 && (baseOffset % sizeof(uint32_t) == 0) && (dstSize % dataTypeSize == 0))) { size_t middleSize = dstSize; outArguments.mainOffset = baseOffset; outArguments.leftRemainingBytes = sizeof(uint32_t) - (baseOffset % sizeof(uint32_t)); @@ -4059,7 +4076,6 @@ void CommandListCoreFamily::setupFillKernelArguments(size_t baseO outArguments.leftRemainingBytes = 0; } - const auto dataTypeSize = sizeof(uint32_t) * 4; size_t adjustedSize = middleSize / dataTypeSize; outArguments.mainGroupSize = this->device->getDeviceInfo().maxWorkGroupSize; if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_8.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_8.cpp index 7c6173ea37..46770df090 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_8.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_8.cpp @@ -1372,7 +1372,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenUnalignePtrToFillWhenSettingFillPro MockCommandListImmediateHw cmdList; cmdList.cmdQImmediate = queue.get(); auto unalignedOffset = 2u; - auto patternSize = 4u; + auto patternSize = 8u; auto sizeToFill = 599u * patternSize; CmdListFillKernelArguments outArguments; cmdList.setupFillKernelArguments(unalignedOffset, patternSize, sizeToFill, outArguments, kernel.get()); @@ -1386,7 +1386,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenAlignePtrToFillWhenSettingFillPrope MockCommandListImmediateHw cmdList; cmdList.cmdQImmediate = queue.get(); auto unalignedOffset = 4u; - auto patternSize = 4u; + auto patternSize = 8u; auto sizeToFill = 599u * patternSize; CmdListFillKernelArguments outArguments; cmdList.setupFillKernelArguments(unalignedOffset, patternSize, sizeToFill, outArguments, kernel.get()); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp index bf526c48cf..6a8705e1cb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp @@ -56,12 +56,43 @@ HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithAppendLaunchKernelFailur EXPECT_NE(ZE_RESULT_SUCCESS, result); } +HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourButUnalignedSizeThenUseFill) { + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u); + + for (const auto patternSize : {1, 2, 4}) { + size_t patternAllocationsVectorSizeBefore = commandList->patternAllocations.size(); + CmdListMemoryCopyParams copyParams = {}; + ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, nullptr, 0, nullptr, copyParams); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t patternAllocationsVectorSize = commandList->patternAllocations.size(); + if (patternSize == 1) { + EXPECT_EQ(patternAllocationsVectorSize, patternAllocationsVectorSizeBefore); + } else { + EXPECT_NE(patternAllocationsVectorSize, patternAllocationsVectorSizeBefore); + } + } +} + +HWTEST_F(AppendFillTest, givenCallToAppendMemoryFillWithPatternSizeLessOrEqualThanFourThenUseImmmediateFill) { + auto commandList = std::make_unique>>(); + commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u); + + for (const auto patternSize : {1, 2, 4}) { + CmdListMemoryCopyParams copyParams = {}; + ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, 256, nullptr, 0, nullptr, copyParams); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t patternAllocationsVectorSize = commandList->patternAllocations.size(); + EXPECT_EQ(patternAllocationsVectorSize, 0u); + } +} + HWTEST_F(AppendFillTest, givenTwoCallsToAppendMemoryFillWithSamePatternThenAllocationIsCreatedForEachCall) { auto commandList = std::make_unique>>(); commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u); CmdListMemoryCopyParams copyParams = {}; - ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 4, allocSize, nullptr, 0, nullptr, copyParams); + ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 8, allocSize, nullptr, 0, nullptr, copyParams); EXPECT_EQ(ZE_RESULT_SUCCESS, result); size_t patternAllocationsVectorSize = commandList->patternAllocations.size(); EXPECT_EQ(patternAllocationsVectorSize, 1u); @@ -81,7 +112,7 @@ HWTEST_F(AppendFillTest, givenTwoCallsToAppendMemoryFillWithDifferentPatternsThe commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u); CmdListMemoryCopyParams copyParams = {}; - ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 4, allocSize, nullptr, 0, nullptr, copyParams); + ze_result_t result = commandList->appendMemoryFill(dstPtr, pattern, 8, allocSize, nullptr, 0, nullptr, copyParams); EXPECT_EQ(ZE_RESULT_SUCCESS, result); size_t patternAllocationsVectorSize = commandList->patternAllocations.size(); EXPECT_EQ(patternAllocationsVectorSize, 1u); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index 61d644e469..aec886c73d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -2853,7 +2853,7 @@ HWTEST_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppendi EXPECT_TRUE(verifyTokenCheck(1)); offset = cmdStream->getUsed(); - immCmdList->appendMemoryFill(alloc, alloc, 2, 2, nullptr, 0, nullptr, copyParams); + immCmdList->appendMemoryFill(alloc, alloc, 8, 8, nullptr, 0, nullptr, copyParams); EXPECT_TRUE(verifyTokenCheck(1)); offset = cmdStream->getUsed();