fix: add function to calculate number of threads per thread group

Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
2025-12-25 21:42:53 +08:00 · 2023-06-12 11:41:13 +00:00
parent 987394b27c
commit 7cb3278eb3
16 changed files with 71 additions and 21 deletions
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -317,10 +317,9 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
        }
    }

-    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
-    this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
    patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);

+    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
    auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
    threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
    if (!threadExecutionMask) {
@@ -328,6 +327,12 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
    }
    evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);

+    auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
+    auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
+    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
+    this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
+        simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime); // last argument is isHwLocalGeneration, the inverse of runtime generation
+
    if (kernelRequiresGenerationOfLocalIdsByRuntime) {
        auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
        uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
--- a/opencl/source/command_queue/gpgpu_walker_bdw_and_later.inl
+++ b/opencl/source/command_queue/gpgpu_walker_bdw_and_later.inl
@@ -29,10 +29,10 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
    bool localIdsGenerationByRuntime,
    bool inlineDataProgrammingRequired,
    uint32_t requiredWorkgroupOrder) {
-    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
+    auto localWorkSize = static_cast<uint32_t>(localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]);

    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
-    walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
+    walkerCmd->setThreadWidthCounterMaximum(threadsPerWorkGroup);

    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@@ -240,9 +240,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
                                                                       kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
                                                                       device.getRootDeviceEnvironment());
    }
-
+    auto &gfxCoreHelper = device.getGfxCoreHelper();
+    auto grfSize = device.getHardwareInfo().capabilityTable.grfSize;
    auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
-    auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
+    auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime); // last argument is isHwLocalGeneration, the inverse of runtime generation

    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
@@ -729,7 +729,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
    // only X is present
    auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf);
    sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
-    size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData;
+    size_t perThreadTotalDataSize = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * sizePerThreadData;

    uint32_t expectedIndirectDataLength = alignUp(static_cast<uint32_t>(perThreadTotalDataSize), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());
@@ -832,13 +832,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
    // only X is present
    uint32_t localIdSizePerThread = getPerThreadSizeLocalIDs(simd, sizeGrf);
    localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf);
-    auto sizePerThreadData = getThreadsPerWG(simd, lws[0]) * localIdSizePerThread;
+    auto sizePerThreadData = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * localIdSizePerThread;

    auto crossThreadDataSize = kernel->mockKernel->getCrossThreadDataSize();
    crossThreadDataSize -= std::min(static_cast<uint32_t>(sizeof(INLINE_DATA)), crossThreadDataSize);

    // second GRF in indirect
-    uint32_t expectedIndirectDataLength = static_cast<uint32_t>(sizePerThreadData + crossThreadDataSize);
+    uint32_t expectedIndirectDataLength = sizePerThreadData + crossThreadDataSize;
    expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());

--- a/shared/source/command_container/command_encoder_bdw_and_later.inl
+++ b/shared/source/command_container/command_encoder_bdw_and_later.inl
@@ -354,9 +354,9 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,

    walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));

-    auto localWorkSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
+    auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
    if (threadsPerThreadGroup == 0) {
-        threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkSize));
+        threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
    }
    walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);

--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -144,6 +144,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
 DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
+DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
 DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
--- a/shared/source/helpers/aarch64/local_id_gen.cpp
+++ b/shared/source/helpers/aarch64/local_id_gen.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -42,7 +42,7 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;

 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
-    auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
+    auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
--- a/shared/source/helpers/gfx_core_helper.h
+++ b/shared/source/helpers/gfx_core_helper.h
@@ -168,6 +168,7 @@ class GfxCoreHelper {
    virtual bool isChipsetUniqueUUIDSupported() const = 0;
    virtual bool isTimestampShiftRequired() const = 0;
    virtual bool isRelaxedOrderingSupported() const = 0;
+    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
    static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
    virtual ~GfxCoreHelper() = default;

@@ -380,6 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
    bool isChipsetUniqueUUIDSupported() const override;
    bool isTimestampShiftRequired() const override;
    bool isRelaxedOrderingSupported() const override;
+    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;

    ~GfxCoreHelperHw() override = default;

--- a/shared/source/helpers/gfx_core_helper_base.inl
+++ b/shared/source/helpers/gfx_core_helper_base.inl
@@ -16,6 +16,7 @@
 #include "shared/source/helpers/constants.h"
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/hw_info.h"
+#include "shared/source/helpers/local_id_gen.h"
 #include "shared/source/helpers/pipe_control_args.h"
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/source/memory_manager/allocation_properties.h"
@@ -685,4 +686,12 @@ template <typename GfxFamily>
 uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
    return 128u;
 }
+
+template <typename GfxFamily>
+uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
+    // ForceNumberOfThreadsInGpgpuThreadGroup == -1 means no override; any other value is returned as the thread count.
+    // grfSize and isHwLocalGeneration are not used by this base implementation.
+    const auto forcedNumThreads = DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
+    return (forcedNumThreads != -1) ? static_cast<uint32_t>(forcedNumThreads) : getThreadsPerWG(simd, totalWorkItems);
+}
 } // namespace NEO
--- a/shared/source/helpers/local_id_gen.h
+++ b/shared/source/helpers/local_id_gen.h
@@ -16,7 +16,7 @@ inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
    return (simd == 32 && grfSize == 32) ? 2 : 1;
 }

-inline size_t getThreadsPerWG(uint32_t simd, size_t lws) {
+inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
    auto result = lws + simd - 1;

    // Original logic:
--- a/shared/source/helpers/per_thread_data.h
+++ b/shared/source/helpers/per_thread_data.h
@@ -20,7 +20,7 @@ struct PerThreadDataHelper {
        uint32_t grfSize,
        uint32_t numChannels,
        size_t localWorkSize) {
-        return getThreadsPerWG(simd, localWorkSize) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
+        return getThreadsPerWG(simd, static_cast<uint32_t>(localWorkSize)) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
    }
 }; // namespace PerThreadDataHelper
 } // namespace NEO
--- a/shared/source/helpers/x86_64/local_id_gen.cpp
+++ b/shared/source/helpers/x86_64/local_id_gen.cpp
@@ -45,7 +45,7 @@ LocalIDHelper LocalIDHelper::initializer;

 // traditional function to generate local IDs
 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
-    auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
+    auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
--- a/shared/source/kernel/local_ids_cache.cpp
+++ b/shared/source/kernel/local_ids_cache.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022 Intel Corporation
+ * Copyright (C) 2022-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -33,9 +33,9 @@ std::unique_lock<std::mutex> LocalIdsCache::lock() {
 }

 size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group) const {
-    const auto numElementsInGroup = Math::computeTotalElementsCount({group[0], group[1], group[2]});
+    const auto numElementsInGroup = static_cast<uint32_t>(Math::computeTotalElementsCount({group[0], group[1], group[2]}));
    const auto numberOfThreads = getThreadsPerWG(simdSize, numElementsInGroup);
-    return numberOfThreads * static_cast<size_t>(localIdsSizePerThread);
+    return static_cast<size_t>(numberOfThreads) * static_cast<size_t>(localIdsSizePerThread); // widen both operands first so the product is computed in size_t
 }

 size_t LocalIdsCache::getLocalIdsSizePerThread() const {
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -534,4 +534,5 @@ PrintGlobalTimestampInNs = 0
 EnableDeviceStateVerification = -1
 VfBarResourceAllocationWa = 1
 EnableDynamicPostSyncAllocLayout = -1
+ForceNumberOfThreadsInGpgpuThreadGroup = -1
 # Please don't edit below this line
--- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp
+++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp
@@ -1568,3 +1568,34 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT
    numGrfRequired = GrfConfig::DefaultGrfNumber;
    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize));
 }
+
+HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
+    auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
+    std::array<std::array<uint32_t, 3>, 8> values = {{
+        {32u, 32u, 1u}, // SIMT Size, totalWorkItems, Max Num of threads
+        {32u, 64u, 2u},
+        {32u, 128u, 4u},
+        {32u, 1024u, 32u},
+        {16u, 32u, 2u},
+        {16u, 64u, 4u},
+        {16u, 128u, 8u},
+        {16u, 1024u, 64u},
+    }};
+
+    for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true));
+    }
+}
+
+HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
+    DebugManagerStateRestore dbgRestore;
+    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
+
+    uint32_t expectedNumThreadsPerThreadGroup = 10u;
+    DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
+    EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
+
+    expectedNumThreadsPerThreadGroup = 20u;
+    DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
+    EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
+}
--- a/shared/test/unit_test/helpers/local_id_tests.cpp
+++ b/shared/test/unit_test/helpers/local_id_tests.cpp
@@ -40,7 +40,7 @@ HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwo
 }

 TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
-    size_t lws = 33;
+    uint32_t lws = 33;
    uint32_t simd = 32;
    EXPECT_EQ(2u, getThreadsPerWG(simd, lws));
 }
@@ -313,7 +313,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
 }

 TEST_P(LocalIDFixture, WhenThreadsPerWgAreGeneratedThenSizeCalculationAreCorrect) {
-    auto workItems = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
+    auto workItems = static_cast<uint32_t>(localWorkSizeX * localWorkSizeY * localWorkSizeZ);
    auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd, grfSize);

    // Should be multiple of GRFs