fix: correct limitation for num threads per thread group

taking into account the max work group limit

Resolves: NEO-14922
Related-To: NEO-11881
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
2026-01-05 09:09:04 +08:00 · 2025-05-20 14:41:49 +00:00
parent 5f80490385
commit 6ad4ad41b1
26 changed files with 171 additions and 228 deletions
--- a/shared/source/helpers/aarch64/local_id_gen.cpp
+++ b/shared/source/helpers/aarch64/local_id_gen.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -44,9 +44,8 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;

 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
-    bool localIdsGeneratedByHw = false;
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
+    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
--- a/shared/source/helpers/gfx_core_helper.h
+++ b/shared/source/helpers/gfx_core_helper.h
@@ -120,7 +120,7 @@ class GfxCoreHelper {
    virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
    virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
                                             const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
-    virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
+    virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
    virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
    virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
    virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
@@ -166,7 +166,7 @@ class GfxCoreHelper {
    virtual bool isChipsetUniqueUUIDSupported() const = 0;
    virtual bool isTimestampShiftRequired() const = 0;
    virtual bool isRelaxedOrderingSupported() const = 0;
-    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
+    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
    virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
    virtual DeviceHierarchyMode getDefaultDeviceHierarchy() const = 0;
    static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
@@ -361,7 +361,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
    uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
                                     const RootDeviceEnvironment &rootDeviceEnvironment) const override;

-    uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
+    uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
    size_t getMaxFillPaternSizeForCopyEngine() const override;

    size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
@@ -415,7 +415,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
    bool isChipsetUniqueUUIDSupported() const override;
    bool isTimestampShiftRequired() const override;
    bool isRelaxedOrderingSupported() const override;
-    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
+    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
    uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
    DeviceHierarchyMode getDefaultDeviceHierarchy() const override;

--- a/shared/source/helpers/gfx_core_helper_base.inl
+++ b/shared/source/helpers/gfx_core_helper_base.inl
@@ -661,7 +661,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::overrideMaxWorkGroupSize(uint32_t maxWG) co
 }

 template <typename GfxFamily>
-uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
+uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
    return defaultMaxGroupSize;
 }

@@ -671,7 +671,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
 }

 template <typename GfxFamily>
-uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
+uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const {
    return getThreadsPerWG(simd, totalWorkItems);
 }

--- a/shared/source/helpers/gfx_core_helper_xe3_and_later.inl
+++ b/shared/source/helpers/gfx_core_helper_xe3_and_later.inl
@@ -25,4 +25,37 @@ uint32_t GfxCoreHelperHw<Family>::calculateAvailableThreadCount(const HardwareIn
    }
    return std::min(hwInfo.gtSystemInfo.ThreadCount, maxThreadsPerEuCount * hwInfo.gtSystemInfo.EUCount);
 }
+
+template <> // explicit specialization for this file's Family: caps threads per thread group by grfCount and simt
+uint32_t GfxCoreHelperHw<Family>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const {
+    uint32_t numThreadsPerThreadGroup = getThreadsPerWG(simd, totalWorkItems); // uncapped value derived from simd and totalWorkItems
+    if (debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.get() == 1) {
+        return numThreadsPerThreadGroup; // debug flag set to 1: return the uncapped value
+    }
+    auto simt = isSimd1(simd) ? 32u : simd; // SIMD1 is treated as width 32 for the limit selection below
+    const auto &compilerProductHelper = rootDeviceEnvironment.getHelper<CompilerProductHelper>();
+    const auto &productHelper = rootDeviceEnvironment.getProductHelper();
+    const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
+    auto isHeaplessMode = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
+
+    uint32_t maxThreadsPerThreadGroup = 32u; // fallback cap when none of the branches below match
+    // the first matching branch wins, so grfCount == 512 takes precedence over simt == 32
+    if (grfCount == 512) {
+        maxThreadsPerThreadGroup = 16u;
+    } else if ((grfCount == 256) || (simt == 32u)) {
+        // the driver limits maxWorkgroupSize to 1024 (NEO-11881), so for simt 32 the max threads per thread group is 32
+        maxThreadsPerThreadGroup = 32u;
+    } else if (grfCount == 192) {
+        maxThreadsPerThreadGroup = 40u;
+    } else if (grfCount == 160) {
+        maxThreadsPerThreadGroup = 48u;
+    } else if (grfCount <= 128) {
+        maxThreadsPerThreadGroup = 64u;
+    }
+
+    maxThreadsPerThreadGroup = productHelper.adjustMaxThreadsPerThreadGroup(maxThreadsPerThreadGroup, simt, grfCount, isHeaplessMode); // product helper may adjust the cap
+    DEBUG_BREAK_IF(maxThreadsPerThreadGroup * simt > CommonConstants::maxWorkgroupSize); // flags a cap whose cap * simt exceeds CommonConstants::maxWorkgroupSize
+    return std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup); // clamp the computed thread count to the cap
+}
+
 } // namespace NEO
--- a/shared/source/helpers/per_thread_data.h
+++ b/shared/source/helpers/per_thread_data.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,14 +24,13 @@ struct PerThreadDataHelper {
        uint32_t grfCount,
        uint32_t numChannels,
        size_t localWorkSize,
-        bool isHwLocalIdGeneration,
        const RootDeviceEnvironment &rootDeviceEnvironment) {
        auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
        if (isSimd1(simd)) {
            return perThreadSizeLocalIDs * localWorkSize;
        }
        auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-        return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, isHwLocalIdGeneration, rootDeviceEnvironment);
+        return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, rootDeviceEnvironment);
    }
 }; // namespace PerThreadDataHelper
 } // namespace NEO
--- a/shared/source/helpers/x86_64/local_id_gen.cpp
+++ b/shared/source/helpers/x86_64/local_id_gen.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -47,9 +47,8 @@ LocalIDHelper LocalIDHelper::initializer;

 // traditional function to generate local IDs
 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
-    bool localIdsGeneratedByHw = false;
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
+    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);