feature: limit max LWS based on preferred number of workgroups per subslice

- limit max LWS size when SLM and barriers are not used

Related-To: GSD-11112
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
2026-01-01 04:23:00 +08:00 · 2025-05-16 12:30:58 +00:00
parent 8839d62c79
commit b03f625f03
9 changed files with 285 additions and 9 deletions
--- a/shared/source/helpers/local_work_size.cpp
+++ b/shared/source/helpers/local_work_size.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -360,6 +360,9 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
        workGroupSize[i] = 1;

    UNRECOVERABLE_IF(wsInfo.simdSize == 0);
+    uint64_t totalNumberOfItems = workItems[0] * workItems[1] * workItems[2];
+    auto optimalWgThreadCount = optimalHardwareThreadCountGeneric[0];
+    bool totalRequiredThreadGroupsMoreThanSingleThreadGroup = totalNumberOfItems > wsInfo.simdSize * optimalWgThreadCount;

    // Find biggest power of two which devide each dimension size
    if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
@@ -367,9 +370,14 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
            return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
        }

+        if (wsInfo.preferredWgCountPerSubSlice != 0 && wsInfo.simdSize == 32 && totalRequiredThreadGroupsMoreThanSingleThreadGroup) {
+            optimalWgThreadCount = std::min(optimalWgThreadCount, wsInfo.numThreadsPerSubSlice / wsInfo.preferredWgCountPerSubSlice);
+            wsInfo.maxWorkGroupSize = wsInfo.simdSize * optimalWgThreadCount;
+        }
+
        size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
        for (auto i = 0u; i < workDim; i++) {
-            uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
+            uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalWgThreadCount);
            while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
                requiredWorkItemsCount >>= 1;
            itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
@@ -382,7 +390,7 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
        // If computed dimension sizes which are powers of two are creating group which is
        // bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
        uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
-        if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) {
+        if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalWgThreadCount)) {
            return computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
        }
        // If coputed workgroup is at this point in correct size
@@ -394,9 +402,8 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
        }
    }

-    uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2];
    // If dimensions are not powers of two but total number of items is less than max work group size
-    if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) {
+    if (totalNumberOfItems <= wsInfo.maxWorkGroupSize) {
        for (auto i = 0u; i < workDim; i++)
            workGroupSize[i] = workItems[i];
        return;
--- a/shared/source/os_interface/product_helper.h
+++ b/shared/source/os_interface/product_helper.h
@@ -95,6 +95,7 @@ class ProductHelper {
    virtual bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const = 0;
    virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0;
    virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const = 0;
+    virtual uint32_t getPreferredWorkgroupCountPerSubslice() const = 0;
    virtual void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
    virtual void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
    virtual bool obtainBlitterPreference(const HardwareInfo &hwInfo) const = 0;
--- a/shared/source/os_interface/product_helper.inl
+++ b/shared/source/os_interface/product_helper.inl
@@ -247,6 +247,11 @@ uint32_t ProductHelperHw<gfxProduct>::getMaxThreadsForWorkgroup(const HardwareIn
    return maxNumEUsPerSubSlice * numThreadsPerEU;
 }

+template <PRODUCT_FAMILY gfxProduct>
+uint32_t ProductHelperHw<gfxProduct>::getPreferredWorkgroupCountPerSubslice() const {
+    return 0;
+}
+
 template <PRODUCT_FAMILY gfxProduct>
 void ProductHelperHw<gfxProduct>::setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const {}

--- a/shared/source/os_interface/product_helper_hw.h
+++ b/shared/source/os_interface/product_helper_hw.h
@@ -35,6 +35,7 @@ class ProductHelperHw : public ProductHelper {
    bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const override;
    uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override;
    uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
+    uint32_t getPreferredWorkgroupCountPerSubslice() const override;
    void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const override;
    void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const override;
    bool obtainBlitterPreference(const HardwareInfo &hwInfo) const override;
--- a/shared/source/program/work_size_info.cpp
+++ b/shared/source/program/work_size_info.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2023-2024 Intel Corporation
+ * Copyright (C) 2023-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -73,4 +73,8 @@ void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
    }
 }

+void WorkSizeInfo::setPreferredWgCountPerSubslice(uint32_t preferredWgCount) {
+    preferredWgCountPerSubSlice = preferredWgCount;
+}
+
 } // namespace NEO
--- a/shared/source/program/work_size_info.h
+++ b/shared/source/program/work_size_info.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2023 Intel Corporation
+ * Copyright (C) 2023-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,12 +29,14 @@ struct WorkSizeInfo {
    bool useRatio = false;
    bool useStrictRatio = false;
    float targetRatio = 0;
+    uint32_t preferredWgCountPerSubSlice = 0;

    WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const RootDeviceEnvironment &rootDeviceEnvironment, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface, bool disableEUFusion);

    void setIfUseImg(const KernelInfo &kernelInfo);
    void setMinWorkGroupSize(const RootDeviceEnvironment &rootDeviceEnvironment, bool disableEUFusion);
    void checkRatio(const size_t workItems[3]);
+    void setPreferredWgCountPerSubslice(uint32_t preferredWgCount);
 };

 } // namespace NEO
--- a/shared/test/unit_test/os_interface/product_helper_tests.cpp
+++ b/shared/test/unit_test/os_interface/product_helper_tests.cpp
@@ -1166,3 +1166,7 @@ HWTEST2_F(ProductHelperTest, givenProductHelperWhenCallingIsResourceUncachedForC
        }
    }
 }
+
+HWTEST_F(ProductHelperTest, givenProductHelperWhenGettingPreferredWorkgroupCountPerSubsliceThenZeroReturned) {
+    EXPECT_EQ(0u, productHelper->getPreferredWorkgroupCountPerSubslice());
+}