Revert "fix: correct limitation for num threads per thread group"

This reverts commit 6ad4ad41b1.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
2026-01-05 18:06:32 +08:00 · 2025-05-23 02:11:46 +02:00
parent d6849a5605
commit 593c9e76f2
26 changed files with 223 additions and 166 deletions
--- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp
+++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp
@@ -1649,16 +1649,18 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT
    constexpr auto defaultMaxGroupSize = 1024u;

    uint32_t simdSize = 16u;
+    uint32_t isHwLocalIdGeneration = true;
    uint32_t numGrfRequired = GrfConfig::largeGrfNumber;
-    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment));
+    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));

    simdSize = 32u;
    numGrfRequired = GrfConfig::largeGrfNumber;
-    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment));
+    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));

    simdSize = 16u;
+    isHwLocalIdGeneration = false;
    numGrfRequired = GrfConfig::defaultGrfNumber;
-    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment));
+    EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
 }

 HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
@@ -1676,7 +1678,7 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe
    }};

    for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) {
-        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, rootDeviceEnvironment));
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true, rootDeviceEnvironment));
    }
 }

@@ -1686,19 +1688,19 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();

-    std::array<std::array<uint32_t, 4>, 8> values = {{
-        {32u, 32u, 128u, 1u}, // SIMT Size, totalWorkItems,Grf size, Max Num of threads
-        {32u, 64u, 32u, 2u},
-        {32u, 128u, 256u, 4u},
-        {32u, 1024u, 128u, 32u},
-        {16u, 32u, 32u, 2u},
-        {16u, 64u, 256u, 4u},
-        {16u, 128u, 128u, 8u},
-        {16u, 1024u, 256u, 64u},
+    std::array<std::array<uint32_t, 5>, 8> values = {{
+        {32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Grf size, Hw local id generation, Max Num of threads
+        {32u, 64u, 32u, 1, 2u},
+        {32u, 128u, 256u, 1, 4u},
+        {32u, 1024u, 128u, 1, 32u},
+        {16u, 32u, 32u, 0, 2u},
+        {16u, 64u, 256u, 0, 4u},
+        {16u, 128u, 128u, 0, 8u},
+        {16u, 1024u, 256u, 0, 64u},
    }};

-    for (auto &[simtSize, totalWgSize, grfsize, expectedNumThreadsPerThreadGroup] : values) {
-        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, rootDeviceEnvironment));
+    for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, rootDeviceEnvironment));
    }
 }

--- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp
+++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp
@@ -81,7 +81,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP

    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
-    auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
+    auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
    EXPECT_EQ(localIdsSize + ImplicitArgsV0::getAlignedSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
 }

--- a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp
+++ b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp
@@ -807,17 +807,23 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenNumGrfAndSimdSizeWhenAdjus
    auto defaultMaxWorkGroupSize = 2048u;
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
-    std::array<std::array<uint32_t, 3>, 6> values = {{
-        {GrfConfig::defaultGrfNumber, 16u, 1024u}, // Grf Size, SIMT Size, Max Num of threads
-        {GrfConfig::defaultGrfNumber, 32u, 1024u},
-        {GrfConfig::largeGrfNumber, 16u, 512u},
-        {GrfConfig::largeGrfNumber, 32u, 1024u},
-        {GrfConfig::defaultGrfNumber, 1u, 32u},
-        {GrfConfig::largeGrfNumber, 1u, 32u},
+    std::array<std::array<uint32_t, 4>, 12> values = {{
+        {GrfConfig::defaultGrfNumber, 16u, 0u, 1024u}, // Num GRF, SIMT Size, HW local-id generation, expected max work-group size
+        {GrfConfig::defaultGrfNumber, 16u, 1u, 1024u},
+        {GrfConfig::defaultGrfNumber, 32u, 1u, 1024u},
+        {GrfConfig::defaultGrfNumber, 32u, 0u, 2048u},
+        {GrfConfig::largeGrfNumber, 16u, 0u, 512u},
+        {GrfConfig::largeGrfNumber, 16u, 1u, 512u},
+        {GrfConfig::largeGrfNumber, 32u, 0u, 1024u},
+        {GrfConfig::largeGrfNumber, 32u, 1u, 1024u},
+        {GrfConfig::defaultGrfNumber, 1u, 1u, 32u},
+        {GrfConfig::defaultGrfNumber, 1u, 0u, 64u},
+        {GrfConfig::largeGrfNumber, 1u, 0u, 32u},
+        {GrfConfig::largeGrfNumber, 1u, 1u, 32u},
    }};

-    for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) {
-        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, defaultMaxWorkGroupSize, rootDeviceEnvironment));
+    for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, isHwLocalIdGeneration, defaultMaxWorkGroupSize, rootDeviceEnvironment));
    }
 }

@@ -825,17 +831,23 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenParamsWhenCalculateNumThre
    auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
    auto totalWgSize = 2048u;
-    std::array<std::array<uint32_t, 3>, 6> values = {{
-        {GrfConfig::defaultGrfNumber, 16u, 64u}, // Grf Size, SIMT Size, Max Num of threads
-        {GrfConfig::defaultGrfNumber, 32u, 32u},
-        {GrfConfig::defaultGrfNumber, 1u, 32u},
-        {GrfConfig::largeGrfNumber, 16u, 32u},
-        {GrfConfig::largeGrfNumber, 32u, 32u},
-        {GrfConfig::largeGrfNumber, 1u, 32u},
+    std::array<std::array<uint32_t, 4>, 12> values = {{
+        {GrfConfig::defaultGrfNumber, 16u, 0u, 64u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads
+        {GrfConfig::defaultGrfNumber, 16u, 1u, 64u},
+        {GrfConfig::defaultGrfNumber, 32u, 1u, 32u},
+        {GrfConfig::defaultGrfNumber, 32u, 0u, 64u},
+        {GrfConfig::defaultGrfNumber, 1u, 1u, 32u},
+        {GrfConfig::defaultGrfNumber, 1u, 0u, 64u},
+        {GrfConfig::largeGrfNumber, 16u, 0u, 32u},
+        {GrfConfig::largeGrfNumber, 16u, 1u, 32u},
+        {GrfConfig::largeGrfNumber, 32u, 0u, 32u},
+        {GrfConfig::largeGrfNumber, 32u, 1u, 32u},
+        {GrfConfig::largeGrfNumber, 1u, 0u, 32u},
+        {GrfConfig::largeGrfNumber, 1u, 1u, 32u},
    }};

-    for (auto &[grfSize, simtSize, expectedNumThdreadsPerThreadGroup] : values) {
-        EXPECT_EQ(expectedNumThdreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, rootDeviceEnvironment));
+    for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThdreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThdreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalIdGeneration, rootDeviceEnvironment));
    }
 }

--- a/shared/test/unit_test/xe3_core/gfx_core_helper_xe3_core_tests.cpp
+++ b/shared/test/unit_test/xe3_core/gfx_core_helper_xe3_core_tests.cpp
@@ -754,26 +754,41 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenNumGrfAndSimdSizeWhenAdjustingMax
    auto defaultMaxWorkGroupSize = 2048u;
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
-    std::array<std::array<uint32_t, 3>, 15> values = {{
-        {128u, 16u, 1024u}, // Grf Size, SIMT Size, Max Num of threads
-        {128u, 32u, 1024u},
-        {160u, 16u, 768u},
-        {160u, 32u, 1024u},
-        {192u, 16u, 640u},
-        {192u, 32u, 1024u},
-        {256u, 16u, 512u},
-        {256u, 32u, 1024u},
-        {512u, 16u, 256u},
-        {512u, 32u, 512u},
-        {128u, 1u, 32u},
-        {160u, 1u, 32u},
-        {192u, 1u, 32u},
-        {256u, 1u, 32u},
-        {512u, 1u, 16u},
+    std::array<std::array<uint32_t, 4>, 30> values = {{
+        {128u, 16u, 0u, 1024u}, // Num GRF, SIMT Size, HW local-id generation, expected max work-group size
+        {128u, 16u, 1u, 1024u},
+        {128u, 32u, 1u, 1024u},
+        {128u, 32u, 0u, 2048u},
+        {160u, 16u, 0u, 768u},
+        {160u, 16u, 1u, 768u},
+        {160u, 32u, 1u, 1024u},
+        {160u, 32u, 0u, 1536u},
+        {192u, 16u, 0u, 640u},
+        {192u, 16u, 1u, 640u},
+        {192u, 32u, 1u, 1024u},
+        {192u, 32u, 0u, 1280u},
+        {256u, 16u, 0u, 512u},
+        {256u, 16u, 1u, 512u},
+        {256u, 32u, 1u, 1024u},
+        {256u, 32u, 0u, 1024u},
+        {512u, 16u, 0u, 256u},
+        {512u, 16u, 1u, 256u},
+        {512u, 32u, 1u, 512u},
+        {512u, 32u, 0u, 512u},
+        {128u, 1u, 1u, 32u},
+        {128u, 1u, 0u, 64u},
+        {160u, 1u, 1u, 32u},
+        {160u, 1u, 0u, 48u},
+        {192u, 1u, 1u, 32u},
+        {192u, 1u, 0u, 40u},
+        {256u, 1u, 1u, 32u},
+        {256u, 1u, 0u, 32u},
+        {512u, 1u, 1u, 16u},
+        {512u, 1u, 0u, 16u},
    }};

-    for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) {
-        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, defaultMaxWorkGroupSize, rootDeviceEnvironment));
+    for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, isHwLocalIdGeneration, defaultMaxWorkGroupSize, rootDeviceEnvironment));
    }
 }

@@ -786,26 +801,41 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenParamsWhenCalculateNumThreadsPerT
    auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
    auto totalWgSize = 2048u;
-    std::array<std::array<uint32_t, 3>, 15> values = {{
-        {128u, 16u, 64u}, // Grf Size, SIMT Size, Max Num of threads
-        {128u, 32u, 32u},
-        {128u, 1u, 32u},
-        {160u, 16u, 48u},
-        {160u, 32u, 32u},
-        {160u, 1u, 32u},
-        {192u, 16u, 40u},
-        {192u, 32u, 32u},
-        {192u, 1u, 32u},
-        {256u, 16u, 32u},
-        {256u, 32u, 32u},
-        {256u, 1u, 32u},
-        {512u, 16u, 16u},
-        {512u, 32u, 16u},
-        {512u, 1u, 16u},
+    std::array<std::array<uint32_t, 4>, 30> values = {{
+        {128u, 16u, 0u, 64u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads
+        {128u, 16u, 1u, 64u},
+        {128u, 32u, 1u, 32u},
+        {128u, 32u, 0u, 64u},
+        {128u, 1u, 1u, 32u},
+        {128u, 1u, 0u, 64u},
+        {160u, 16u, 0u, 48u},
+        {160u, 16u, 1u, 48u},
+        {160u, 32u, 1u, 32u},
+        {160u, 32u, 0u, 48u},
+        {160u, 1u, 1u, 32u},
+        {160u, 1u, 0u, 48u},
+        {192u, 16u, 0u, 40u},
+        {192u, 16u, 1u, 40u},
+        {192u, 32u, 1u, 32u},
+        {192u, 32u, 0u, 40u},
+        {192u, 1u, 1u, 32u},
+        {192u, 1u, 0u, 40u},
+        {256u, 16u, 0u, 32u},
+        {256u, 16u, 1u, 32u},
+        {256u, 32u, 1u, 32u},
+        {256u, 32u, 0u, 32u},
+        {256u, 1u, 1u, 32u},
+        {256u, 1u, 0u, 32u},
+        {512u, 16u, 0u, 16u},
+        {512u, 16u, 1u, 16u},
+        {512u, 32u, 1u, 16u},
+        {512u, 32u, 0u, 16u},
+        {512u, 1u, 1u, 16u},
+        {512u, 1u, 0u, 16u},
    }};

-    for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) {
-        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, rootDeviceEnvironment));
+    for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalIdGeneration, rootDeviceEnvironment));
    }
 }