fix: Use the proper value for HW local ID generation

- Remove the useless flag ForceNumberOfThreadsInGpgpuThreadGroup.
- Add a new flag, "RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup", to restore the old path without restrictions on the number of threads in a thread group.
- Fix the forwarding of information about HW local ID generation so that numOfThreadsInThreadGroup is calculated correctly.

Related-To: NEO-7952, NEO-7982
Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
2025-12-19 16:24:18 +08:00 · 2023-06-23 14:26:00 +00:00
parent aea5f435db
commit 68d81c82a7
9 changed files with 31 additions and 19 deletions
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -339,7 +339,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
    auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
    this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
-        simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, kernelRequiresGenerationOfLocalIdsByRuntime);
+        simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime);

    if (kernelRequiresGenerationOfLocalIdsByRuntime) {
        auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
--- a/level_zero/core/test/unit_tests/mocks/mock_kernel.h
+++ b/level_zero/core/test/unit_tests/mocks/mock_kernel.h
@@ -120,7 +120,11 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
    }

    void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
-    void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
+    void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
+        if (enableForcingOfGenerateLocalIdByHw) {
+            kernelRequiresGenerationOfLocalIdsByRuntime = !forceGenerateLocalIdByHw;
+        }
+    }
    ze_result_t setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal, NEO::GraphicsAllocation *allocation, NEO::SvmAllocationData *peerAllocData) override {
        return ZE_RESULT_SUCCESS;
    }
@@ -135,6 +139,8 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
    NEO::KernelInfo info;
    uint32_t printPrintfOutputCalledTimes = 0;
    bool hangDetectedPassedToPrintfOutput = false;
+    bool enableForcingOfGenerateLocalIdByHw = false;
+    bool forceGenerateLocalIdByHw = false;
 };

 } // namespace ult
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@@ -243,7 +243,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    auto &gfxCoreHelper = device.getGfxCoreHelper();
    auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
    auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
-    auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, localIdsGenerationByRuntime);
+    auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime);

    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -82,6 +82,7 @@ DECLARE_DEBUG_VARIABLE(bool, AppendMemoryPrefetchForKmdMigratedSharedAllocations
 DECLARE_DEBUG_VARIABLE(bool, ForceMemoryPrefetchForKmdMigratedSharedAllocations, false, "Force prefetch of shared memory in command queue execute command lists")
 DECLARE_DEBUG_VARIABLE(bool, ClKhrExternalMemoryExtension, false, "Enable cl_khr_external_memory extension")
 DECLARE_DEBUG_VARIABLE(bool, WaitForMemoryRelease, false, "Wait for memory release when out of memory")
+DECLARE_DEBUG_VARIABLE(bool, RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup, 0, "0 - default disabled, 1- remove restrictions on NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
 DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "Override device id in AUB/TBX mode")
 DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened; ignored when unk")
 DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened; ignored when unk")
@@ -144,7 +145,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
 DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
-DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
 DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
--- a/shared/source/helpers/gfx_core_helper.h
+++ b/shared/source/helpers/gfx_core_helper.h
@@ -168,7 +168,7 @@ class GfxCoreHelper {
    virtual bool isChipsetUniqueUUIDSupported() const = 0;
    virtual bool isTimestampShiftRequired() const = 0;
    virtual bool isRelaxedOrderingSupported() const = 0;
-    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
+    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const = 0;
    static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
    virtual ~GfxCoreHelper() = default;

@@ -381,7 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
    bool isChipsetUniqueUUIDSupported() const override;
    bool isTimestampShiftRequired() const override;
    bool isRelaxedOrderingSupported() const override;
-    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;
+    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const override;

    ~GfxCoreHelperHw() override = default;

--- a/shared/source/helpers/gfx_core_helper_base.inl
+++ b/shared/source/helpers/gfx_core_helper_base.inl
@@ -688,10 +688,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
 }

 template <typename GfxFamily>
-uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
-    if (DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get() != -1) {
-        return DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
-    }
+uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const {
    return getThreadsPerWG(simd, totalWorkItems);
 }
 } // namespace NEO
--- a/shared/test/common/mocks/mock_execution_environment.h
+++ b/shared/test/common/mocks/mock_execution_environment.h
@@ -13,6 +13,7 @@
 namespace NEO {

 struct MockRootDeviceEnvironment : public RootDeviceEnvironment {
+    using RootDeviceEnvironment::hwInfo;
    using RootDeviceEnvironment::isDummyAllocationInitialized;
    using RootDeviceEnvironment::RootDeviceEnvironment;
    ~MockRootDeviceEnvironment() override = default;
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -534,8 +534,8 @@ PrintGlobalTimestampInNs = 0
 EnableDeviceStateVerification = -1
 VfBarResourceAllocationWa = 1
 EnableDynamicPostSyncAllocLayout = -1
-ForceNumberOfThreadsInGpgpuThreadGroup = -1
 PrintTimestampPacketUsage = -1
 TrackNumCsrClientsOnSyncPoints = -1
 CommandListTimestampRefreshIntervalInMilliSec = -1
+RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup = 0
 # Please don't edit below this line
--- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp
+++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp
@@ -1587,15 +1587,23 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe
    }
 }

-HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
+HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
    DebugManagerStateRestore dbgRestore;
+    DebugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.set(1);
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

-    uint32_t expectedNumThreadsPerThreadGroup = 10u;
-    DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
-    EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
+    std::array<std::array<uint32_t, 5>, 8> values = {{
+        {32u, 32u, 128u, 1, 1u}, // SIMT size, total work items, GRF size, HW local id generation (0/1), expected number of threads per thread group
+        {32u, 64u, 32u, 1, 2u},
+        {32u, 128u, 256u, 1, 4u},
+        {32u, 1024u, 128u, 1, 32u},
+        {16u, 32u, 32u, 0, 2u},
+        {16u, 64u, 256u, 0, 4u},
+        {16u, 128u, 128u, 0, 8u},
+        {16u, 1024u, 256u, 0, 64u},
+    }};

-    expectedNumThreadsPerThreadGroup = 20u;
-    DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
-    EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
+    for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
+        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration));
+    }
 }