fix: always use GRF count in calculateNumThreadsPerThreadGroup

GRF size != GRF count.

Related-To: GSD-8437
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
2025-12-24 21:18:24 +08:00 · 2024-03-22 09:39:15 +00:00
parent df54d67f40
commit da7b03dd15
27 changed files with 86 additions and 68 deletions
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -377,17 +377,17 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
    }
    evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);

-    auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
+    auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
    auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
    this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
-        simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
+        simdSize, static_cast<uint32_t>(itemsInGroup), grfCount, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);

    if (kernelRequiresGenerationOfLocalIdsByRuntime) {
        auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
        uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
            static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
-                simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
+                simdSize, grfSize, grfCount, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
        if (perThreadDataSizeForWholeThreadGroupNeeded >
            perThreadDataSizeForWholeThreadGroupAllocated) {
            alignedFree(perThreadDataForWholeThreadGroup);
@@ -405,7 +405,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
                                         static_cast<uint16_t>(groupSizeY),
                                         static_cast<uint16_t>(groupSizeZ)}},
                std::array<uint8_t, 3>{{0, 1, 2}},
-                false, grfSize, rootDeviceEnvironment);
+                false, grfSize, grfCount, rootDeviceEnvironment);
        }

        this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
@@ -1029,12 +1029,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
    dispatchKernelWithImplicitArgs<FamilyType>();

    auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
+    auto numGrf = GrfConfig::defaultGrfNumber;
    auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
    const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
-    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
+    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);

    auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
-    size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
+    size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);

    EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
    alignedFree(expectedLocalIds);
@@ -1075,12 +1076,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
    dispatchKernelWithImplicitArgs<FamilyType>();

    auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
+    auto numGrf = GrfConfig::defaultGrfNumber;
    auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
    const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
-    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
+    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);

    auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
-    size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
+    size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);

    EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
    alignedFree(expectedLocalIds);
--- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
+++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
@@ -306,6 +306,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
    mockKernel.module = &mockModule;
    const auto &device = mockModule.getDevice();
    auto grfSize = device->getHwInfo().capabilityTable.grfSize;
+    auto numGrf = GrfConfig::defaultGrfNumber;
    const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
    uint32_t groupSize[3] = {2, 3, 5};
    auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
@@ -315,13 +316,14 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
    auto numThreadsPerTG = gfxHelper.calculateNumThreadsPerThreadGroup(
        mockKernel.descriptor.kernelAttributes.simdSize,
        groupSize[0] * groupSize[1] * groupSize[2],
-        grfSize,
+        numGrf,
        mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
        rootDeviceEnvironment);
    auto perThreadDataSizeForWholeTGNeeded =
        static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
            mockKernel.descriptor.kernelAttributes.simdSize,
            grfSize,
+            numGrf,
            mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
            groupSize[0] * groupSize[1] * groupSize[2],
            !mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@@ -55,6 +55,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
    const auto &hwInfo = kernel.getHardwareInfo();

    auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
+    auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
    uint32_t grfSize = hwInfo.capabilityTable.grfSize;
    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
    uint32_t requiredWalkOrder = 0u;
@@ -69,7 +70,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
        requiredWalkOrder,
        simdSize);
    auto size = kernel.getCrossThreadDataSize() +
-                getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment);
+                getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment);

    auto pImplicitArgs = kernel.getImplicitArgs();
    if (pImplicitArgs) {
@@ -268,9 +269,9 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    }

    auto &gfxCoreHelper = device.getGfxCoreHelper();
-    auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
+    auto grfCount = kernel.getDescriptor().kernelAttributes.numGrfRequired;
    auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
-    auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment());
+    auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfCount, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment());

    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -2281,8 +2281,9 @@ void Kernel::initializeLocalIdsCache() {
                                         workgroupDimensionsOrder[1],
                                         workgroupDimensionsOrder[2]};
    auto simdSize = getDescriptor().kernelAttributes.simdSize;
+    auto grfCount = getDescriptor().kernelAttributes.numGrfRequired;
    auto grfSize = static_cast<uint8_t>(getDevice().getHardwareInfo().capabilityTable.grfSize);
-    localIdsCache = std::make_unique<LocalIdsCache>(4, wgDimOrder, simdSize, grfSize, usingImagesOnly);
+    localIdsCache = std::make_unique<LocalIdsCache>(4, wgDimOrder, grfCount, simdSize, grfSize, usingImagesOnly);
 }

 void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@@ -1379,9 +1379,10 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
        auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
        auto simdSize = kernelInfo.getMaxSimdSize();
        uint32_t grfSize = sizeof(typename FamilyType::GRF);
+        auto numGrf = GrfConfig::defaultGrfNumber;

        auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
-                    HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) +
+                    HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numGrf, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) +
                    ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, rootDeviceEnvironment);

        size = alignUp(size, MemoryConstants::cacheLineSize);
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
@@ -565,8 +565,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
    constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
    size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
    auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
+    auto numGrf = GrfConfig::defaultGrfNumber;
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
-    size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment);
+    size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numGrf, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment);
    ASSERT_LE(expectedIohSize, ioh.getUsed());

    auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
@@ -575,7 +576,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
                     std::array<uint8_t, 3>{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
                                             modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
                                             modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
-                     false, grfSize, rootDeviceEnvironment);
+                     false, grfSize, numGrf, rootDeviceEnvironment);

    EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
    alignedFree(expectedLocalIds);
@@ -1224,12 +1225,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
    dispatchKernelWithImplicitArgs<FamilyType>();

    auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
+    auto numGrf = GrfConfig::defaultGrfNumber;
    auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
-    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
+    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);

    auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
-    size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment);
+    size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);

    EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
    alignedFree(expectedLocalIds);
@@ -1258,12 +1260,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
    dispatchKernelWithImplicitArgs<FamilyType>();

    auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
+    auto numGrf = GrfConfig::defaultGrfNumber;
    auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
-    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
+    generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);

    auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
-    size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, rootDeviceEnvironment);
+    size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);

    EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
    alignedFree(expectedLocalIds);
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@@ -137,7 +137,7 @@ struct EncodeDispatchKernel {
                                 const RootDeviceEnvironment &rootDeviceEnvironment);

    template <typename InterfaceDescriptorType>
-    static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
+    static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData,
                           const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);

    static void *getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset);
@@ -155,10 +155,10 @@ struct EncodeDispatchKernel {
    static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);

    template <typename WalkerType, typename InterfaceDescriptorType>
-    static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
+    static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);

    template <typename WalkerType, typename InterfaceDescriptorType>
-    static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
+    static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);

    static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);

--- a/shared/source/command_container/command_encoder.inl
+++ b/shared/source/command_container/command_encoder.inl
@@ -729,7 +729,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT

 template <typename Family>
 template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {}
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}

 template <typename Family>
 size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
@@ -759,7 +759,7 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &

 template <typename GfxFamily>
 template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
+void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
    const auto &productHelper = device.getProductHelper();

    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
@@ -777,7 +777,7 @@ void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispat

        if (algorithmVersion == 2) {
            auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
-            if (numGrf == 256) {
+            if (grfCount == 256) {
                threadsPerXeCore /= 2;
            }
            auto tgDispatchSizeSelected = 8;
@@ -819,11 +819,11 @@ void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispat
            }
        } else {
            if (adjustTGDispatchSize) {
-                UNRECOVERABLE_IF(numGrf == 0u);
+                UNRECOVERABLE_IF(grfCount == 0u);
                constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
                constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
                auto &gfxCoreHelper = device.getGfxCoreHelper();
-                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
+                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
                if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
                    const uint32_t tilesCount = device.getNumSubDevices();
                    availableThreadCount *= tilesCount;
--- a/shared/source/command_container/command_encoder_bdw_and_later.inl
+++ b/shared/source/command_container/command_encoder_bdw_and_later.inl
@@ -31,7 +31,7 @@ namespace NEO {

 template <typename Family>
 template <typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
+void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
                                              const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
                                              const RootDeviceEnvironment &rootDeviceEnvironment) {
    auto grfSize = sizeof(typename Family::GRF);
--- a/shared/source/command_container/command_encoder_enablers.inl
+++ b/shared/source/command_container/command_encoder_enablers.inl
@@ -14,9 +14,9 @@ template void NEO::EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Fa
 template void NEO::EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
 template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
 template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
-template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
+template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
 template void NEO::EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
-template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::DefaultWalkerType &walkerCmd);
+template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, Family::DefaultWalkerType &walkerCmd);
 template void NEO::EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
 template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerType>(CommandContainer &container, EncodeDispatchKernelArgs &args);
 template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
--- a/shared/source/command_container/command_encoder_xehp_and_later.inl
+++ b/shared/source/command_container/command_encoder_xehp_and_later.inl
@@ -41,7 +41,7 @@ constexpr size_t immWriteDestinationAddressAlignment = 8;

 template <typename Family>
 template <typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
+void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
                                              const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
                                              const RootDeviceEnvironment &rootDeviceEnvironment) {
 }
@@ -377,7 +377,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
    auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, walkerCmd);
    if (debugManager.flags.PrintKernelDispatchParameters.get()) {
-        fprintf(stdout, "kernel, %s, numGrf, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
+        fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
                kernelDescriptor.kernelMetadata.kernelName.c_str(),
                kernelDescriptor.kernelAttributes.numGrfRequired,
                kernelDescriptor.kernelAttributes.simdSize,
--- a/shared/source/helpers/aarch64/local_id_gen.cpp
+++ b/shared/source/helpers/aarch64/local_id_gen.cpp
@@ -43,10 +43,10 @@ LocalIDHelper::LocalIDHelper() {

 LocalIDHelper LocalIDHelper::initializer;

-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) {
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
    bool localIdsGeneratedByHw = false;
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment));
+    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
--- a/shared/source/helpers/gfx_core_helper.h
+++ b/shared/source/helpers/gfx_core_helper.h
@@ -125,7 +125,7 @@ class GfxCoreHelper {
    virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
    virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
                                             const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const = 0;
-    virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
+    virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
    virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
    virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
    virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
@@ -174,7 +174,7 @@ class GfxCoreHelper {
    virtual bool isChipsetUniqueUUIDSupported() const = 0;
    virtual bool isTimestampShiftRequired() const = 0;
    virtual bool isRelaxedOrderingSupported() const = 0;
-    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
+    virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
    virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
    virtual char const *getDefaultDeviceHierarchy() const = 0;
    static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
@@ -341,7 +341,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
    uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
                                     const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const override;

-    uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
+    uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
    size_t getMaxFillPaternSizeForCopyEngine() const override;

    size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
@@ -401,7 +401,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
    bool isChipsetUniqueUUIDSupported() const override;
    bool isTimestampShiftRequired() const override;
    bool isRelaxedOrderingSupported() const override;
-    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
+    uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
    uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
    char const *getDefaultDeviceHierarchy() const override;

--- a/shared/source/helpers/gfx_core_helper_base.inl
+++ b/shared/source/helpers/gfx_core_helper_base.inl
@@ -707,7 +707,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::overrideMaxWorkGroupSize(uint32_t maxWG) co
 }

 template <typename GfxFamily>
-uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
+uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
    return defaultMaxGroupSize;
 }

@@ -717,7 +717,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
 }

 template <typename GfxFamily>
-uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
+uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
    return getThreadsPerWG(simd, totalWorkItems);
 }

--- a/shared/source/helpers/local_id_gen.h
+++ b/shared/source/helpers/local_id_gen.h
@@ -64,7 +64,7 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
                          const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);

 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
-                      const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment);
+                      const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment);
 void generateLocalIDsWithLayoutForImages(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);

 bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
--- a/shared/source/helpers/per_thread_data.h
+++ b/shared/source/helpers/per_thread_data.h
@@ -21,6 +21,7 @@ struct PerThreadDataHelper {
    static inline size_t getPerThreadDataSizeTotal(
        uint32_t simd,
        uint32_t grfSize,
+        uint32_t grfCount,
        uint32_t numChannels,
        size_t localWorkSize,
        bool isHwLocalIdGeneration,
@@ -30,7 +31,7 @@ struct PerThreadDataHelper {
            return perThreadSizeLocalIDs * localWorkSize;
        }
        auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-        return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration, rootDeviceEnvironment);
+        return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, isHwLocalIdGeneration, rootDeviceEnvironment);
    }
 }; // namespace PerThreadDataHelper
 } // namespace NEO
--- a/shared/source/helpers/x86_64/local_id_gen.cpp
+++ b/shared/source/helpers/x86_64/local_id_gen.cpp
@@ -46,10 +46,10 @@ LocalIDHelper::LocalIDHelper() {
 LocalIDHelper LocalIDHelper::initializer;

 // traditional function to generate local IDs
-void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, const RootDeviceEnvironment &rootDeviceEnvironment) {
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
    bool localIdsGeneratedByHw = false;
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfSize, localIdsGeneratedByHw, rootDeviceEnvironment));
+    auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
    bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
    if (useLayoutForImages) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
--- a/shared/source/kernel/implicit_args_helper.cpp
+++ b/shared/source/kernel/implicit_args_helper.cpp
@@ -54,12 +54,13 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
        return alignUp(implicitArgsSize, MemoryConstants::cacheLineSize);
    } else {
        auto simdSize = pImplicitArgs->simdWidth;
+        auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
        auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
        Vec3<size_t> localWorkSize = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
        auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
        uint32_t localIdsSizeNeeded =
            alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
-                        simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)),
+                        simdSize, grfSize, grfCount, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)),
                    MemoryConstants::cacheLineSize);
        return implicitArgsSize + localIdsSizeNeeded;
    }
@@ -74,6 +75,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
    if (!patchImplicitArgsBufferInCrossThread) {
        auto simdSize = implicitArgs.simdWidth;
        auto grfSize = getGrfSize(simdSize);
+        auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
        auto dimensionOrder = getDimensionOrderForLocalIds(kernelDescriptor.kernelAttributes.workgroupDimensionsOrder, hwGenerationOfLocalIdsParams);

        NEO::generateLocalIDs(
@@ -83,7 +85,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
                                     static_cast<uint16_t>(implicitArgs.localSizeY),
                                     static_cast<uint16_t>(implicitArgs.localSizeZ)}},
            dimensionOrder,
-            false, grfSize, rootDeviceEnvironment);
+            false, grfSize, grfCount, rootDeviceEnvironment);
        auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize();
        ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
    }
--- a/shared/source/kernel/local_ids_cache.cpp
+++ b/shared/source/kernel/local_ids_cache.cpp
@@ -13,14 +13,15 @@
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_id_gen.h"
 #include "shared/source/helpers/simd_helper.h"
+#include "shared/source/kernel/grf_config.h"

 #include <cstring>

 namespace NEO {

-LocalIdsCache::LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages)
+LocalIdsCache::LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint32_t grfCount, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages)
    : wgDimOrder(wgDimOrder), localIdsSizePerThread(getPerThreadSizeLocalIDs(static_cast<uint32_t>(simdSize), static_cast<uint32_t>(grfSize))),
-      grfSize(grfSize), simdSize(simdSize), usesOnlyImages(usesOnlyImages) {
+      grfCount(grfCount), grfSize(grfSize), simdSize(simdSize), usesOnlyImages(usesOnlyImages) {
    UNRECOVERABLE_IF(cacheSize == 0)
    cache.resize(cacheSize);
 }
@@ -41,7 +42,7 @@ size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group, const
        return static_cast<size_t>(numElementsInGroup * localIdsSizePerThread);
    }
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
-    const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false, rootDeviceEnvironment);
+    const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfCount, false, rootDeviceEnvironment);
    return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
 }

@@ -81,7 +82,7 @@ void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3<uint16_
        entry.localIdsSizeAllocated = entry.localIdsSize;
    }
    NEO::generateLocalIDs(entry.localIdsData, static_cast<uint16_t>(simdSize),
-                          {group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, rootDeviceEnvironment);
+                          {group[0], group[1], group[2]}, wgDimOrder, usesOnlyImages, grfSize, grfCount, rootDeviceEnvironment);
 }

 } // namespace NEO
--- a/shared/source/kernel/local_ids_cache.h
+++ b/shared/source/kernel/local_ids_cache.h
@@ -27,7 +27,7 @@ class LocalIdsCache {
    LocalIdsCache(LocalIdsCache &) = delete;
    LocalIdsCache &operator=(const LocalIdsCache &other) = delete;

-    LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
+    LocalIdsCache(size_t cacheSize, std::array<uint8_t, 3> wgDimOrder, uint32_t grfCount, uint8_t simdSize, uint8_t grfSize, bool usesOnlyImages = false);
    ~LocalIdsCache();

    void setLocalIdsForGroup(const Vec3<uint16_t> &group, void *destination, const RootDeviceEnvironment &rootDeviceEnvironment);
@@ -43,6 +43,7 @@ class LocalIdsCache {
    std::mutex setLocalIdsMutex;
    const std::array<uint8_t, 3> wgDimOrder;
    const uint32_t localIdsSizePerThread;
+    const uint32_t grfCount;
    const uint8_t grfSize;
    const uint8_t simdSize;
    const bool usesOnlyImages;
--- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp
+++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp
@@ -28,8 +28,8 @@ namespace NEO {

 template <>
 template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
-    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, numGrf, walkerCmd);
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
+    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, grfCount, walkerCmd);
 }

 template <>
--- a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp
+++ b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2023 Intel Corporation
+ * Copyright (C) 2021-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -87,7 +87,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptor

 template <>
 template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
    const auto &productHelper = device.getProductHelper();
    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
        if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
@@ -1124,7 +1124,7 @@ HWTEST2_F(EncodeDispatchKernelTest, givenPrintKernelDispatchParametersWhenEncodi
    std::string outputString = testing::internal::GetCapturedStdout(); // stop capturing

    EXPECT_NE(std::string::npos, outputString.find("kernel"));
-    EXPECT_NE(std::string::npos, outputString.find("numGrf"));
+    EXPECT_NE(std::string::npos, outputString.find("grfCount"));
    EXPECT_NE(std::string::npos, outputString.find("simdSize"));
    EXPECT_NE(std::string::npos, outputString.find("tilesCount"));
    EXPECT_NE(std::string::npos, outputString.find("implicitScaling"));
--- a/shared/test/unit_test/helpers/local_id_tests.cpp
+++ b/shared/test/unit_test/helpers/local_id_tests.cpp
@@ -10,6 +10,7 @@
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_id_gen.h"
 #include "shared/source/helpers/ptr_math.h"
+#include "shared/source/kernel/grf_config.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
 #include "shared/test/common/mocks/mock_execution_environment.h"
@@ -115,7 +116,7 @@ TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize

    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
-    generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, rootDeviceEnvironment);
+    generateLocalIDs(localIdsPtr.get(), 16u, localSizes, dimensionsOrder, false, 64u, GrfConfig::defaultGrfNumber, rootDeviceEnvironment);
    EXPECT_EQ(localIdsView[0], 0u);
    EXPECT_EQ(localIdsView[1], 1u);
    EXPECT_EQ(localIdsView[2], 0u);
@@ -301,6 +302,7 @@ struct LocalIDFixture : ::testing::TestWithParam<std::tuple<int, int, int, int,
    uint32_t localWorkSize;
    uint32_t simd;
    uint32_t grfSize;
+    uint32_t numGrf = GrfConfig::defaultGrfNumber;

    // Provide support for a max LWS of 256
    // 32 threads @ SIMD8
@@ -313,7 +315,7 @@ HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenIdsAreWithinLimits) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
+                     std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, numGrf, rootDeviceEnvironment);
    validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 }

@@ -321,7 +323,7 @@ HWTEST_P(LocalIDFixture, WhenGeneratingLocalIdsThenAllWorkItemsCovered) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, rootDeviceEnvironment);
+                     std::array<uint8_t, 3>{{0, 1, 2}}, false, grfSize, numGrf, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
 }

@@ -330,7 +332,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder, false, grfSize, rootDeviceEnvironment);
+                     dimensionsOrder, false, grfSize, numGrf, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -340,7 +342,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder, false, grfSize, rootDeviceEnvironment);
+                     dimensionsOrder, false, grfSize, numGrf, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -350,7 +352,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
-                     dimensionsOrder, false, grfSize, rootDeviceEnvironment);
+                     dimensionsOrder, false, grfSize, numGrf, rootDeviceEnvironment);
    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, UnitTestHelper<FamilyType>::useFullRowForLocalIdsGeneration);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
 }
@@ -392,7 +394,7 @@ struct LocalIdsLayoutForImagesTest : ::testing::TestWithParam<std::tuple<uint16_
        EXPECT_TRUE(isCompatibleWithLayoutForImages(localWorkSize, dimensionsOrder, simd));
        NEO::MockExecutionEnvironment mockExecutionEnvironment{};
        auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
-        generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
+        generateLocalIDs(buffer, simd, localWorkSize, dimensionsOrder, true, grfSize, numGrfs, rootDeviceEnvironment);
    }
    void validateGRF() {
        uint32_t totalLocalIds = localWorkSize.at(0) * localWorkSize.at(1);
@@ -494,8 +496,8 @@ TEST_P(LocalIdsLayoutTest, givenLocalWorkgroupSize4x4x1WhenGenerateLocalIdsThenH
    memset(buffer2, 0xff, size);
    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
-    generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, rootDeviceEnvironment);
-    generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, rootDeviceEnvironment);
+    generateLocalIDs(buffer1, simd, localWorkSize, dimensionsOrder, false, grfSize, GrfConfig::defaultGrfNumber, rootDeviceEnvironment);
+    generateLocalIDs(buffer2, simd, localWorkSize, dimensionsOrder, true, grfSize, GrfConfig::defaultGrfNumber, rootDeviceEnvironment);

    for (auto i = 0u; i < elemsInBuffer / rowWidth; i++) {
        for (auto j = 0u; j < rowWidth; j++) {
--- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp
+++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp
@@ -79,7 +79,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP

    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
-    auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
+    auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
    EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
 }

--- a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp
+++ b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp
@@ -10,6 +10,7 @@
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/hw_info.h"
 #include "shared/source/helpers/per_thread_data.h"
+#include "shared/source/kernel/grf_config.h"
 #include "shared/source/kernel/local_ids_cache.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/mocks/mock_execution_environment.h"
@@ -22,7 +23,7 @@ class MockLocalIdsCache : public NEO::LocalIdsCache {
    using Base::Base;
    using Base::cache;
    MockLocalIdsCache(size_t cacheSize) : MockLocalIdsCache(cacheSize, 32u){};
-    MockLocalIdsCache(size_t cacheSize, uint8_t simd) : Base(cacheSize, {0, 1, 2}, simd, 32, false){};
+    MockLocalIdsCache(size_t cacheSize, uint8_t simd) : Base(cacheSize, {0, 1, 2}, GrfConfig::defaultGrfNumber, simd, 32, false){};
 };
 struct LocalIdsCacheFixture {
    void setUp() {