refactor: Extract thread dispatch algorithm for overdispatch into a function

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
2024-03-01 12:33:10 +00:00 · 2024-03-01 12:33:10 +00:00 · bd6925d51a
parent 8840b6d02f
commit bd6925d51a
4 changed files with 121 additions and 109 deletions
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@ -158,6 +158,9 @@ struct EncodeDispatchKernel {
    template <typename WalkerType, typename InterfaceDescriptorType>
    static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);

+    template <typename WalkerType, typename InterfaceDescriptorType>
+    static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
+
    static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);

    template <typename WalkerType>
--- a/shared/source/command_container/command_encoder.inl
+++ b/shared/source/command_container/command_encoder.inl
@ -7,6 +7,7 @@

 #pragma once
 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/debugger/debugger_l0.h"
 #include "shared/source/device/device.h"
@ -18,6 +19,7 @@
 #include "shared/source/helpers/blit_commands_helper.h"
 #include "shared/source/helpers/definitions/command_encoder_args.h"
 #include "shared/source/helpers/gfx_core_helper.h"
+#include "shared/source/helpers/hw_info.h"
 #include "shared/source/helpers/local_id_gen.h"
 #include "shared/source/helpers/preamble.h"
 #include "shared/source/helpers/register_offsets.h"
@ -755,6 +757,119 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
    return size;
 }

+template <typename GfxFamily> // Selects the THREAD_GROUP_DISPATCH_SIZE written into interfaceDescriptor: v2 or legacy heuristic, then debug-flag override.
+template <typename WalkerType, typename InterfaceDescriptorType>
+void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
+    const auto &productHelper = device.getProductHelper();
+
+    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { // all automatic selection is gated on this product capability
+        interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); // default; refined below
+
+        bool adjustTGDispatchSize = true;
+        if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) { // -1 keeps the default (true)
+            adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
+        }
+        // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
+        auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
+        if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) { // -1 keeps the automatically chosen version
+            algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
+        }
+
+        if (algorithmVersion == 2) {
+            auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported; // NOTE(review): assumes MaxSubSlicesSupported != 0 - confirm
+            if (numGrf == 256) { // 256-GRF kernels: halve the per-core thread budget
+                threadsPerXeCore /= 2;
+            }
+            auto tgDispatchSizeSelected = 8; // start at 8; the steps below only ever halve it
+            uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
+
+            if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { // multi-dimensional with X > 1: size must divide the X group count
+                while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
+                    tgDispatchSizeSelected /= 2;
+                }
+            } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { // X is at most 1 here: size must divide the Y group count
+                while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
+                    tgDispatchSizeSelected /= 2;
+                }
+            }
+
+            auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
+            auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
+
+            // halve until there are at least as many groupings as Xe cores across all tiles, or the size reaches 1
+            while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
+                tgDispatchSizeSelected /= 2;
+            }
+
+            auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
+            // make sure we do not use more threads than present on each xe core
+            while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
+                tgDispatchSizeSelected /= 2;
+                threadCountPerGrouping /= 2; // keep in step with the halved dispatch size
+            }
+
+            if (tgDispatchSizeSelected == 8) { // translate the numeric size into the descriptor enum
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
+            } else if (tgDispatchSizeSelected == 1) {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
+            } else if (tgDispatchSizeSelected == 2) {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
+            } else {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
+            }
+        } else {
+            if (adjustTGDispatchSize) { // this flag is only consulted on the non-v2 path
+                UNRECOVERABLE_IF(numGrf == 0u);
+                constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
+                constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
+                auto &gfxCoreHelper = device.getGfxCoreHelper();
+                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
+                if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) { // scale the budget by the number of sub-devices
+                    const uint32_t tilesCount = device.getNumSubDevices();
+                    availableThreadCount *= tilesCount;
+                }
+                uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
+                uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
+                UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
+                auto tgDispatchSizeSelected = 1u;
+
+                if (dispatchedTotalThreadCount <= availableThreadCount) { // whole dispatch fits the available threads: keep size 1
+                    tgDispatchSizeSelected = 1;
+                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) { // otherwise smaller thread groups get larger dispatch sizes
+                    tgDispatchSizeSelected = 8;
+                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
+                    tgDispatchSizeSelected = 4;
+                } else {
+                    tgDispatchSizeSelected = 2;
+                }
+                if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { // same divisibility adjustment as in the v2 path
+                    while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
+                        tgDispatchSizeSelected /= 2;
+                    }
+                } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
+                    while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
+                        tgDispatchSizeSelected /= 2;
+                    }
+                }
+                if (tgDispatchSizeSelected == 8) { // translate the numeric size into the descriptor enum
+                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
+                } else if (tgDispatchSizeSelected == 1) {
+                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
+                } else if (tgDispatchSizeSelected == 2) {
+                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
+                } else {
+                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
+                }
+            }
+        }
+    }
+
+    if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) { // debug override, applied last and outside the capability check
+        interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(
+            debugManager.flags.ForceThreadGroupDispatchSize.get()));
+    }
+}
+
 template <typename Family>
 size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
    size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;
--- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp
+++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2023 Intel Corporation
+ * Copyright (C) 2021-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -29,113 +29,7 @@ namespace NEO {
 template <>
 template <typename WalkerType, typename InterfaceDescriptorType>
 void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
-    const auto &productHelper = device.getProductHelper();
-
-    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
-        interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
-        bool adjustTGDispatchSize = true;
-        if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
-            adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
-        }
-        // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
-        auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
-        if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
-            algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
-        }
-
-        if (algorithmVersion == 2) {
-            auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
-            if (numGrf == 256) {
-                threadsPerXeCore /= 2;
-            }
-            auto tgDispatchSizeSelected = 8;
-            uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
-
-            if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
-                while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
-                    tgDispatchSizeSelected /= 2;
-                }
-            } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
-                while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
-                    tgDispatchSizeSelected /= 2;
-                }
-            }
-
-            auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
-            auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
-
-            // make sure we fit all xe core
-            while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
-                tgDispatchSizeSelected /= 2;
-            }
-
-            auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
-            // make sure we do not use more threads then present on each xe core
-            while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
-                tgDispatchSizeSelected /= 2;
-                threadCountPerGrouping /= 2;
-            }
-
-            if (tgDispatchSizeSelected == 8) {
-                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
-            } else if (tgDispatchSizeSelected == 1) {
-                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
-            } else if (tgDispatchSizeSelected == 2) {
-                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
-            } else {
-                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
-            }
-        } else {
-            if (adjustTGDispatchSize) {
-                UNRECOVERABLE_IF(numGrf == 0u);
-                constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
-                constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
-                auto &gfxCoreHelper = device.getGfxCoreHelper();
-                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
-                if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
-                    const uint32_t tilesCount = device.getNumSubDevices();
-                    availableThreadCount *= tilesCount;
-                }
-                uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
-                uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
-                UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
-                auto tgDispatchSizeSelected = 1u;
-
-                if (dispatchedTotalThreadCount <= availableThreadCount) {
-                    tgDispatchSizeSelected = 1;
-                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
-                    tgDispatchSizeSelected = 8;
-                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
-                    tgDispatchSizeSelected = 4;
-                } else {
-                    tgDispatchSizeSelected = 2;
-                }
-                if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
-                    while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
-                        tgDispatchSizeSelected /= 2;
-                    }
-                } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
-                    while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
-                        tgDispatchSizeSelected /= 2;
-                    }
-                }
-                if (tgDispatchSizeSelected == 8) {
-                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
-                } else if (tgDispatchSizeSelected == 1) {
-                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
-                } else if (tgDispatchSizeSelected == 2) {
-                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
-                } else {
-                    interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
-                }
-            }
-        }
-    }
-
-    if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
-        interfaceDescriptor.setThreadGroupDispatchSize(static_cast<INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(
-            debugManager.flags.ForceThreadGroupDispatchSize.get()));
-    }
+    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, numGrf, walkerCmd);
 }

 template <>
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
@ -685,7 +685,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInlineDataRequiredAnd
    EXPECT_EQ(0u, cmd->getLocalZMaximum());
 }

-HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged) {
+HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged, IsXeLpg) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
    INTERFACE_DESCRIPTOR_DATA iddArg;