refactor: unify programming of preferred slm size 5/n

- remove xe2 hpg encode preferred slm size
- add xe2 release helper preferred slm array
- add dedicated method to calculate thread count per sub slice

Related-To: NEO-12639
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
2025-12-24 21:18:24 +08:00 · 2024-10-08 17:33:33 +00:00
parent 946e421f77
commit 46a63d3e0e
10 changed files with 117 additions and 92 deletions
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@@ -132,6 +132,8 @@ struct EncodeDispatchKernel {
    static void setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment,
                                      const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);

+    static uint32_t getThreadCountPerSubslice(const HardwareInfo &hwInfo);
+
    template <typename InterfaceDescriptorType>
    static void encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy);

--- a/shared/source/command_container/command_encoder_pre_xe2_hpg_core.inl
+++ b/shared/source/command_container/command_encoder_pre_xe2_hpg_core.inl
@@ -10,4 +10,10 @@ template <typename Family>
 size_t EncodeDispatchKernel<Family>::getDefaultIOHAlignment() {
    return 1;
 }
+
+template <typename Family>
+uint32_t EncodeDispatchKernel<Family>::getThreadCountPerSubslice(const HardwareInfo &hwInfo) {
+    return hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
+}
+
 } // namespace NEO
--- a/shared/source/command_container/command_encoder_xe2_hpg_core_and_later.inl
+++ b/shared/source/command_container/command_encoder_xe2_hpg_core_and_later.inl
@@ -15,4 +15,9 @@ size_t EncodeDispatchKernel<Family>::getDefaultIOHAlignment() {
    return alignment;
 }

+template <typename Family>
+uint32_t EncodeDispatchKernel<Family>::getThreadCountPerSubslice(const HardwareInfo &hwInfo) {
+    return hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.SubSliceCount;
+}
+
 } // namespace NEO
--- a/shared/source/command_container/command_encoder_xehp_and_later.inl
+++ b/shared/source/command_container/command_encoder_xehp_and_later.inl
@@ -1066,7 +1066,7 @@ template <typename InterfaceDescriptorType>
 void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
    using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
-    const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
+    const uint32_t threadsPerDssCount = EncodeDispatchKernel<Family>::getThreadCountPerSubslice(hwInfo);
    const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));

    uint32_t slmSize = 0u;
--- a/shared/source/release_helper/release_helper_2001.cpp
+++ b/shared/source/release_helper/release_helper_2001.cpp
@@ -7,6 +7,7 @@

 #include "shared/source/release_helper/release_helper.h"
 #include "shared/source/release_helper/release_helper_base.inl"
+#include "shared/source/xe2_hpg_core/hw_cmds_base.h"

 #include "release_definitions.h"

@@ -32,6 +33,22 @@ template <>
 bool ReleaseHelperHw<release>::isBindlessAddressingDisabled() const {
    return false;
 }
+
+template <>
+const SizeToPreferredSlmValueArray &ReleaseHelperHw<release>::getSizeToPreferredSlmValue(bool isHeapless) const {
+    using PREFERRED_SLM_ALLOCATION_SIZE = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
+    static const SizeToPreferredSlmValueArray sizeToPreferredSlmValue = {{
+        {0, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_0K},
+        {16 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_16K},
+        {32 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_32K},
+        {64 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_64K},
+        {96 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_96K},
+        {128 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K},
+        {std::numeric_limits<uint32_t>::max(), PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_160K},
+    }};
+    return sizeToPreferredSlmValue;
+}
+
 } // namespace NEO

 #include "shared/source/release_helper/release_helper_common_xe2_hpg.inl"
--- a/shared/source/release_helper/release_helper_2004.cpp
+++ b/shared/source/release_helper/release_helper_2004.cpp
@@ -27,12 +27,6 @@ inline bool ReleaseHelperHw<release>::isAuxSurfaceModeOverrideRequired() const {
    return true;
 }

-template <>
-int ReleaseHelperHw<release>::getProductMaxPreferredSlmSize(int preferredEnumValue) const {
-    using PREFERRED_SLM_ALLOCATION_SIZE = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
-    return std::min(preferredEnumValue, static_cast<int>(PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K));
-}
-
 template <>
 bool ReleaseHelperHw<release>::isLocalOnlyAllowed() const {
    return false;
@@ -43,6 +37,20 @@ bool ReleaseHelperHw<release>::isBindlessAddressingDisabled() const {
    return false;
 }

+template <>
+const SizeToPreferredSlmValueArray &ReleaseHelperHw<release>::getSizeToPreferredSlmValue(bool isHeapless) const {
+    using PREFERRED_SLM_ALLOCATION_SIZE = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
+    static const SizeToPreferredSlmValueArray sizeToPreferredSlmValue = {{
+        {0, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_0K},
+        {16 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_16K},
+        {32 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_32K},
+        {64 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_64K},
+        {96 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_96K},
+        {std::numeric_limits<uint32_t>::max(), PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K},
+    }};
+    return sizeToPreferredSlmValue;
+}
+
 } // namespace NEO

 #include "shared/source/release_helper/release_helper_common_xe2_hpg.inl"
--- a/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp
+++ b/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp
@@ -253,64 +253,6 @@ void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDevice
    walkerCmd.setComputeDispatchAllWalkerEnable(computeDispatchAllWalkerEnable);
 }

-template <>
-template <typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
-    using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
-
-    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
-    uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount /
-                                  hwInfo.gtSystemInfo.SubSliceCount;
-    uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
-
-    uint32_t slmSize = 0u;
-
-    switch (slmPolicy) {
-    case SlmPolicy::slmPolicyLargeData:
-        slmSize = slmTotalSize;
-        break;
-    case SlmPolicy::slmPolicyLargeSlm:
-    default:
-        slmSize = slmTotalSize * workGroupCountPerDss;
-        break;
-    }
-
-    struct SizeToPreferredSlmValue {
-        uint32_t upperLimit;
-        PREFERRED_SLM_ALLOCATION_SIZE valueToProgram;
-    };
-    const std::array<SizeToPreferredSlmValue, 10> ranges = {{
-        // upper limit, retVal
-        {0, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_0K},
-        {16 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_16K},
-        {32 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_32K},
-        {64 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_64K},
-        {96 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_96K},
-        {128 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K},
-    }};
-
-    auto programmableIdPreferredSlmSize = PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_160K;
-
-    auto *releaseHelper = rootDeviceEnvironment.getReleaseHelper();
-    programmableIdPreferredSlmSize = static_cast<PREFERRED_SLM_ALLOCATION_SIZE>(
-        releaseHelper->getProductMaxPreferredSlmSize(programmableIdPreferredSlmSize));
-
-    for (auto &range : ranges) {
-        if (slmSize <= range.upperLimit) {
-            programmableIdPreferredSlmSize = range.valueToProgram;
-            break;
-        }
-    }
-
-    pInterfaceDescriptor->setPreferredSlmAllocationSize(programmableIdPreferredSlmSize);
-
-    if (debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get() != -1) {
-        auto toProgram =
-            static_cast<PREFERRED_SLM_ALLOCATION_SIZE>(debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get());
-        pInterfaceDescriptor->setPreferredSlmAllocationSize(toProgram);
-    }
-}
-
 template <>
 template <typename InterfaceDescriptorType>
 void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
@@ -1644,3 +1644,15 @@ HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenForcingDifferent
    debugManager.flags.ForceIOHAlignment.set(expectedAlignemnt);
    EXPECT_EQ(NEO::EncodeDispatchKernel<FamilyType>::getDefaultIOHAlignment(), expectedAlignemnt);
 }
+
+HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenGettingThreadCountPerSubsliceThenUseDualSubSliceAsDenominator, IsAtMostXeHpcCore) {
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto expectedValue = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
+    EXPECT_EQ(expectedValue, NEO::EncodeDispatchKernel<FamilyType>::getThreadCountPerSubslice(hwInfo));
+}
+
+HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenGettingThreadCountPerSubsliceThenUseSubSliceAsDenominator, IsAtLeastXe2HpgCore) {
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto expectedValue = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.SubSliceCount;
+    EXPECT_EQ(expectedValue, NEO::EncodeDispatchKernel<FamilyType>::getThreadCountPerSubslice(hwInfo));
+}
--- a/shared/test/unit_test/release_helper/release_helper_20_01_tests.cpp
+++ b/shared/test/unit_test/release_helper/release_helper_20_01_tests.cpp
@@ -75,6 +75,34 @@ TEST_F(ReleaseHelper2001Tests, whenIsLocalOnlyAllowedCalledThenFalseReturned) {
    whenIsLocalOnlyAllowedCalledThenFalseReturned();
 }

-TEST_F(ReleaseHelper2001Tests, whenGettingPreferredSlmSizeThenAllEntriesEmpty) {
-    whenGettingPreferredSlmSizeThenAllEntriesEmpty();
+TEST_F(ReleaseHelper2001Tests, whenGettingPreferredSlmSizeThenAllEntriesHaveCorrectValues) {
+    for (auto &revision : getRevisions()) {
+        ipVersion.revision = revision;
+        releaseHelper = ReleaseHelper::create(ipVersion);
+        ASSERT_NE(nullptr, releaseHelper);
+
+        constexpr uint32_t kB = 1024;
+
+        auto &preferredSlmValueArray = releaseHelper->getSizeToPreferredSlmValue(false);
+        EXPECT_EQ(0u, preferredSlmValueArray[0].upperLimit);
+        EXPECT_EQ(0u, preferredSlmValueArray[0].valueToProgram);
+
+        EXPECT_EQ(16 * kB, preferredSlmValueArray[1].upperLimit);
+        EXPECT_EQ(1u, preferredSlmValueArray[1].valueToProgram);
+
+        EXPECT_EQ(32 * kB, preferredSlmValueArray[2].upperLimit);
+        EXPECT_EQ(2u, preferredSlmValueArray[2].valueToProgram);
+
+        EXPECT_EQ(64 * kB, preferredSlmValueArray[3].upperLimit);
+        EXPECT_EQ(3u, preferredSlmValueArray[3].valueToProgram);
+
+        EXPECT_EQ(96 * kB, preferredSlmValueArray[4].upperLimit);
+        EXPECT_EQ(4u, preferredSlmValueArray[4].valueToProgram);
+
+        EXPECT_EQ(128 * kB, preferredSlmValueArray[5].upperLimit);
+        EXPECT_EQ(5u, preferredSlmValueArray[5].valueToProgram);
+
+        EXPECT_EQ(std::numeric_limits<uint32_t>::max(), preferredSlmValueArray[6].upperLimit);
+        EXPECT_EQ(6u, preferredSlmValueArray[6].valueToProgram);
+    }
 }
--- a/shared/test/unit_test/release_helper/release_helper_20_04_tests.cpp
+++ b/shared/test/unit_test/release_helper/release_helper_20_04_tests.cpp
@@ -44,28 +44,8 @@ TEST_F(ReleaseHelper2004Tests, whenGettingCapabilitiesThenCorrectPropertiesAreRe
    }
 }

-TEST_F(ReleaseHelper2004Tests, whenGettingMaxPreferredSlmSizeThenSizeSizeIsLimitedBy128K) {
-    for (auto &revision : getRevisions()) {
-        ipVersion.revision = revision;
-        releaseHelper = ReleaseHelper::create(ipVersion);
-        ASSERT_NE(nullptr, releaseHelper);
-
-        using PREFERRED_SLM_ALLOCATION_SIZE = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
-        for (auto &preferredSlmSize : {PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_0K,
-                                       PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_16K,
-                                       PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_32K,
-                                       PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_64K,
-                                       PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_96K,
-                                       PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K}) {
-
-            auto maxPreferredSlmValue = releaseHelper->getProductMaxPreferredSlmSize(preferredSlmSize);
-            EXPECT_EQ(maxPreferredSlmValue, preferredSlmSize);
-        }
-        auto preferredSlmSize128k = PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K;
-        auto preferredSlmSize160k = PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_160K;
-        auto maxPreferredSlmValue = releaseHelper->getProductMaxPreferredSlmSize(preferredSlmSize160k);
-        EXPECT_EQ(maxPreferredSlmValue, preferredSlmSize128k);
-    }
+TEST_F(ReleaseHelper2004Tests, whenGettingMaxPreferredSlmSizeThenSizeIsNotModified) {
+    whenGettingMaxPreferredSlmSizeThenSizeIsNotModified();
 }

 TEST_F(ReleaseHelper2004Tests, whenShouldAdjustCalledThenTrueReturned) {
@@ -96,6 +76,31 @@ TEST_F(ReleaseHelper2004Tests, whenIsLocalOnlyAllowedCalledThenFalseReturned) {
    whenIsLocalOnlyAllowedCalledThenFalseReturned();
 }

-TEST_F(ReleaseHelper2004Tests, whenGettingPreferredSlmSizeThenAllEntriesEmpty) {
-    whenGettingPreferredSlmSizeThenAllEntriesEmpty();
+TEST_F(ReleaseHelper2004Tests, whenGettingPreferredSlmSizeThenAllEntriesHaveCorrectValues) {
+    for (auto &revision : getRevisions()) {
+        ipVersion.revision = revision;
+        releaseHelper = ReleaseHelper::create(ipVersion);
+        ASSERT_NE(nullptr, releaseHelper);
+
+        constexpr uint32_t kB = 1024;
+
+        auto &preferredSlmValueArray = releaseHelper->getSizeToPreferredSlmValue(false);
+        EXPECT_EQ(0u, preferredSlmValueArray[0].upperLimit);
+        EXPECT_EQ(0u, preferredSlmValueArray[0].valueToProgram);
+
+        EXPECT_EQ(16 * kB, preferredSlmValueArray[1].upperLimit);
+        EXPECT_EQ(1u, preferredSlmValueArray[1].valueToProgram);
+
+        EXPECT_EQ(32 * kB, preferredSlmValueArray[2].upperLimit);
+        EXPECT_EQ(2u, preferredSlmValueArray[2].valueToProgram);
+
+        EXPECT_EQ(64 * kB, preferredSlmValueArray[3].upperLimit);
+        EXPECT_EQ(3u, preferredSlmValueArray[3].valueToProgram);
+
+        EXPECT_EQ(96 * kB, preferredSlmValueArray[4].upperLimit);
+        EXPECT_EQ(4u, preferredSlmValueArray[4].valueToProgram);
+
+        EXPECT_EQ(std::numeric_limits<uint32_t>::max(), preferredSlmValueArray[5].upperLimit);
+        EXPECT_EQ(5u, preferredSlmValueArray[5].valueToProgram);
+    }
 }