fix: Correct logic for SIMD1

- For calculating number of threads per workgroup, treat simd 1 as it was simd 32 - Correct logic of calculating space for per thread data for simd 1 - Minor: unit tests refactor - Corrected naming Related-To: NEO-8261 Signed-off-by: Kacper Nowak <kacper.nowak@intel.com>
2025-12-24 04:12:57 +08:00 · 2023-09-01 19:20:39 +02:00
parent b053e9348e
commit fc099ead2e
4 changed files with 68 additions and 17 deletions
--- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
+++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
@@ -12,6 +12,7 @@
 #include "shared/source/helpers/bindless_heaps_helper.h"
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_memory_access_modes.h"
 #include "shared/source/helpers/per_thread_data.h"
 #include "shared/source/helpers/ray_tracing_helper.h"
 #include "shared/source/indirect_heap/indirect_heap.h"
 #include "shared/source/kernel/implicit_args.h"
@@ -295,19 +296,36 @@ TEST_F(SetKernelArgCacheTest, givenValidBufferArgumentWhenSetMultipleTimesThenSe
 using KernelImpSetGroupSizeTest = Test<DeviceFixture>;
-TEST_F(KernelImpSetGroupSizeTest, WhenCalculatingLocalIdsThenGrfSizeIsTakenFromCapabilityTable) {
+TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSettingGroupSizeThenProperlyGenerateLocalIds) {
    Mock<KernelImp> mockKernel;
    Mock<Module> mockModule(this->device, nullptr);
    mockKernel.descriptor.kernelAttributes.simdSize = 1;
    mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced
    mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
    mockKernel.module = &mockModule;
    auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize;
    uint32_t groupSize[3] = {2, 3, 5};
    auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
-    EXPECT_EQ(groupSize[0] * groupSize[1] * groupSize[2], mockKernel.numThreadsPerThreadGroup);
+
-    EXPECT_EQ(grfSize * groupSize[0] * groupSize[1] * groupSize[2], mockKernel.perThreadDataSizeForWholeThreadGroup);
+    const auto &gfxHelper = mockModule.getDevice()->getGfxCoreHelper();
-    ASSERT_LE(grfSize * groupSize[0] * groupSize[1] * groupSize[2], mockKernel.perThreadDataSizeForWholeThreadGroup);
+    auto numThreadsPerTG = gfxHelper.calculateNumThreadsPerThreadGroup(
        mockKernel.descriptor.kernelAttributes.simdSize,
        groupSize[0] * groupSize[1] * groupSize[2],
        grfSize,
        mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime);
    auto perThreadDataSizeForWholeTGNeeded =
        static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
            mockKernel.descriptor.kernelAttributes.simdSize,
            grfSize,
            mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
            groupSize[0] * groupSize[1] * groupSize[2],
            !mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
            gfxHelper));
    EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup());
    EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize);
    using LocalIdT = unsigned short;
    auto threadOffsetInLocalIds = grfSize / sizeof(LocalIdT);
    auto generatedLocalIds = reinterpret_cast<LocalIdT *>(mockKernel.perThreadDataForWholeThreadGroup);
@@ -335,7 +353,6 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSet
    uint32_t groupSize[3] = {2, 3, 5};
    auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
    EXPECT_EQ(groupSize[0] * groupSize[1] * groupSize[2], mockKernel.numThreadsPerThreadGroup);
    EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup);
    EXPECT_EQ(0u, mockKernel.perThreadDataSize);
    EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup);
--- a/shared/source/helpers/local_id_gen.h
+++ b/shared/source/helpers/local_id_gen.h
@@ -7,17 +7,21 @@
 #pragma once
 #include "shared/source/helpers/simd_helper.h"
 #include <array>
 #include <cstddef>
 #include <cstdint>
 namespace NEO {
 class GfxCoreHelper;
-inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
+inline uint32_t getNumGrfsPerLocalIdCoordinate(uint32_t simd, uint32_t grfSize) {
    return (simd == 32 && grfSize == 32) ? 2 : 1;
 }
 inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
    if (isSimd1(simd))
        simd = 32;
    auto result = lws + simd - 1;
    // Original logic:
@@ -27,17 +31,17 @@ inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
                   ? 5
               : simd == 16
                   ? 4
-               : simd == 8
+                   : 3; // for SIMD 8
                   ? 3
                   : 0;
    return result;
 }
 inline uint32_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t grfSize, uint32_t numChannels = 3) {
-    auto numGRFSPerThread = getGRFsPerThread(simd, grfSize);
+    if (isSimd1(simd)) {
-    uint32_t returnSize = numGRFSPerThread * grfSize * (simd == 1 ? 1u : numChannels);
+        return grfSize;
-    returnSize = std::max(returnSize, grfSize);
+    }
    auto numGRFSPerLocalIdCoord = getNumGrfsPerLocalIdCoordinate(simd, grfSize);
    uint32_t returnSize = numGRFSPerLocalIdCoord * grfSize * numChannels;
    return returnSize;
 }
--- a/shared/source/helpers/per_thread_data.h
+++ b/shared/source/helpers/per_thread_data.h
@@ -8,6 +8,7 @@
 #pragma once
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/local_id_gen.h"
 #include "shared/source/helpers/simd_helper.h"
 #include <cstddef>
 #include <cstdint>
@@ -23,7 +24,11 @@ struct PerThreadDataHelper {
        size_t localWorkSize,
        bool isHwLocalIdGeneration,
        const GfxCoreHelper &gfxCoreHelper) {
-        return gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
+        auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
        if (isSimd1(simd)) {
            return perThreadSizeLocalIDs * localWorkSize;
        }
        return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfSize, isHwLocalIdGeneration);
    }
 }; // namespace PerThreadDataHelper
 } // namespace NEO
--- a/shared/test/unit_test/helpers/local_id_tests.cpp
+++ b/shared/test/unit_test/helpers/local_id_tests.cpp
@@ -23,22 +23,27 @@ using LocalIdTests = ::testing::Test;
 HWTEST_F(LocalIdTests, GivenSimd8WhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 8;
-    EXPECT_EQ(1u, getGRFsPerThread(simd, 32));
+    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
 }
 HWTEST_F(LocalIdTests, GivenSimd16WhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 16;
-    EXPECT_EQ(1u, getGRFsPerThread(simd, 32));
+    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
 }
 HWTEST_F(LocalIdTests, GivenSimd32WhenGettingGrfsPerThreadThenTwoIsReturned) {
    uint32_t simd = 32;
-    EXPECT_EQ(2u, getGRFsPerThread(simd, 32));
+    EXPECT_EQ(2u, getNumGrfsPerLocalIdCoordinate(simd, 32));
 }
 HWTEST_F(LocalIdTests, GivenSimd1WhenGettingGrfsPerThreadThenOneIsReturned) {
    uint32_t simd = 1;
    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32));
 }
 HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwoIsReturned) {
    uint32_t simd = 32;
-    EXPECT_EQ(1u, getGRFsPerThread(simd, 33));
+    EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 33));
 }
 TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
@@ -78,6 +83,26 @@ TEST(LocalID, GivenSimd1WhenGettingPerThreadSizeLocalIdsThenValueIsEqualGrfSize)
    EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize));
 }
 TEST(LocalID, WhenThreadsPerWgAreGeneratedThenCalculationsAreCorrect) {
    const auto lws = 33u;
    for (const auto &simd : {1u, 8u, 16u, 32u}) {
        switch (simd) {
        case 1u: // treat SIMD 1 as SIMD 32 in such case
        case 32u:
            EXPECT_EQ((lws + std::max(32u, simd) - 1) >> 5, getThreadsPerWG(simd, lws));
            break;
        case 8u:
            EXPECT_EQ((lws + simd - 1) >> 3, getThreadsPerWG(simd, lws));
            break;
        case 16u:
            EXPECT_EQ((lws + simd - 1) >> 4, getThreadsPerWG(simd, lws));
            break;
        default:
            break;
        }
    }
 }
 TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) {
    auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);