From fc099ead2e10ef035e773779a216dc131ea52adc Mon Sep 17 00:00:00 2001 From: Kacper Nowak Date: Fri, 1 Sep 2023 19:20:39 +0200 Subject: [PATCH] fix: Correct logic for SIMD1 - For calculating number of threads per workgroup, treat simd 1 as it was simd 32 - Correct logic of calculating space for per thread data for simd 1 - Minor: unit tests refactor - Corrected naming Related-To: NEO-8261 Signed-off-by: Kacper Nowak --- .../unit_tests/sources/kernel/test_kernel.cpp | 27 ++++++++++++--- shared/source/helpers/local_id_gen.h | 18 ++++++---- shared/source/helpers/per_thread_data.h | 7 +++- .../test/unit_test/helpers/local_id_tests.cpp | 33 ++++++++++++++++--- 4 files changed, 68 insertions(+), 17 deletions(-) diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 91b387e85f..d95e363171 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -12,6 +12,7 @@ #include "shared/source/helpers/bindless_heaps_helper.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_memory_access_modes.h" +#include "shared/source/helpers/per_thread_data.h" #include "shared/source/helpers/ray_tracing_helper.h" #include "shared/source/indirect_heap/indirect_heap.h" #include "shared/source/kernel/implicit_args.h" @@ -295,19 +296,36 @@ TEST_F(SetKernelArgCacheTest, givenValidBufferArgumentWhenSetMultipleTimesThenSe using KernelImpSetGroupSizeTest = Test; -TEST_F(KernelImpSetGroupSizeTest, WhenCalculatingLocalIdsThenGrfSizeIsTakenFromCapabilityTable) { +TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSettingGroupSizeThenProperlyGenerateLocalIds) { Mock mockKernel; Mock mockModule(this->device, nullptr); mockKernel.descriptor.kernelAttributes.simdSize = 1; + mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3; mockKernel.module = &mockModule; auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize; uint32_t groupSize[3] = {2, 3, 5}; auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]); EXPECT_EQ(ZE_RESULT_SUCCESS, ret); - EXPECT_EQ(groupSize[0] * groupSize[1] * groupSize[2], mockKernel.numThreadsPerThreadGroup); - EXPECT_EQ(grfSize * groupSize[0] * groupSize[1] * groupSize[2], mockKernel.perThreadDataSizeForWholeThreadGroup); - ASSERT_LE(grfSize * groupSize[0] * groupSize[1] * groupSize[2], mockKernel.perThreadDataSizeForWholeThreadGroup); + + const auto &gfxHelper = mockModule.getDevice()->getGfxCoreHelper(); + auto numThreadsPerTG = gfxHelper.calculateNumThreadsPerThreadGroup( + mockKernel.descriptor.kernelAttributes.simdSize, + groupSize[0] * groupSize[1] * groupSize[2], + grfSize, + mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime); + auto perThreadDataSizeForWholeTGNeeded = + static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( + mockKernel.descriptor.kernelAttributes.simdSize, + grfSize, + mockKernel.descriptor.kernelAttributes.numLocalIdChannels, + groupSize[0] * groupSize[1] * groupSize[2], + !mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime, + gfxHelper)); + + EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup()); + EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize); + using LocalIdT = unsigned short; auto threadOffsetInLocalIds = grfSize / sizeof(LocalIdT); auto generatedLocalIds = reinterpret_cast(mockKernel.perThreadDataForWholeThreadGroup); @@ -335,7 +353,6 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSet uint32_t groupSize[3] = {2, 3, 5}; auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]); EXPECT_EQ(ZE_RESULT_SUCCESS, ret); - EXPECT_EQ(groupSize[0] * groupSize[1] * groupSize[2], mockKernel.numThreadsPerThreadGroup); EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup); EXPECT_EQ(0u, mockKernel.perThreadDataSize); EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup); diff --git a/shared/source/helpers/local_id_gen.h b/shared/source/helpers/local_id_gen.h index ab88965571..6477c6ab19 100644 --- a/shared/source/helpers/local_id_gen.h +++ b/shared/source/helpers/local_id_gen.h @@ -7,17 +7,21 @@ #pragma once +#include "shared/source/helpers/simd_helper.h" + #include #include #include namespace NEO { class GfxCoreHelper; -inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) { +inline uint32_t getNumGrfsPerLocalIdCoordinate(uint32_t simd, uint32_t grfSize) { return (simd == 32 && grfSize == 32) ? 2 : 1; } inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) { + if (isSimd1(simd)) + simd = 32; auto result = lws + simd - 1; // Original logic: @@ -27,17 +31,17 @@ inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) { ? 5 : simd == 16 ? 4 - : simd == 8 - ? 3 - : 0; + : 3; // for SIMD 8 return result; } inline uint32_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t grfSize, uint32_t numChannels = 3) { - auto numGRFSPerThread = getGRFsPerThread(simd, grfSize); - uint32_t returnSize = numGRFSPerThread * grfSize * (simd == 1 ? 1u : numChannels); - returnSize = std::max(returnSize, grfSize); + if (isSimd1(simd)) { + return grfSize; + } + auto numGRFSPerLocalIdCoord = getNumGrfsPerLocalIdCoordinate(simd, grfSize); + uint32_t returnSize = numGRFSPerLocalIdCoord * grfSize * numChannels; return returnSize; } diff --git a/shared/source/helpers/per_thread_data.h b/shared/source/helpers/per_thread_data.h index e8f6fe7766..174a4fec86 100644 --- a/shared/source/helpers/per_thread_data.h +++ b/shared/source/helpers/per_thread_data.h @@ -8,6 +8,7 @@ #pragma once #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen.h" +#include "shared/source/helpers/simd_helper.h" #include #include @@ -23,7 +24,11 @@ struct PerThreadDataHelper { size_t localWorkSize, bool isHwLocalIdGeneration, const GfxCoreHelper &gfxCoreHelper) { - return gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfSize, isHwLocalIdGeneration) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels); + auto perThreadSizeLocalIDs = static_cast(getPerThreadSizeLocalIDs(simd, grfSize, numChannels)); + if (isSimd1(simd)) { + return perThreadSizeLocalIDs * localWorkSize; + } + return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfSize, isHwLocalIdGeneration); } }; // namespace PerThreadDataHelper } // namespace NEO diff --git a/shared/test/unit_test/helpers/local_id_tests.cpp b/shared/test/unit_test/helpers/local_id_tests.cpp index 661277ce9b..b8318c9c01 100644 --- a/shared/test/unit_test/helpers/local_id_tests.cpp +++ b/shared/test/unit_test/helpers/local_id_tests.cpp @@ -23,22 +23,27 @@ using LocalIdTests = ::testing::Test; HWTEST_F(LocalIdTests, GivenSimd8WhenGettingGrfsPerThreadThenOneIsReturned) { uint32_t simd = 8; - EXPECT_EQ(1u, getGRFsPerThread(simd, 32)); + EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32)); } HWTEST_F(LocalIdTests, GivenSimd16WhenGettingGrfsPerThreadThenOneIsReturned) { uint32_t simd = 16; - EXPECT_EQ(1u, getGRFsPerThread(simd, 32)); + EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32)); } HWTEST_F(LocalIdTests, GivenSimd32WhenGettingGrfsPerThreadThenTwoIsReturned) { uint32_t simd = 32; - EXPECT_EQ(2u, getGRFsPerThread(simd, 32)); + EXPECT_EQ(2u, getNumGrfsPerLocalIdCoordinate(simd, 32)); +} + +HWTEST_F(LocalIdTests, GivenSimd1WhenGettingGrfsPerThreadThenOneIsReturned) { + uint32_t simd = 1; + EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 32)); } HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwoIsReturned) { uint32_t simd = 32; - EXPECT_EQ(1u, getGRFsPerThread(simd, 33)); + EXPECT_EQ(1u, getNumGrfsPerLocalIdCoordinate(simd, 33)); } TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) { @@ -78,6 +83,26 @@ TEST(LocalID, GivenSimd1WhenGettingPerThreadSizeLocalIdsThenValueIsEqualGrfSize) EXPECT_EQ(grfSize, getPerThreadSizeLocalIDs(simd, grfSize)); } +TEST(LocalID, WhenThreadsPerWgAreGeneratedThenCalculationsAreCorrect) { + const auto lws = 33u; + for (const auto &simd : {1u, 8u, 16u, 32u}) { + switch (simd) { + case 1u: // treat SIMD 1 as SIMD 32 in such case + case 32u: + EXPECT_EQ((lws + std::max(32u, simd) - 1) >> 5, getThreadsPerWG(simd, lws)); + break; + case 8u: + EXPECT_EQ((lws + simd - 1) >> 3, getThreadsPerWG(simd, lws)); + break; + case 16u: + EXPECT_EQ((lws + simd - 1) >> 4, getThreadsPerWG(simd, lws)); + break; + default: + break; + } + } +} + TEST(LocalIdTest, givenVariadicGrfSizeWhenLocalSizesAreEmittedThenUseFullRowSize) { auto localIdsPtr = allocateAlignedMemory(3 * 64u, MemoryConstants::cacheLineSize);