diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 84fb371322..994fc2d979 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -21,6 +21,7 @@ #include "shared/source/helpers/per_thread_data.h" #include "shared/source/helpers/ray_tracing_helper.h" #include "shared/source/helpers/register_offsets.h" +#include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/string.h" #include "shared/source/helpers/surface_format_info.h" #include "shared/source/kernel/implicit_args.h" @@ -331,7 +332,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, auto remainderSimdLanes = itemsInGroup & (simdSize - 1u); threadExecutionMask = static_cast(maxNBitValue(remainderSimdLanes)); if (!threadExecutionMask) { - threadExecutionMask = static_cast(maxNBitValue((simdSize == 1) ? 32 : simdSize)); + threadExecutionMask = static_cast(maxNBitValue((isSimd1(simdSize)) ? 32 : simdSize)); } evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp index 547251083c..fbe5c92aa8 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp @@ -8,6 +8,7 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" #include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/simd_helper.h" #include "shared/test/common/helpers/raii_gfx_core_helper.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/mocks/mock_l0_debugger.h" @@ -82,7 +83,7 @@ TEST_F(KernelImpTest, givenExecutionMaskWithoutReminderWhenProgrammingItsValueTh descriptor.kernelAttributes.simdSize = simd; kernel.KernelImp::setGroupSize(simd, 1, 1); - if (simd == 1) { + if (isSimd1(simd)) { EXPECT_EQ(maxNBitValue(32), kernel.KernelImp::getThreadExecutionMask()); } else { EXPECT_EQ(maxNBitValue(simd), kernel.KernelImp::getThreadExecutionMask()); diff --git a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl index 1deafc46a4..a51cc791b8 100644 --- a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl +++ b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl @@ -46,7 +46,7 @@ size_t GpgpuWalkerHelper::setGpgpuWalkerThreadData( auto remainderSimdLanes = localWorkSize & (simd - 1); uint64_t executionMask = maxNBitValue(remainderSimdLanes); if (!executionMask) { - executionMask = maxNBitValue((simd == 1) ? 32 : simd); + executionMask = maxNBitValue(isSimd1(simd) ? 32 : simd); } walkerCmd->setExecutionMask(static_cast(executionMask)); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index b9c6f0414a..3bf4745c7c 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -25,6 +25,7 @@ #include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/kernel_helpers.h" #include "shared/source/helpers/ptr_math.h" +#include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/surface_format_info.h" #include "shared/source/kernel/implicit_args.h" #include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h" @@ -85,7 +86,7 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c program->retainForKernel(); imageTransformer.reset(new ImageTransformer); auto &deviceInfo = getDevice().getDevice().getDeviceInfo(); - if (kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize == 1u) { + if (isSimd1(kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize)) { auto &productHelper = getDevice().getProductHelper(); maxKernelWorkGroupSize = productHelper.getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast(deviceInfo.maxNumEUsPerSubSlice), static_cast(deviceInfo.maxNumEUsPerDualSubSlice)); } else { @@ -2375,7 +2376,7 @@ void Kernel::setLocalIdsForGroup(const Vec3 &groupSize, void *destinat size_t Kernel::getLocalIdsSizeForGroup(const Vec3 &groupSize) const { UNRECOVERABLE_IF(localIdsCache.get() == nullptr); - return localIdsCache->getLocalIdsSizeForGroup(groupSize); + return localIdsCache->getLocalIdsSizeForGroup(groupSize, getGfxCoreHelper()); } size_t Kernel::getLocalIdsSizePerThread() const { diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index 10f2dd5af9..a66347baa0 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -1487,4 +1487,4 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp size = alignUp(size, MemoryConstants::cacheLineSize); EXPECT_EQ(size, iohSizeWithImplicitArgs); } -} +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index 657447756b..1b9b6004a0 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -10,6 +10,7 @@ #include "shared/source/command_stream/linear_stream.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/timestamp_packet.h" #include "shared/source/indirect_heap/indirect_heap.h" #include "shared/source/memory_manager/unified_memory_manager.h" @@ -770,7 +771,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenExecution hwParser.findHardwareCommands(); auto walker = genCmdCast(*hwParser.itorWalker); - if (simd == 1) { + if (isSimd1(simd)) { EXPECT_EQ(maxNBitValue(32), walker->getExecutionMask()); } else { EXPECT_EQ(maxNBitValue(simd), walker->getExecutionMask()); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 9f641f99c9..f49470c16e 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -492,7 +492,7 @@ void EncodeDispatchKernel::encodeThreadData(WALKER_TYPE &walkerCmd, auto remainderSimdLanes = workGroupSize & (simd - 1); executionMask = maxNBitValue(remainderSimdLanes); if (!executionMask) { - executionMask = maxNBitValue((simd == 1) ? 32 : simd); + executionMask = maxNBitValue(isSimd1(simd) ? 32 : simd); } } diff --git a/shared/source/helpers/simd_helper.h b/shared/source/helpers/simd_helper.h index e1376535cf..bee96272bd 100644 --- a/shared/source/helpers/simd_helper.h +++ b/shared/source/helpers/simd_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,7 +8,10 @@ #pragma once #include +constexpr bool isSimd1(uint32_t simdSize) { + return simdSize == 1u; +} template constexpr typename WALKER_TYPE::SIMD_SIZE getSimdConfig(uint32_t simdSize) { - return static_cast((simdSize == 1) ? (32 >> 4) : (simdSize >> 4)); + return static_cast(isSimd1(simdSize) ? (32 >> 4) : (simdSize >> 4)); } diff --git a/shared/source/kernel/implicit_args_helper.cpp b/shared/source/kernel/implicit_args_helper.cpp index 39070ad831..240557c5d8 100644 --- a/shared/source/kernel/implicit_args_helper.cpp +++ b/shared/source/kernel/implicit_args_helper.cpp @@ -10,6 +10,7 @@ #include "shared/source/helpers/hw_walk_order.h" #include "shared/source/helpers/per_thread_data.h" #include "shared/source/helpers/ptr_math.h" +#include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/string.h" #include "shared/source/helpers/vec.h" #include "shared/source/kernel/implicit_args.h" @@ -35,7 +36,7 @@ std::array getDimensionOrderForLocalIds(const uint8_t *workgroupDime } uint32_t getGrfSize(uint32_t simd) { - if (simd == 1u) { + if (isSimd1(simd)) { return 3 * sizeof(uint16_t); } return 32u; diff --git a/shared/source/kernel/local_ids_cache.cpp b/shared/source/kernel/local_ids_cache.cpp index f678b72fbb..b82e324b44 100644 --- a/shared/source/kernel/local_ids_cache.cpp +++ b/shared/source/kernel/local_ids_cache.cpp @@ -9,7 +9,9 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen.h" +#include "shared/source/helpers/simd_helper.h" #include @@ -32,9 +34,12 @@ std::unique_lock LocalIdsCache::lock() { return std::unique_lock(setLocalIdsMutex); } -size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3 &group) const { +size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) const { const auto numElementsInGroup = static_cast(Math::computeTotalElementsCount({group[0], group[1], group[2]})); - const auto numberOfThreads = getThreadsPerWG(simdSize, numElementsInGroup); + if (isSimd1(simdSize)) { + return static_cast(numElementsInGroup * localIdsSizePerThread); + } + const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfSize, false); return static_cast(numberOfThreads * localIdsSizePerThread); } @@ -65,7 +70,7 @@ void LocalIdsCache::setLocalIdsForGroup(const Vec3 &group, void *desti } void LocalIdsCache::commitNewEntry(LocalIdsCacheEntry &entry, const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) { - entry.localIdsSize = getLocalIdsSizeForGroup(group); + entry.localIdsSize = getLocalIdsSizeForGroup(group, gfxCoreHelper); entry.groupSize = group; entry.accessCounter = 0U; if (entry.localIdsSize > entry.localIdsSizeAllocated) { diff --git a/shared/source/kernel/local_ids_cache.h b/shared/source/kernel/local_ids_cache.h index 26955ed6b0..b4c42fbcae 100644 --- a/shared/source/kernel/local_ids_cache.h +++ b/shared/source/kernel/local_ids_cache.h @@ -31,7 +31,7 @@ class LocalIdsCache { ~LocalIdsCache(); void setLocalIdsForGroup(const Vec3 &group, void *destination, const GfxCoreHelper &gfxCoreHelper); - size_t getLocalIdsSizeForGroup(const Vec3 &group) const; + size_t getLocalIdsSizeForGroup(const Vec3 &group, const GfxCoreHelper &gfxCoreHelper) const; size_t getLocalIdsSizePerThread() const; protected: diff --git a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp index 602832f91b..2f4f96a0b3 100644 --- a/shared/test/unit_test/kernel/local_ids_cache_tests.cpp +++ b/shared/test/unit_test/kernel/local_ids_cache_tests.cpp @@ -15,15 +15,15 @@ #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/test.h" +class MockLocalIdsCache : public NEO::LocalIdsCache { + public: + using Base = NEO::LocalIdsCache; + using Base::Base; + using Base::cache; + MockLocalIdsCache(size_t cacheSize) : MockLocalIdsCache(cacheSize, 32u){}; + MockLocalIdsCache(size_t cacheSize, uint8_t simd) : Base(cacheSize, {0, 1, 2}, simd, 32, false){}; +}; struct LocalIdsCacheFixture { - class MockLocalIdsCache : public NEO::LocalIdsCache { - public: - using Base = NEO::LocalIdsCache; - using Base::Base; - using Base::cache; - MockLocalIdsCache(size_t cacheSize) : Base(cacheSize, {0, 1, 2}, 32, 32, false){}; - }; - void setUp() { localIdsCache = std::make_unique(1); } @@ -34,8 +34,8 @@ struct LocalIdsCacheFixture { std::unique_ptr localIdsCache; }; -using LocalIdsCacheTest = Test; -TEST_F(LocalIdsCacheTest, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) { +using LocalIdsCacheTests = Test; +TEST_F(LocalIdsCacheTests, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCommitedIntoLeastUsedEntry) { localIdsCache->cache.resize(2); localIdsCache->cache[0].accessCounter = 2U; auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); @@ -48,7 +48,7 @@ TEST_F(LocalIdsCacheTest, GivenCacheMissWhenGetLocalIdsForGroupThenNewEntryIsCom EXPECT_EQ(1U, localIdsCache->cache[1].accessCounter); } -TEST_F(LocalIdsCacheTest, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFromCacheIsUsed) { +TEST_F(LocalIdsCacheTests, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFromCacheIsUsed) { localIdsCache->cache[0].groupSize = groupSize; localIdsCache->cache[0].localIdsData = static_cast(alignedMalloc(512, 32)); localIdsCache->cache[0].localIdsSize = 512U; @@ -59,7 +59,7 @@ TEST_F(LocalIdsCacheTest, GivenEntryInCacheWhenGetLocalIdsForGroupThenEntryFromC EXPECT_EQ(2U, localIdsCache->cache[0].accessCounter); } -TEST_F(LocalIdsCacheTest, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsForGroupThenBufferIsReused) { +TEST_F(LocalIdsCacheTests, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsForGroupThenBufferIsReused) { localIdsCache->cache[0].groupSize = {4, 1, 1}; localIdsCache->cache[0].localIdsData = static_cast(alignedMalloc(512, 32)); localIdsCache->cache[0].localIdsSize = 512U; @@ -76,12 +76,22 @@ TEST_F(LocalIdsCacheTest, GivenEntryWithBiggerBufferAllocatedWhenGetLocalIdsForG EXPECT_EQ(localIdsData, localIdsCache->cache[0].localIdsData); } -TEST_F(LocalIdsCacheTest, GivenValidLocalIdsCacheWhenGettingLocalIdsSizePerThreadThenCorrectValueIsReturned) { +TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizePerThreadThenCorrectValueIsReturned) { auto localIdsSizePerThread = localIdsCache->getLocalIdsSizePerThread(); EXPECT_EQ(192U, localIdsSizePerThread); } -TEST_F(LocalIdsCacheTest, GivenValidLocalIdsCacheWhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) { - auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize); +TEST_F(LocalIdsCacheTests, GivenValidLocalIdsCacheWhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) { + auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); + auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, *gfxCoreHelper.get()); EXPECT_EQ(1536U, localIdsSizePerThread); } + +TEST(LocalIdsCacheTest, givenSimd1WhenGettingLocalIdsSizeForGroupThenCorrectValueIsReturned) { + auto gfxCoreHelper = NEO::GfxCoreHelper::create(NEO::defaultHwInfo->platform.eRenderCoreFamily); + auto localIdsCache = std::make_unique(1u, 1u); + Vec3 groupSize = {128, 2, 1}; + auto localIdsSizePerThread = localIdsCache->getLocalIdsSizeForGroup(groupSize, *gfxCoreHelper.get()); + auto expectedLocalIdsSizePerThread = groupSize[0] * groupSize[1] * groupSize[2] * localIdsCache->getLocalIdsSizePerThread(); + EXPECT_EQ(expectedLocalIdsSizePerThread, localIdsSizePerThread); +} \ No newline at end of file