diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index b749a7531c..0c68046f22 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -398,13 +398,22 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, if (numChannels > 0) { UNRECOVERABLE_IF(3 != numChannels); + + std::array walkOrder{0, 1, 2}; /* Default walk order 0,1,2; replaced just below when the kernel descriptor sets requiresWorkgroupWalkOrder. */ + if (kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder) { + walkOrder = { + kernelDescriptor.kernelAttributes.workgroupWalkOrder[0], + kernelDescriptor.kernelAttributes.workgroupWalkOrder[1], + kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}; + } + NEO::generateLocalIDs( perThreadDataForWholeThreadGroup, static_cast(simdSize), std::array{{static_cast(groupSizeX), static_cast(groupSizeY), static_cast(groupSizeZ)}}, - std::array{{0, 1, 2}}, + walkOrder, /* Previously a hard-coded {0, 1, 2}; now the order selected above. */ false, grfSize, grfCount, rootDeviceEnvironment); } diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp index 4a2b1209e6..c12a9a7f98 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp @@ -8,6 +8,7 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/basic_math.h" #include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/simd_helper.h" #include "shared/test/common/helpers/raii_gfx_core_helper.h" #include "shared/test/common/mocks/mock_bindless_heaps_helper.h" @@ -831,5 +832,49 @@ TEST_F(KernelImpTest, GivenGroupSizeRequiresSwLocalIdsGenerationWhenNextGroupSiz EXPECT_EQ(0u, kernel.getPerThreadDataSize()); } +TEST_F(KernelImpTest, GivenGroupSizeRequiresSwLocalIdsGenerationWhenKernelSpecifiesRequiredWalkOrderThenUseCorrectOrderToGenerateLocalIds) { /* Sets a required workgroup walk order of 2,1,0 on the kernel descriptor, calls setGroupSize(12, 12, 1), then compares the kernel's per-thread data with NEO::generateLocalIDs output produced using that same order. */ + Mock module(device, 
nullptr); + Mock<::L0::KernelImp> kernel; + kernel.module = &module; + + auto grfSize = device->getHwInfo().capabilityTable.grfSize; + + WhiteBox<::L0::KernelImmutableData> kernelInfo = {}; + NEO::KernelDescriptor descriptor; + kernelInfo.kernelDescriptor = &descriptor; + kernelInfo.kernelDescriptor->kernelAttributes.numLocalIdChannels = 3; + kernelInfo.kernelDescriptor->kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber; + kernelInfo.kernelDescriptor->kernelAttributes.flags.requiresWorkgroupWalkOrder = true; + kernelInfo.kernelDescriptor->kernelAttributes.workgroupWalkOrder[0] = 2; + kernelInfo.kernelDescriptor->kernelAttributes.workgroupWalkOrder[1] = 1; + kernelInfo.kernelDescriptor->kernelAttributes.workgroupWalkOrder[2] = 0; /* Required order 2,1,0 is the reverse of the 0,1,2 default used by setGroupSize. */ + kernelInfo.kernelDescriptor->kernelAttributes.simdSize = 32; + + kernel.kernelImmData = &kernelInfo; + + kernel.enableForcingOfGenerateLocalIdByHw = true; + kernel.forceGenerateLocalIdByHw = false; /* NOTE(review): presumably forces the software local-ID generation path in the mock; confirm against Mock<KernelImp>. */ + + kernel.KernelImp::setGroupSize(12, 12, 1); + + uint32_t perThreadSizeNeeded = kernel.getPerThreadDataSizeForWholeThreadGroup(); + auto testPerThreadDataBuffer = static_cast(alignedMalloc(perThreadSizeNeeded, 32)); + + std::array walkOrder{2, 1, 0}; /* Same order as written to workgroupWalkOrder above; the reference buffer below also reuses SIMD size 32 and group size 12,12,1. */ + + NEO::generateLocalIDs( + testPerThreadDataBuffer, + static_cast(32), + std::array{{static_cast(12), + static_cast(12), + static_cast(1)}}, + walkOrder, + false, grfSize, GrfConfig::defaultGrfNumber, device->getNEODevice()->getRootDeviceEnvironment()); + + EXPECT_EQ(0, memcmp(testPerThreadDataBuffer, kernel.KernelImp::getPerThreadData(), perThreadSizeNeeded)); /* The kernel's per-thread data must equal the reference buffer byte for byte. */ + + alignedFree(testPerThreadDataBuffer); +} + } // namespace ult } // namespace L0