diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index f4e7f82f8f..cda485bac8 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -387,13 +387,13 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment(); auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup( - simdSize, static_cast(itemsInGroup), grfCount, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); + simdSize, static_cast(itemsInGroup), grfCount, rootDeviceEnvironment); if (kernelRequiresGenerationOfLocalIdsByRuntime) { auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; uint32_t perThreadDataSizeForWholeThreadGroupNeeded = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( - simdSize, grfSize, grfCount, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment)); + simdSize, grfSize, grfCount, numChannels, itemsInGroup, rootDeviceEnvironment)); if (perThreadDataSizeForWholeThreadGroupNeeded > perThreadDataSizeForWholeThreadGroupAllocated) { alignedFree(perThreadDataForWholeThreadGroup); @@ -940,7 +940,7 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) uint32_t maxKernelWorkGroupSize = static_cast(this->module->getMaxGroupSize(kernelDescriptor)); const auto &rootDeviceEnvironment = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironment(); - maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize, rootDeviceEnvironment); + maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, maxKernelWorkGroupSize, rootDeviceEnvironment); pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize; void *pNext = pKernelProperties->pNext; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index d6f2e2f1e1..892924bb62 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -929,7 +929,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(); - size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); + size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -976,7 +976,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(); - size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); + size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index d4fade693e..d2b5a5857a 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -405,7 +405,6 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett mockKernel.descriptor.kernelAttributes.simdSize, groupSize[0] * groupSize[1] * groupSize[2], numGrf, - mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment); auto perThreadDataSizeForWholeTGNeeded = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( @@ -414,7 +413,6 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett numGrf, mockKernel.descriptor.kernelAttributes.numLocalIdChannels, groupSize[0] * groupSize[1] * groupSize[2], - !mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment)); EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup()); @@ -1968,7 +1966,7 @@ TEST_F(KernelPropertiesTests, whenPassingKernelMaxGroupSizePropertiesStructToGet EXPECT_EQ(ZE_RESULT_SUCCESS, res); auto &device = *module->getDevice(); auto &gfxCoreHelper = device.getGfxCoreHelper(); - uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast(this->module->getMaxGroupSize(kernelDescriptor)), device.getNEODevice()->getRootDeviceEnvironment()); + uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, static_cast(this->module->getMaxGroupSize(kernelDescriptor)), device.getNEODevice()->getRootDeviceEnvironment()); EXPECT_EQ(maxKernelWorkGroupSize, maxGroupSizeProperties.maxGroupSize); } diff --git a/level_zero/core/test/unit_tests/xe2_hpg_core/test_module_xe2_hpg_core.cpp b/level_zero/core/test/unit_tests/xe2_hpg_core/test_module_xe2_hpg_core.cpp index fde48b7068..666edcd8d0 100644 --- a/level_zero/core/test/unit_tests/xe2_hpg_core/test_module_xe2_hpg_core.cpp +++ b/level_zero/core/test/unit_tests/xe2_hpg_core/test_module_xe2_hpg_core.cpp @@ -69,17 +69,13 @@ XE2_HPG_CORETEST_F(Xe2KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThre module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 1u, 64u}, // SIMT Size, HW local-id generation, Max Num of threads - {32u, 1u, 32u}, - {16u, 0u, 64u}, - {32u, 0u, 64u}, - + std::array, 2> values = {{ + {16u, 64u}, // SIMT Size, Max Num of threads + {32u, 32u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; @@ -96,16 +92,13 @@ XE2_HPG_CORETEST_F(Xe2KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThre module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 0u, 32u}, // SIMT Size, HW local-id generation, Max Num of threads - {16u, 1u, 32u}, - {32u, 0u, 32u}, - {32u, 1u, 32u}, + std::array, 2> values = {{ + {16u, 32u}, // SIMT Size, Max Num of threads + {32u, 32u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; diff --git a/level_zero/core/test/unit_tests/xe3_core/test_module_xe3_core.cpp b/level_zero/core/test/unit_tests/xe3_core/test_module_xe3_core.cpp index 9597220a04..9634849cd6 100644 --- a/level_zero/core/test/unit_tests/xe3_core/test_module_xe3_core.cpp +++ b/level_zero/core/test/unit_tests/xe3_core/test_module_xe3_core.cpp @@ -70,17 +70,14 @@ XE3_CORETEST_F(Xe3KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThreadsP module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 0u, 64u}, // SIMT Size, HW local-id generation, Max Num of threads - {16u, 1u, 64u}, - {32u, 1u, 32u}, - {32u, 0u, 64u}, + std::array, 2> values = {{ + {16u, 64u}, // SIMT Size, Max Num of threads + {32u, 32u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; @@ -97,16 +94,13 @@ XE3_CORETEST_F(Xe3KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThreadsP module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 0u, 48u}, // SIMT Size, HW local-id generation, Max Num of threads - {16u, 1u, 48u}, - {32u, 1u, 32u}, - {32u, 0u, 48u}, + std::array, 2> values = {{ + {16u, 48u}, // SIMT Size, Max Num of threads + {32u, 32u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; @@ -123,16 +117,13 @@ XE3_CORETEST_F(Xe3KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThreadsP module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 0u, 40u}, // SIMT Size, HW local-id generation, Max Num of threads - {16u, 1u, 40u}, - {32u, 1u, 32u}, - {32u, 0u, 40u}, + std::array, 2> values = {{ + {16u, 40u}, // SIMT Size, Max Num of threads + {32u, 32u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; @@ -149,16 +140,13 @@ XE3_CORETEST_F(Xe3KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThreadsP module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 0u, 32u}, // SIMT Size, HW local-id generation, Max Num of threads - {16u, 1u, 32u}, - {32u, 1u, 32u}, - {32u, 0u, 32u}, + std::array, 2> values = {{ + {16u, 32u}, // SIMT Size, Max Num of threads + {32u, 32u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; @@ -175,16 +163,13 @@ XE3_CORETEST_F(Xe3KernelSetupTests, givenParamsWhenSetupGroupSizeThenNumThreadsP module.getMaxGroupSizeResult = UINT32_MAX; kernel.module = &module; - std::array, 4> values = {{ - {16u, 0u, 16u}, // SIMT Size, HW local-id generation, Max Num of threads - {16u, 1u, 16u}, - {32u, 1u, 16u}, - {32u, 0u, 16u}, + std::array, 2> values = {{ + {16u, 16u}, // SIMT Size, Max Num of threads + {32u, 16u}, }}; - for (auto &[simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { + for (auto &[simtSize, expectedNumThreadsPerThreadGroup] : values) { kernel.descriptor.kernelAttributes.simdSize = simtSize; - kernel.forceGenerateLocalIdByHw = isHwLocalIdGeneration; kernel.setGroupSize(1024u, 1024u, 1024u); EXPECT_EQ(expectedNumThreadsPerThreadGroup, kernel.numThreadsPerThreadGroup); kernel.groupSize[0] = kernel.groupSize[1] = kernel.groupSize[2] = 0; diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 79286c84fd..bf6c7d1a05 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -69,7 +69,7 @@ size_t HardwareCommandsHelper::getSizeRequiredIOH(const Kernel &kerne requiredWalkOrder, simdSize); auto size = kernel.getCrossThreadDataSize() + - HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment); + HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, numChannels, localWorkSize, rootDeviceEnvironment); auto pImplicitArgs = kernel.getImplicitArgs(); if (pImplicitArgs) { @@ -297,7 +297,7 @@ size_t HardwareCommandsHelper::sendIndirectState( auto &gfxCoreHelper = device.getGfxCoreHelper(); auto grfCount = kernel.getDescriptor().kernelAttributes.numGrfRequired; auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; - auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkItems), grfCount, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment()); + auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkItems), grfCount, device.getRootDeviceEnvironment()); uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 116616686e..020bbb4ba3 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -2235,10 +2235,9 @@ void Kernel::reconfigureKernel() { const auto &kernelDescriptor = kernelInfo.kernelDescriptor; const auto &gfxCoreHelper = this->getGfxCoreHelper(); auto maxWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, this->maxKernelWorkGroupSize); - bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available maxWorkGroupSize = static_cast(kernelInfo.getMaxRequiredWorkGroupSize(maxWorkGroupSize)); - this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize, getDevice().getRootDeviceEnvironment()); + this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, maxWorkGroupSize, getDevice().getRootDeviceEnvironment()); this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites; this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode; diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index ff833eb283..2a9db40284 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -1380,7 +1380,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp auto numGrf = GrfConfig::defaultGrfNumber; auto size = kernelWithImplicitArgs.getCrossThreadDataSize() + - HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numGrf, numChannels, Math::computeTotalElementsCount(workGroupSize), false, rootDeviceEnvironment) + + HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numGrf, numChannels, Math::computeTotalElementsCount(workGroupSize), rootDeviceEnvironment) + ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, rootDeviceEnvironment); size = alignUp(size, NEO::EncodeDispatchKernel::getDefaultIOHAlignment()); diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index 0a75ce1bc3..b56325de7a 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -569,7 +569,7 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, HardwareCommandsTest, whenSendingIndirectStateThe auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; auto numGrf = GrfConfig::defaultGrfNumber; const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); - size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numGrf, numChannels, localWorkSize, !kernelUsesLocalIds, rootDeviceEnvironment); + size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numGrf, numChannels, localWorkSize, rootDeviceEnvironment); ASSERT_LE(expectedIohSize, ioh.getUsed()); auto expectedLocalIds = alignedMalloc(expectedIohSize, 64); @@ -1295,7 +1295,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(); - size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment); + size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -1330,7 +1330,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment); auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(); - size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment); + size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, rootDeviceEnvironment); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); diff --git a/shared/source/helpers/aarch64/local_id_gen.cpp b/shared/source/helpers/aarch64/local_id_gen.cpp index eee848a437..62ccfa8f06 100644 --- a/shared/source/helpers/aarch64/local_id_gen.cpp +++ b/shared/source/helpers/aarch64/local_id_gen.cpp @@ -44,9 +44,8 @@ LocalIDHelper::LocalIDHelper() { LocalIDHelper LocalIDHelper::initializer; void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) { - bool localIdsGeneratedByHw = false; auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); - auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment)); + auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment)); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); if (useLayoutForImages) { generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 09cc32d5b8..193ae9bce2 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -120,7 +120,7 @@ class GfxCoreHelper { virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; - virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; + virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0; virtual aub_stream::MMIOList getExtraMmioList(const HardwareInfo &hwInfo, const GmmHelper &gmmHelper) const = 0; @@ -165,7 +165,7 @@ class GfxCoreHelper { virtual bool isChipsetUniqueUUIDSupported() const = 0; virtual bool isTimestampShiftRequired() const = 0; virtual bool isRelaxedOrderingSupported() const = 0; - virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; + virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0; virtual DeviceHierarchyMode getDefaultDeviceHierarchy() const = 0; static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper); @@ -362,7 +362,7 @@ class GfxCoreHelperHw : public GfxCoreHelper { uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const override; - uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override; + uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override; size_t getMaxFillPaternSizeForCopyEngine() const override; bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const override; @@ -414,7 +414,7 @@ class GfxCoreHelperHw : public GfxCoreHelper { bool isChipsetUniqueUUIDSupported() const override; bool isTimestampShiftRequired() const override; bool isRelaxedOrderingSupported() const override; - uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override; + uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const override; uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override; DeviceHierarchyMode getDefaultDeviceHierarchy() const override; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 2fafe1c6be..5fb83ec222 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -684,7 +684,7 @@ uint32_t GfxCoreHelperHw::overrideMaxWorkGroupSize(uint32_t maxWG) co } template -uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { +uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { return defaultMaxGroupSize; } @@ -694,7 +694,7 @@ uint32_t GfxCoreHelperHw::getMinimalGrfSize() const { } template -uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const { +uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const { return getThreadsPerWG(simd, totalWorkItems); } diff --git a/shared/source/helpers/gfx_core_helper_xe3_and_later.inl b/shared/source/helpers/gfx_core_helper_xe3_and_later.inl index d6f65ae0cf..348c8ce21a 100644 --- a/shared/source/helpers/gfx_core_helper_xe3_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_xe3_and_later.inl @@ -25,4 +25,38 @@ uint32_t GfxCoreHelperHw::calculateAvailableThreadCount(const HardwareIn } return std::min(hwInfo.gtSystemInfo.ThreadCount, maxThreadsPerEuCount * hwInfo.gtSystemInfo.EUCount); } + +template <> +uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const { + uint32_t numThreadsPerThreadGroup = getThreadsPerWG(simd, totalWorkItems); + if (debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.get() == 1) { + return numThreadsPerThreadGroup; + } + + const auto &compilerProductHelper = rootDeviceEnvironment.getHelper(); + const auto &productHelper = rootDeviceEnvironment.getProductHelper(); + const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo(); + auto isHeaplessMode = compilerProductHelper.isHeaplessModeEnabled(hwInfo); + + uint32_t maxThreadsPerThreadGroup = 32u; + if (grfCount == 512) { + maxThreadsPerThreadGroup = 16u; + } else if ((grfCount == 256) || (simd == 32u)) { + // driver limit maxWorkgroupSize to 1024 (NEO-11881) so for simt 32 the max threads per thread group is 32 + maxThreadsPerThreadGroup = 32u; + } else if (grfCount == 192) { + maxThreadsPerThreadGroup = 40u; + } else if (grfCount == 160) { + maxThreadsPerThreadGroup = 48u; + } else if (grfCount <= 128) { + maxThreadsPerThreadGroup = 64u; + } + + maxThreadsPerThreadGroup = productHelper.adjustMaxThreadsPerThreadGroup(maxThreadsPerThreadGroup, simd, grfCount, isHeaplessMode); + + numThreadsPerThreadGroup = std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup); + DEBUG_BREAK_IF(numThreadsPerThreadGroup * simd > CommonConstants::maxWorkgroupSize); + return numThreadsPerThreadGroup; +} + } // namespace NEO diff --git a/shared/source/helpers/per_thread_data.h b/shared/source/helpers/per_thread_data.h index 90f1bb2d34..284524885d 100644 --- a/shared/source/helpers/per_thread_data.h +++ b/shared/source/helpers/per_thread_data.h @@ -24,14 +24,13 @@ struct PerThreadDataHelper { uint32_t grfCount, uint32_t numChannels, size_t localWorkSize, - bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) { auto perThreadSizeLocalIDs = static_cast(getPerThreadSizeLocalIDs(simd, grfSize, numChannels)); if (isSimd1(simd)) { return perThreadSizeLocalIDs * localWorkSize; } auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); - return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfCount, isHwLocalIdGeneration, rootDeviceEnvironment); + return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfCount, rootDeviceEnvironment); } }; // namespace PerThreadDataHelper } // namespace NEO diff --git a/shared/source/helpers/x86_64/local_id_gen.cpp b/shared/source/helpers/x86_64/local_id_gen.cpp index ab69181a41..8fe776440d 100644 --- a/shared/source/helpers/x86_64/local_id_gen.cpp +++ b/shared/source/helpers/x86_64/local_id_gen.cpp @@ -47,9 +47,8 @@ LocalIDHelper LocalIDHelper::initializer; // traditional function to generate local IDs void generateLocalIDs(void *buffer, uint16_t simd, const std::array &localWorkgroupSize, const std::array &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) { - bool localIdsGeneratedByHw = false; auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); - auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment)); + auto threadsPerWorkGroup = static_cast(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment)); bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd); if (useLayoutForImages) { generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd); diff --git a/shared/source/kernel/implicit_args_helper.cpp b/shared/source/kernel/implicit_args_helper.cpp index a45be26cd9..b99245703b 100644 --- a/shared/source/kernel/implicit_args_helper.cpp +++ b/shared/source/kernel/implicit_args_helper.cpp @@ -73,7 +73,7 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const } auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize); - localIdsSize = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment)); + localIdsSize = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, 3u, itemsInGroup, rootDeviceEnvironment)); localIdsSize = alignUp(localIdsSize, MemoryConstants::cacheLineSize); } return implicitArgsStructSize + localIdsSize; diff --git a/shared/source/kernel/local_ids_cache.cpp b/shared/source/kernel/local_ids_cache.cpp index 729e07d9d8..16c13a83f4 100644 --- a/shared/source/kernel/local_ids_cache.cpp +++ b/shared/source/kernel/local_ids_cache.cpp @@ -42,7 +42,7 @@ size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3 &group, const return static_cast(numElementsInGroup * localIdsSizePerThread); } auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); - const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfCount, false, rootDeviceEnvironment); + const auto numberOfThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simdSize, numElementsInGroup, grfCount, rootDeviceEnvironment); return static_cast(numberOfThreads * localIdsSizePerThread); } diff --git a/shared/source/os_interface/product_helper.h b/shared/source/os_interface/product_helper.h index 7b7a71a7d7..2dd644243a 100644 --- a/shared/source/os_interface/product_helper.h +++ b/shared/source/os_interface/product_helper.h @@ -257,7 +257,7 @@ class ProductHelper { virtual bool supports2DBlockStore() const = 0; virtual bool supports2DBlockLoad() const = 0; virtual uint32_t getNumCacheRegions() const = 0; - virtual uint32_t adjustMaxThreadsPerThreadGroup(uint32_t maxThreadsPerThreadGroup, uint32_t simt, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, bool isHeaplessModeEnabled) const = 0; + virtual uint32_t adjustMaxThreadsPerThreadGroup(uint32_t maxThreadsPerThreadGroup, uint32_t simt, uint32_t grfCount, bool isHeaplessModeEnabled) const = 0; virtual uint64_t getPatIndex(CacheRegion cacheRegion, CachePolicy cachePolicy) const = 0; virtual uint32_t getGmmResourceUsageOverride(uint32_t usageType) const = 0; virtual bool isSharingWith3dOrMediaAllowed() const = 0; diff --git a/shared/source/os_interface/product_helper.inl b/shared/source/os_interface/product_helper.inl index 41715fb3a6..8470fe101e 100644 --- a/shared/source/os_interface/product_helper.inl +++ b/shared/source/os_interface/product_helper.inl @@ -972,7 +972,7 @@ bool ProductHelperHw::isL3FlushAfterPostSyncRequired(bool heaplessEn } template -uint32_t ProductHelperHw::adjustMaxThreadsPerThreadGroup(uint32_t maxThreadsPerThreadGroup, uint32_t simt, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, bool isHeaplessModeEnabled) const { +uint32_t ProductHelperHw::adjustMaxThreadsPerThreadGroup(uint32_t maxThreadsPerThreadGroup, uint32_t simt, uint32_t grfCount, bool isHeaplessModeEnabled) const { return maxThreadsPerThreadGroup; } diff --git a/shared/source/os_interface/product_helper_hw.h b/shared/source/os_interface/product_helper_hw.h index cc9e5ffe4e..e115f354c6 100644 --- a/shared/source/os_interface/product_helper_hw.h +++ b/shared/source/os_interface/product_helper_hw.h @@ -194,7 +194,7 @@ class ProductHelperHw : public ProductHelper { bool supports2DBlockStore() const override; bool supports2DBlockLoad() const override; uint32_t getNumCacheRegions() const override; - uint32_t adjustMaxThreadsPerThreadGroup(uint32_t maxThreadsPerThreadGroup, uint32_t simt, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, bool isHeaplessModeEnabled) const override; + uint32_t adjustMaxThreadsPerThreadGroup(uint32_t maxThreadsPerThreadGroup, uint32_t simt, uint32_t grfCount, bool isHeaplessModeEnabled) const override; uint64_t getPatIndex(CacheRegion cacheRegion, CachePolicy cachePolicy) const override; uint32_t getGmmResourceUsageOverride(uint32_t usageType) const override; bool isSharingWith3dOrMediaAllowed() const override; diff --git a/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp index 31cf227b9f..9fbb820976 100644 --- a/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp +++ b/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp @@ -256,22 +256,25 @@ uint32_t GfxCoreHelperHw::overrideMaxWorkGroupSize(uint32_t maxWG) const } template <> -uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const { +uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const { uint32_t numThreadsPerThreadGroup = getThreadsPerWG(simd, totalWorkItems); if (debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.get() == 1) { return numThreadsPerThreadGroup; } - auto simt = isSimd1(simd) ? 32u : simd; + uint32_t maxThreadsPerThreadGroup = 32u; - if (grfCount != GrfConfig::largeGrfNumber && ((simt == 16u) || (simt == 32u && !isHwLocalIdGeneration))) { + // driver limit maxWorkgroupSize to 1024 (NEO-11881) so for simt 32 the max threads per thread group is 32 + if ((grfCount != GrfConfig::largeGrfNumber && (simd == 16u)) || isSimd1(simd)) { maxThreadsPerThreadGroup = 64u; } - return std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup); + numThreadsPerThreadGroup = std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup); + DEBUG_BREAK_IF(numThreadsPerThreadGroup * simd > CommonConstants::maxWorkgroupSize); + return numThreadsPerThreadGroup; } template <> -uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { - const uint32_t threadsPerThreadGroup = calculateNumThreadsPerThreadGroup(simd, defaultMaxGroupSize, grfCount, isHwLocalGeneration, rootDeviceEnvironment); +uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { + const uint32_t threadsPerThreadGroup = calculateNumThreadsPerThreadGroup(simd, defaultMaxGroupSize, grfCount, rootDeviceEnvironment); return (threadsPerThreadGroup * simd); } diff --git a/shared/source/xe3_core/gfx_core_helper_xe3_core.cpp b/shared/source/xe3_core/gfx_core_helper_xe3_core.cpp index 57e7725e7f..0093ae6072 100644 --- a/shared/source/xe3_core/gfx_core_helper_xe3_core.cpp +++ b/shared/source/xe3_core/gfx_core_helper_xe3_core.cpp @@ -255,28 +255,8 @@ uint32_t GfxCoreHelperHw::overrideMaxWorkGroupSize(uint32_t maxWG) const } template <> -uint32_t GfxCoreHelperHw::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const { - uint32_t numThreadsPerThreadGroup = getThreadsPerWG(simd, totalWorkItems); - if (debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.get() == 1) { - return numThreadsPerThreadGroup; - } - auto simt = isSimd1(simd) ? 32u : simd; - uint32_t maxThreadsPerThreadGroup = 32u; - if (grfCount == 512) { - maxThreadsPerThreadGroup = 16u; - } else if (grfCount == 192 && ((simt == 16u) || (simt == 32u && !isHwLocalIdGeneration))) { - maxThreadsPerThreadGroup = 40u; - } else if (grfCount == 160 && ((simt == 16u) || (simt == 32u && !isHwLocalIdGeneration))) { - maxThreadsPerThreadGroup = 48u; - } else if (grfCount <= 128 && ((simt == 16u) || (simt == 32u && !isHwLocalIdGeneration))) { - maxThreadsPerThreadGroup = 64u; - } - return std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup); -} - -template <> -uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { - const uint32_t threadsPerThreadGroup = calculateNumThreadsPerThreadGroup(simd, defaultMaxGroupSize, grfCount, isHwLocalGeneration, rootDeviceEnvironment); +uint32_t GfxCoreHelperHw::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { + const uint32_t threadsPerThreadGroup = calculateNumThreadsPerThreadGroup(simd, defaultMaxGroupSize, grfCount, rootDeviceEnvironment); return (threadsPerThreadGroup * simd); } } // namespace NEO diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index b7edf62560..a28d2ea2bc 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1650,18 +1650,16 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT constexpr auto defaultMaxGroupSize = 1024u; uint32_t simdSize = 16u; - uint32_t isHwLocalIdGeneration = true; uint32_t numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment)); simdSize = 32u; numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment)); simdSize = 16u; - isHwLocalIdGeneration = false; numGrfRequired = GrfConfig::defaultGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment)); } HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) { @@ -1679,7 +1677,7 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe }}; for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true, rootDeviceEnvironment)); + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, rootDeviceEnvironment)); } } @@ -1689,19 +1687,19 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT const auto &gfxCoreHelper = getHelper(); const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); - std::array, 8> values = {{ - {32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation - {32u, 64u, 32u, 1, 2u}, - {32u, 128u, 256u, 1, 4u}, - {32u, 1024u, 128u, 1, 32u}, - {16u, 32u, 32u, 0, 2u}, - {16u, 64u, 256u, 0, 4u}, - {16u, 128u, 128u, 0, 8u}, - {16u, 1024u, 256u, 0, 64u}, + std::array, 8> values = {{ + {32u, 32u, 128u, 1u}, // SIMT Size, totalWorkItems,Grf size, Max Num of threads + {32u, 64u, 32u, 2u}, + {32u, 128u, 256u, 4u}, + {32u, 1024u, 128u, 32u}, + {16u, 32u, 32u, 2u}, + {16u, 64u, 256u, 4u}, + {16u, 128u, 128u, 8u}, + {16u, 1024u, 256u, 64u}, }}; - for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, rootDeviceEnvironment)); + for (auto &[simtSize, totalWgSize, grfsize, expectedNumThreadsPerThreadGroup] : values) { + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, rootDeviceEnvironment)); } } diff --git a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp index b343b08e70..a64f0aee59 100644 --- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp +++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp @@ -81,7 +81,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP NEO::MockExecutionEnvironment mockExecutionEnvironment{}; auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; - auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize); + auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, rootDeviceEnvironment), MemoryConstants::cacheLineSize); EXPECT_EQ(localIdsSize + ImplicitArgsV0::getAlignedSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment)); } diff --git a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp index 2d1196e8c8..c4a83fba22 100644 --- a/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp +++ b/shared/test/unit_test/xe2_hpg_core/gfx_core_helper_tests_xe2_hpg_core.cpp @@ -807,23 +807,17 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenNumGrfAndSimdSizeWhenAdjus auto defaultMaxWorkGroupSize = 2048u; const auto &gfxCoreHelper = getHelper(); const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); - std::array, 12> values = {{ - {GrfConfig::defaultGrfNumber, 16u, 0u, 1024u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads - {GrfConfig::defaultGrfNumber, 16u, 1u, 1024u}, - {GrfConfig::defaultGrfNumber, 32u, 1u, 1024u}, - {GrfConfig::defaultGrfNumber, 32u, 0u, 2048u}, - {GrfConfig::largeGrfNumber, 16u, 0u, 512u}, - {GrfConfig::largeGrfNumber, 16u, 1u, 512u}, - {GrfConfig::largeGrfNumber, 32u, 0u, 1024u}, - {GrfConfig::largeGrfNumber, 32u, 1u, 1024u}, - {GrfConfig::defaultGrfNumber, 1u, 1u, 32u}, - {GrfConfig::defaultGrfNumber, 1u, 0u, 64u}, - {GrfConfig::largeGrfNumber, 1u, 0u, 32u}, - {GrfConfig::largeGrfNumber, 1u, 1u, 32u}, + std::array, 6> values = {{ + {GrfConfig::defaultGrfNumber, 16u, 1024u}, // Grf Size, SIMT Size, Max Num of threads + {GrfConfig::defaultGrfNumber, 32u, 1024u}, + {GrfConfig::largeGrfNumber, 16u, 512u}, + {GrfConfig::largeGrfNumber, 32u, 1024u}, + {GrfConfig::defaultGrfNumber, 1u, 64u}, + {GrfConfig::largeGrfNumber, 1u, 64u}, }}; - for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, isHwLocalIdGeneration, defaultMaxWorkGroupSize, rootDeviceEnvironment)); + for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) { + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, defaultMaxWorkGroupSize, rootDeviceEnvironment)); } } @@ -831,23 +825,17 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenParamsWhenCalculateNumThre auto &gfxCoreHelper = getHelper(); const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); auto totalWgSize = 2048u; - std::array, 12> values = {{ - {GrfConfig::defaultGrfNumber, 16u, 0u, 64u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads - {GrfConfig::defaultGrfNumber, 16u, 1u, 64u}, - {GrfConfig::defaultGrfNumber, 32u, 1u, 32u}, - {GrfConfig::defaultGrfNumber, 32u, 0u, 64u}, - {GrfConfig::defaultGrfNumber, 1u, 1u, 32u}, - {GrfConfig::defaultGrfNumber, 1u, 0u, 64u}, - {GrfConfig::largeGrfNumber, 16u, 0u, 32u}, - {GrfConfig::largeGrfNumber, 16u, 1u, 32u}, - {GrfConfig::largeGrfNumber, 32u, 0u, 32u}, - {GrfConfig::largeGrfNumber, 32u, 1u, 32u}, - {GrfConfig::largeGrfNumber, 1u, 0u, 32u}, - {GrfConfig::largeGrfNumber, 1u, 1u, 32u}, + std::array, 6> values = {{ + {GrfConfig::defaultGrfNumber, 16u, 64u}, // Grf Size, SIMT Size, Max Num of threads + {GrfConfig::defaultGrfNumber, 32u, 32u}, + {GrfConfig::defaultGrfNumber, 1u, 64u}, + {GrfConfig::largeGrfNumber, 16u, 32u}, + {GrfConfig::largeGrfNumber, 32u, 32u}, + {GrfConfig::largeGrfNumber, 1u, 64u}, }}; - for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThdreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThdreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalIdGeneration, rootDeviceEnvironment)); + for (auto &[grfSize, simtSize, expectedNumThdreadsPerThreadGroup] : values) { + EXPECT_EQ(expectedNumThdreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, rootDeviceEnvironment)); } } diff --git a/shared/test/unit_test/xe3_core/gfx_core_helper_xe3_core_tests.cpp b/shared/test/unit_test/xe3_core/gfx_core_helper_xe3_core_tests.cpp index 610e7e239a..3f158368e2 100644 --- a/shared/test/unit_test/xe3_core/gfx_core_helper_xe3_core_tests.cpp +++ b/shared/test/unit_test/xe3_core/gfx_core_helper_xe3_core_tests.cpp @@ -754,41 +754,26 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenNumGrfAndSimdSizeWhenAdjustingMax auto defaultMaxWorkGroupSize = 2048u; const auto &gfxCoreHelper = getHelper(); const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); - std::array, 30> values = {{ - {128u, 16u, 0u, 1024u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads - {128u, 16u, 1u, 1024u}, - {128u, 32u, 1u, 1024u}, - {128u, 32u, 0u, 2048u}, - {160u, 16u, 0u, 768u}, - {160u, 16u, 1u, 768u}, - {160u, 32u, 1u, 1024u}, - {160u, 32u, 0u, 1536u}, - {192u, 16u, 0u, 640u}, - {192u, 16u, 1u, 640u}, - {192u, 32u, 1u, 1024u}, - {192u, 32u, 0u, 1280u}, - {256u, 16u, 0u, 512u}, - {256u, 16u, 1u, 512u}, - {256u, 32u, 1u, 1024u}, - {256u, 32u, 0u, 1024u}, - {512u, 16u, 0u, 256u}, - {512u, 16u, 1u, 256u}, - {512u, 32u, 1u, 512u}, - {512u, 32u, 0u, 512u}, - {128u, 1u, 1u, 32u}, - {128u, 1u, 0u, 64u}, - {160u, 1u, 1u, 32u}, - {160u, 1u, 0u, 48u}, - {192u, 1u, 1u, 32u}, - {192u, 1u, 0u, 40u}, - {256u, 1u, 1u, 32u}, - {256u, 1u, 0u, 32u}, - {512u, 1u, 1u, 16u}, - {512u, 1u, 0u, 16u}, + std::array, 15> values = {{ + {128u, 16u, 1024u}, // Grf Size, SIMT Size, Max Num of threads + {128u, 32u, 1024u}, + {160u, 16u, 768u}, + {160u, 32u, 1024u}, + {192u, 16u, 640u}, + {192u, 32u, 1024u}, + {256u, 16u, 512u}, + {256u, 32u, 1024u}, + {512u, 16u, 256u}, + {512u, 32u, 512u}, + {128u, 1u, 64u}, + {160u, 1u, 48u}, + {192u, 1u, 40u}, + {256u, 1u, 32u}, + {512u, 1u, 16u}, }}; - for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, isHwLocalIdGeneration, defaultMaxWorkGroupSize, rootDeviceEnvironment)); + for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) { + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, defaultMaxWorkGroupSize, rootDeviceEnvironment)); } } @@ -801,41 +786,26 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenParamsWhenCalculateNumThreadsPerT auto &gfxCoreHelper = getHelper(); const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment(); auto totalWgSize = 2048u; - std::array, 30> values = {{ - {128u, 16u, 0u, 64u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads - {128u, 16u, 1u, 64u}, - {128u, 32u, 1u, 32u}, - {128u, 32u, 0u, 64u}, - {128u, 1u, 1u, 32u}, - {128u, 1u, 0u, 64u}, - {160u, 16u, 0u, 48u}, - {160u, 16u, 1u, 48u}, - {160u, 32u, 1u, 32u}, - {160u, 32u, 0u, 48u}, - {160u, 1u, 1u, 32u}, - {160u, 1u, 0u, 48u}, - {192u, 16u, 0u, 40u}, - {192u, 16u, 1u, 40u}, - {192u, 32u, 1u, 32u}, - {192u, 32u, 0u, 40u}, - {192u, 1u, 1u, 32u}, - {192u, 1u, 0u, 40u}, - {256u, 16u, 0u, 32u}, - {256u, 16u, 1u, 32u}, - {256u, 32u, 1u, 32u}, - {256u, 32u, 0u, 32u}, - {256u, 1u, 1u, 32u}, - {256u, 1u, 0u, 32u}, - {512u, 16u, 0u, 16u}, - {512u, 16u, 1u, 16u}, - {512u, 32u, 1u, 16u}, - {512u, 32u, 0u, 16u}, - {512u, 1u, 1u, 16u}, - {512u, 1u, 0u, 16u}, + std::array, 15> values = {{ + {128u, 16u, 64u}, // Grf Size, SIMT Size, Max Num of threads + {128u, 32u, 32u}, + {128u, 1u, 64u}, + {160u, 16u, 48u}, + {160u, 32u, 32u}, + {160u, 1u, 48u}, + {192u, 16u, 40u}, + {192u, 32u, 32u}, + {192u, 1u, 40u}, + {256u, 16u, 32u}, + {256u, 32u, 32u}, + {256u, 1u, 32u}, + {512u, 16u, 16u}, + {512u, 32u, 16u}, + {512u, 1u, 16u}, }}; - for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) { - EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalIdGeneration, rootDeviceEnvironment)); + for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) { + EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, rootDeviceEnvironment)); } }