diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 5b6b778a70..45cba094e0 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -8,6 +8,7 @@ #include "level_zero/core/source/kernel/kernel_imp.h" #include "shared/source/assert_handler/assert_handler.h" +#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/debugger/debugger_l0.h" #include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/gmm_helper/gmm_helper.h" @@ -482,8 +483,18 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount const uint32_t workDim = 3; const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]}; + uint32_t numSubDevicesForExecution = 1; + + bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment); + auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield(); + + if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) { + numSubDevicesForExecution = static_cast(deviceBitfield.count()); + } + *totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment, descriptor, + numSubDevicesForExecution, usedSlmSize, workDim, localWorkSize, diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp index 6854508ae2..671cd02bcb 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp @@ -447,6 +447,26 @@ HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsed EXPECT_EQ(expected, getMaxWorkGroupCount()); } +HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenMultiTileWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) { + DebugManagerStateRestore restore; + + neoDevice->deviceBitfield = 0b1; + auto baseCount = getMaxWorkGroupCount(); + + debugManager.flags.EnableImplicitScaling.set(1); + neoDevice->deviceBitfield = 0b11; + + auto countWithSubDevices = getMaxWorkGroupCount(); + + auto &helper = neoDevice->getGfxCoreHelper(); + + if (helper.singleTileExecImplicitScalingRequired(true)) { + EXPECT_EQ(baseCount, countWithSubDevices); + } else { + EXPECT_EQ(baseCount * 2, countWithSubDevices); + } +} + HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) { usesBarriers = 1; auto expected = dssCount * (maxBarrierCount / usesBarriers); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 8e89cab83b..e408c8318f 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -8,6 +8,7 @@ #include "opencl/source/kernel/kernel.h" #include "shared/source/built_ins/built_ins.h" +#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/execution_environment/execution_environment.h" @@ -1132,8 +1133,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local auto usedSlmSize = helper.alignSlmSize(slmTotalSize); + uint32_t numSubDevicesForExecution = 1; + + bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment); + auto deviceBitfield = commandQueue->getClDevice().getDeviceBitfield(); + + if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) { + numSubDevicesForExecution = static_cast(deviceBitfield.count()); + } + auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment, kernelInfo.kernelDescriptor, + numSubDevicesForExecution, usedSlmSize, workDim, localWorkSize, diff --git a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl index ac523819c9..e39e6a07e2 100644 --- a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl +++ b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -83,4 +83,31 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount); } +TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenMultiTileWhenGettingMaxConcurrentWorkGroupCountThenCorrectValuesAreReturned) { + DebugManagerStateRestore restore; + auto &mockDevice = static_cast(pDevice->getDevice()); + + cl_uint workDim = 3; + size_t localWorkSize[] = {8, 8, 8}; + + const_cast(pKernel->getKernelInfo()).kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber; + + mockDevice.deviceBitfield = 0b1; + + auto baseCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue); + + debugManager.flags.EnableImplicitScaling.set(1); + mockDevice.deviceBitfield = 0b11; + + auto countWithSubDevices = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue); + + auto &helper = pDevice->getGfxCoreHelper(); + + if (helper.singleTileExecImplicitScalingRequired(true)) { + EXPECT_EQ(baseCount, countWithSubDevices); + } else { + EXPECT_EQ(baseCount * 2, countWithSubDevices); + } +} + } // namespace ULT diff --git a/shared/source/helpers/kernel_helpers.cpp b/shared/source/helpers/kernel_helpers.cpp index efa41ead19..f0ea2124b6 100644 --- a/shared/source/helpers/kernel_helpers.cpp +++ b/shared/source/helpers/kernel_helpers.cpp @@ -20,7 +20,7 @@ namespace NEO { -uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, +uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) { if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) { return static_cast(debugManager.flags.OverrideMaxWorkGroupCount.get()); @@ -59,7 +59,13 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm); } - return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced); + maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced); + + if (!helper.singleTileExecImplicitScalingRequired(true)) { + maxWorkGroupsCount *= numSubDevices; + } + + return maxWorkGroupsCount; } KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) { diff --git a/shared/source/helpers/kernel_helpers.h b/shared/source/helpers/kernel_helpers.h index 5f37df6eed..fe535f38a3 100644 --- a/shared/source/helpers/kernel_helpers.h +++ b/shared/source/helpers/kernel_helpers.h @@ -24,7 +24,7 @@ struct KernelHelper { invalidKernel = 2 }; static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, - uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced); + uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced); static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) { return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch; } diff --git a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp index 2fd62b1968..a063ea9d55 100644 --- a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp +++ b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp @@ -28,6 +28,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test { uint32_t numberOfBarriers = 0; uint32_t workDim = 3; uint32_t grf = 128; + uint32_t numSubdevices = 1; size_t lws[3] = {10, 10, 10}; void SetUp() override { @@ -45,7 +46,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test { hwInfo->gtSystemInfo.DualSubSliceCount = dssCount; hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount; - return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false); + return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, numSubdevices, usedSlm, workDim, lws, engineType, false); } std::unique_ptr executionEnvironment; @@ -69,6 +70,22 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount EXPECT_EQ(123u, getMaxWorkGroupCount()); } +TEST_F(KernelHelperMaxWorkGroupsTests, givenMultipleSubdevicesWenCalculatingMaxWorkGroupsCountTenMultiply) { + auto &helper = rootDeviceEnvironment->getHelper(); + + auto baseCount = getMaxWorkGroupCount(); + + numSubdevices = 4; + + auto countWithSubdevices = getMaxWorkGroupCount(); + + if (helper.singleTileExecImplicitScalingRequired(true)) { + EXPECT_EQ(baseCount, countWithSubdevices); + } else { + EXPECT_EQ(baseCount * numSubdevices, countWithSubdevices); + } +} + TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) { numberOfBarriers = 0; auto baseCount = getMaxWorkGroupCount();