feature: improve reporting max cooperative group count

Related-To: NEO-8210
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2024-01-22 11:57:53 +00:00 · 2024-01-22 11:57:53 +00:00 · b77e1a6a71
parent 8d56f8fb6b
commit b77e1a6a71
7 changed files with 97 additions and 5 deletions
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -8,6 +8,7 @@
 #include "level_zero/core/source/kernel/kernel_imp.h"

 #include "shared/source/assert_handler/assert_handler.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/debugger/debugger_l0.h"
 #include "shared/source/execution_environment/root_device_environment.h"
 #include "shared/source/gmm_helper/gmm_helper.h"
@@ -482,8 +483,18 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
    const uint32_t workDim = 3;
    const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};

+    uint32_t numSubDevicesForExecution = 1;
+
+    bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
+    auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
+
+    if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
+        numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
+    }
+
    *totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
                                                               descriptor,
+                                                               numSubDevicesForExecution,
                                                               usedSlmSize,
                                                               workDim,
                                                               localWorkSize,
--- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp
@@ -447,6 +447,26 @@ HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsed
    EXPECT_EQ(expected, getMaxWorkGroupCount());
 }

+HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenMultiTileWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) { // Compares the max cooperative group count for a one-bit vs. a two-bit device bitfield. NOTE(review): the "WithSimd" suffix does not describe what is asserted here - consider renaming.
+    DebugManagerStateRestore restore; // presumably restores the debug flags changed below when the test ends - confirm
+
+    neoDevice->deviceBitfield = 0b1; // one sub-device bit set: baseline configuration
+    auto baseCount = getMaxWorkGroupCount();
+
+    debugManager.flags.EnableImplicitScaling.set(1);
+    neoDevice->deviceBitfield = 0b11; // two sub-device bits set, with implicit scaling requested via the debug flag
+
+    auto countWithSubDevices = getMaxWorkGroupCount();
+
+    auto &helper = neoDevice->getGfxCoreHelper();
+
+    if (helper.singleTileExecImplicitScalingRequired(true)) { // KernelHelper::getMaxWorkGroupCount skips the sub-device multiplication in this case
+        EXPECT_EQ(baseCount, countWithSubDevices);
+    } else {
+        EXPECT_EQ(baseCount * 2, countWithSubDevices); // expects the baseline scaled by the two set bits
+    }
+}
+
 HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
    usesBarriers = 1;
    auto expected = dssCount * (maxBarrierCount / usesBarriers);
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -8,6 +8,7 @@
 #include "opencl/source/kernel/kernel.h"

 #include "shared/source/built_ins/built_ins.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/debug_settings/debug_settings_manager.h"
 #include "shared/source/execution_environment/execution_environment.h"
@@ -1132,8 +1133,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local

    auto usedSlmSize = helper.alignSlmSize(slmTotalSize);

+    uint32_t numSubDevicesForExecution = 1;
+
+    bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
+    auto deviceBitfield = commandQueue->getClDevice().getDeviceBitfield();
+
+    if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
+        numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
+    }
+
    auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
                                                                kernelInfo.kernelDescriptor,
+                                                                numSubDevicesForExecution,
                                                                usedSlmSize,
                                                                workDim,
                                                                localWorkSize,
--- a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl
+++ b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2023 Intel Corporation
+ * Copyright (C) 2020-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -83,4 +83,31 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting
    EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
 }

+TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenMultiTileWhenGettingMaxConcurrentWorkGroupCountThenCorrectValuesAreReturned) { // Compares Kernel::getMaxWorkGroupCount for a one-bit vs. a two-bit device bitfield.
+    DebugManagerStateRestore restore; // presumably restores the debug flags changed below when the test ends - confirm
+    auto &mockDevice = static_cast<MockDevice &>(pDevice->getDevice()); // cast to the mock type so deviceBitfield can be assigned directly
+
+    cl_uint workDim = 3;
+    size_t localWorkSize[] = {8, 8, 8};
+
+    const_cast<KernelInfo &>(pKernel->getKernelInfo()).kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber; // getKernelInfo() is const here, hence the const_cast to set the GRF count used by both queries
+
+    mockDevice.deviceBitfield = 0b1; // one sub-device bit set: baseline configuration
+
+    auto baseCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
+
+    debugManager.flags.EnableImplicitScaling.set(1);
+    mockDevice.deviceBitfield = 0b11; // two sub-device bits set, with implicit scaling requested via the debug flag
+
+    auto countWithSubDevices = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
+
+    auto &helper = pDevice->getGfxCoreHelper();
+
+    if (helper.singleTileExecImplicitScalingRequired(true)) { // KernelHelper::getMaxWorkGroupCount skips the sub-device multiplication in this case
+        EXPECT_EQ(baseCount, countWithSubDevices);
+    } else {
+        EXPECT_EQ(baseCount * 2, countWithSubDevices); // expects the baseline scaled by the two set bits
+    }
+}
+
 } // namespace ULT
--- a/shared/source/helpers/kernel_helpers.cpp
+++ b/shared/source/helpers/kernel_helpers.cpp
@@ -20,7 +20,7 @@

 namespace NEO {

-uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
+uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, uint32_t numSubDevices,
                                            uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
    if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
        return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
@@ -59,7 +59,13 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev
        maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
    }

-    return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
+    maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
+
+    if (!helper.singleTileExecImplicitScalingRequired(true)) {
+        maxWorkGroupsCount *= numSubDevices;
+    }
+
+    return maxWorkGroupsCount;
 }

 KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {
--- a/shared/source/helpers/kernel_helpers.h
+++ b/shared/source/helpers/kernel_helpers.h
@@ -24,7 +24,7 @@ struct KernelHelper {
        invalidKernel = 2
    };
    static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
-                                         uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
+                                         uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
    static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
        return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
    }
--- a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
+++ b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
@@ -28,6 +28,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
    uint32_t numberOfBarriers = 0;
    uint32_t workDim = 3;
    uint32_t grf = 128;
+    uint32_t numSubdevices = 1;
    size_t lws[3] = {10, 10, 10};

    void SetUp() override {
@@ -45,7 +46,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
        hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
        hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;

-        return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false);
+        return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, numSubdevices, usedSlm, workDim, lws, engineType, false);
    }

    std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
@@ -69,6 +70,22 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
    EXPECT_EQ(123u, getMaxWorkGroupCount());
 }

+TEST_F(KernelHelperMaxWorkGroupsTests, givenMultipleSubdevicesWhenCalculatingMaxWorkGroupsCountThenMultiply) { // Checks that the numSubDevices argument scales KernelHelper::getMaxWorkGroupCount.
+    auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
+
+    auto baseCount = getMaxWorkGroupCount(); // fixture default: numSubdevices == 1
+
+    numSubdevices = 4; // the fixture forwards this member to KernelHelper::getMaxWorkGroupCount
+
+    auto countWithSubdevices = getMaxWorkGroupCount();
+
+    if (helper.singleTileExecImplicitScalingRequired(true)) { // the helper skips the multiplication in this case
+        EXPECT_EQ(baseCount, countWithSubdevices);
+    } else {
+        EXPECT_EQ(baseCount * numSubdevices, countWithSubdevices);
+    }
+}
+
 TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) {
    numberOfBarriers = 0;
    auto baseCount = getMaxWorkGroupCount();