diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp
index 5b6b778a70..45cba094e0 100644
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -8,6 +8,7 @@
 #include "level_zero/core/source/kernel/kernel_imp.h"
 
 #include "shared/source/assert_handler/assert_handler.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/debugger/debugger_l0.h"
 #include "shared/source/execution_environment/root_device_environment.h"
 #include "shared/source/gmm_helper/gmm_helper.h"
@@ -482,8 +483,18 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
     const uint32_t workDim = 3;
     const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
 
+    uint32_t numSubDevicesForExecution = 1;
+
+    bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
+    auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
+
+    if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
+        numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
+    }
+
     *totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
                                                                descriptor,
+                                                               numSubDevicesForExecution,
                                                                usedSlmSize,
                                                                workDim,
                                                                localWorkSize,
diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp
index 6854508ae2..671cd02bcb 100644
--- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel_2.cpp
@@ -447,6 +447,26 @@ HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsed
     EXPECT_EQ(expected, getMaxWorkGroupCount());
 }
 
+HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenMultiTileWhenCalculatingMaxCooperativeGroupCountThenResultIsScaledBySubDeviceCount) {
+    DebugManagerStateRestore restore;
+    // Baseline: query the count with a single bit set in the device bitfield.
+    neoDevice->deviceBitfield = 0b1;
+    auto baseCount = getMaxWorkGroupCount();
+    // Set the EnableImplicitScaling debug flag and expose two bits in the device bitfield.
+    debugManager.flags.EnableImplicitScaling.set(1);
+    neoDevice->deviceBitfield = 0b11;
+
+    auto countWithSubDevices = getMaxWorkGroupCount();
+
+    auto &helper = neoDevice->getGfxCoreHelper();
+    // The count is expected to double for 0b11 unless singleTileExecImplicitScalingRequired(true) holds.
+    if (helper.singleTileExecImplicitScalingRequired(true)) {
+        EXPECT_EQ(baseCount, countWithSubDevices);
+    } else {
+        EXPECT_EQ(baseCount * 2, countWithSubDevices);
+    }
+}
+
 HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
     usesBarriers = 1;
     auto expected = dssCount * (maxBarrierCount / usesBarriers);
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 8e89cab83b..e408c8318f 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -8,6 +8,7 @@
 #include "opencl/source/kernel/kernel.h"
 
 #include "shared/source/built_ins/built_ins.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/debug_settings/debug_settings_manager.h"
 #include "shared/source/execution_environment/execution_environment.h"
@@ -1132,8 +1133,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
 
     auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
 
+    uint32_t numSubDevicesForExecution = 1;
+
+    bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
+    auto deviceBitfield = commandQueue->getClDevice().getDeviceBitfield();
+
+    if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
+        numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
+    }
+
     auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
                                                                 kernelInfo.kernelDescriptor,
+                                                                numSubDevicesForExecution,
                                                                 usedSlmSize,
                                                                 workDim,
                                                                 localWorkSize,
diff --git a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl
index ac523819c9..e39e6a07e2 100644
--- a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl
+++ b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2023 Intel Corporation
+ * Copyright (C) 2020-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -83,4 +83,31 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting
     EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
 }
 
+TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenMultiTileWhenGettingMaxConcurrentWorkGroupCountThenCorrectValuesAreReturned) {
+    DebugManagerStateRestore restore;
+    auto &mockDevice = static_cast<MockDevice &>(pDevice->getDevice());
+
+    // Three-dimensional local work size used for both queries below.
+    cl_uint workDim = 3;
+    size_t localWorkSize[] = {8, 8, 8};
+
+    auto &kernelInfo = const_cast<KernelInfo &>(pKernel->getKernelInfo());
+    kernelInfo.kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber;
+
+    // Reference value: a single bit set in the device bitfield.
+    mockDevice.deviceBitfield = 0b1;
+    const auto singleTileCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
+
+    // Same query with the EnableImplicitScaling debug flag set and two bits in the bitfield.
+    debugManager.flags.EnableImplicitScaling.set(1);
+    mockDevice.deviceBitfield = 0b11;
+    const auto multiTileCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
+
+    // Doubling is expected unless singleTileExecImplicitScalingRequired(true) holds.
+    auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
+    const bool singleTileOnly = gfxCoreHelper.singleTileExecImplicitScalingRequired(true);
+    const auto expectedCount = singleTileOnly ? singleTileCount : singleTileCount * 2;
+    EXPECT_EQ(expectedCount, multiTileCount);
+}
+
 } // namespace ULT
diff --git a/shared/source/helpers/kernel_helpers.cpp b/shared/source/helpers/kernel_helpers.cpp
index efa41ead19..f0ea2124b6 100644
--- a/shared/source/helpers/kernel_helpers.cpp
+++ b/shared/source/helpers/kernel_helpers.cpp
@@ -20,7 +20,7 @@
 
 namespace NEO {
 
-uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
+uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, uint32_t numSubDevices,
                                             uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
     if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
         return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
@@ -59,7 +59,13 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev
         maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
     }
 
-    return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
+    maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
+
+    if (!helper.singleTileExecImplicitScalingRequired(true)) {
+        maxWorkGroupsCount *= numSubDevices;
+    }
+
+    return maxWorkGroupsCount;
 }
 
 KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {
diff --git a/shared/source/helpers/kernel_helpers.h b/shared/source/helpers/kernel_helpers.h
index 5f37df6eed..fe535f38a3 100644
--- a/shared/source/helpers/kernel_helpers.h
+++ b/shared/source/helpers/kernel_helpers.h
@@ -24,7 +24,7 @@ struct KernelHelper {
         invalidKernel = 2
     };
     static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
-                                         uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
+                                         uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
     static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
         return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
     }
diff --git a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
index 2fd62b1968..a063ea9d55 100644
--- a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
+++ b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
@@ -28,6 +28,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
     uint32_t numberOfBarriers = 0;
     uint32_t workDim = 3;
     uint32_t grf = 128;
+    uint32_t numSubdevices = 1;
     size_t lws[3] = {10, 10, 10};
 
     void SetUp() override {
@@ -45,7 +46,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
         hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
         hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;
 
-        return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false);
+        return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, numSubdevices, usedSlm, workDim, lws, engineType, false);
     }
 
     std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
@@ -69,6 +70,22 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
     EXPECT_EQ(123u, getMaxWorkGroupCount());
 }
 
+TEST_F(KernelHelperMaxWorkGroupsTests, GivenMultipleSubdevicesWhenCalculatingMaxWorkGroupsCountThenMultiply) {
+    auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
+    // Baseline computed with the fixture default of numSubdevices == 1.
+    auto baseCount = getMaxWorkGroupCount();
+
+    numSubdevices = 4;
+
+    auto countWithSubdevices = getMaxWorkGroupCount();
+    // The sub-device count multiplies the result unless singleTileExecImplicitScalingRequired(true) holds.
+    if (helper.singleTileExecImplicitScalingRequired(true)) {
+        EXPECT_EQ(baseCount, countWithSubdevices);
+    } else {
+        EXPECT_EQ(baseCount * numSubdevices, countWithSubdevices);
+    }
+}
+
 TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) {
     numberOfBarriers = 0;
     auto baseCount = getMaxWorkGroupCount();