From 914939c377f9825ada4853ade94a78e513edfc45 Mon Sep 17 00:00:00 2001
From: "Naklicki, Mateusz" <mateusz.naklicki@intel.com>
Date: Tue, 15 Nov 2022 13:48:45 +0000
Subject: [PATCH] Fix execution of cooperative kernels on multi-tile device

Add a flag for forcing execution of kernels on a single tile
Force cooperative kernels to use only a single tile

Related-to: NEO-6729
Signed-off-by: Naklicki, Mateusz <mateusz.naklicki@intel.com>
---
 .../test_cmdlist_append_launch_kernel_2.cpp   |  8 +-
 opencl/source/command_queue/enqueue_common.h  |  2 +-
 .../gpgpu_walker_xehp_and_later.inl           |  3 +-
 .../hardware_interface_xehp_and_later.inl     |  3 +-
 opencl/source/helpers/task_information.cpp    |  2 +-
 .../dispatch_walker_tests_xehp_and_later.cpp  | 89 ++++++++++++++-----
 .../command_encoder_xehp_and_later.inl        |  4 +-
 .../command_container/implicit_scaling.h      |  1 +
 .../implicit_scaling_xehp_and_later.inl       |  9 +-
 .../walker_partition_interface.h              |  1 +
 .../walker_partition_xehp_and_later.h         | 13 ++-
 ..._encode_dispatch_kernel_xehp_and_later.cpp | 30 +++++++
 .../test_implicit_scaling_xehp_and_later.cpp  | 39 ++++----
 ...alker_partition_tests_xehp_and_later_2.cpp | 39 ++++++--
 .../fixtures/implicit_scaling_fixture.h       |  1 +
 15 files changed, 182 insertions(+), 62 deletions(-)
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
index 6afef1a221..09f141a9ae 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
@@ -1417,8 +1417,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListAppendLaunchKernelXeHpCoreTest,
     EXPECT_EQ(4u, commandList->partitionCount);
 }
 
-HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenDoNotUseImplicitScaling, IsAtLeastXeHpCore) {
-    ze_group_count_t groupCount{1, 1, 1};
+HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenSetProperPartitionSize, IsAtLeastXeHpCore) {
+    ze_group_count_t groupCount{16, 1, 1};
 
     auto commandListWithNonCooperativeKernel = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
     auto result = commandListWithNonCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -1434,6 +1434,7 @@ HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKe
     auto itorWalker = find<typename FamilyType::WALKER_TYPE *>(cmdList.begin(), cmdList.end());
     auto cmd = genCmdCast<typename FamilyType::WALKER_TYPE *>(*itorWalker);
     EXPECT_TRUE(cmd->getWorkloadPartitionEnable());
+    EXPECT_EQ(4u, cmd->getPartitionSize());
 
     auto commandListWithCooperativeKernel = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
     result = commandListWithCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -1445,11 +1446,12 @@ HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKe
     sizeAfter = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed();
     cmdList.clear();
     ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
-        cmdList, ptrOffset(commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore));
+        cmdList, ptrOffset(commandListWithCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore));
 
     itorWalker = find<typename FamilyType::WALKER_TYPE *>(cmdList.begin(), cmdList.end());
     cmd = genCmdCast<typename FamilyType::WALKER_TYPE *>(*itorWalker);
     EXPECT_TRUE(cmd->getWorkloadPartitionEnable());
+    EXPECT_EQ(16u, cmd->getPartitionSize());
 }
 
 HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest,
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index ee7a3313b4..eabab3dc5f 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -799,7 +799,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
         !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(),             // outOfOrderExecutionAllowed
         false,                                                                                                  // epilogueRequired
         false,                                                                                                  // usePerDssBackedBuffer
-        kernel->isSingleSubdevicePreferred(),                                                                   // useSingleSubdevice
+        false,                                                                                                  // useSingleSubdevice
         useGlobalAtomics,                                                                                       // useGlobalAtomics
         kernel->areMultipleSubDevicesInContext(),                                                               // areMultipleSubDevicesInContext
         kernel->requiresMemoryMigration(),                                                                      // memoryMigrationRequired
diff --git a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl
index 7e9fcd8a8d..ce2fe1b7e9 100644
--- a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl
+++ b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl
@@ -138,8 +138,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
                   HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                   EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.KernelHeapSize, commandQueue.getDevice().getHardwareInfo());
     auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield();
-    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices,
-                                                                           !pKernel->isSingleSubdevicePreferred());
+    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
     if (partitionWalker) {
         Vec3<size_t> groupStart = dispatchInfo.getStartOfWorkgroups();
         Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();
diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
index 94861869fa..53814c65c2 100644
--- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
+++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
@@ -122,7 +122,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
     EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs);
 
     auto devices = queueCsr.getOsContext().getDeviceBitfield();
-    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());
+    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
 
     if (partitionWalker) {
         const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
@@ -135,6 +135,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
                                                              false,
                                                              kernel.usesImages(),
                                                              queueCsr.getDcFlushSupport(),
+                                                             kernel.isSingleSubdevicePreferred(),
                                                              workPartitionAllocationGpuVa,
                                                              hwInfo);
         if (queueCsr.isStaticWorkPartitioningEnabled()) {
diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp
index 282c24366d..6fc4095f10 100644
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -205,7 +205,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
         commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(),      // outOfOrderExecutionAllowed
         false,                                                                            // epilogueRequired
         false,                                                                            // usePerDssBackedBuffer
-        kernel->isSingleSubdevicePreferred(),                                             // useSingleSubdevice
+        false,                                                                            // useSingleSubdevice
         kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, // useGlobalAtomics
         kernel->areMultipleSubDevicesInContext(),                                         // areMultipleSubDevicesInContext
         kernel->requiresMemoryMigration(),                                                // memoryMigrationRequired
diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
index ba62c06cdd..c08b3e1bac 100644
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
@@ -1319,29 +1319,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWal
     EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
 }
 
-HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenPartitioningIsNotUsed) {
-    if (!OSInterface::osEnableLocalMemory) {
-        GTEST_SKIP();
-    }
-
-    struct SingleSubdeviceKernel : public MockKernel {
-        using MockKernel::MockKernel;
-        bool isSingleSubdevicePreferred() const override { return true; }
-    };
-
-    auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
-    size_t gws[] = {2, 1, 1};
-    size_t lws[] = {1, 1, 1};
-    SingleSubdeviceKernel subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device);
-    cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
-
-    ClHardwareParse hwParser;
-    hwParser.parseCommands<FamilyType>(*cmdQ);
-    auto computeWalker = reinterpret_cast<typename FamilyType::COMPUTE_WALKER *>(hwParser.cmdWalker);
-    ASSERT_NE(nullptr, computeWalker);
-    EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType());
-}
-
 HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithPartitionLogicDisabledThenWalkerPartitionLogicIsNotExecuted) {
     if (!OSInterface::osEnableLocalMemory) {
         GTEST_SKIP();
@@ -1914,3 +1891,69 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenSimdSize1TWhenCheckToGener
     EXPECT_TRUE(EncodeDispatchKernel<FamilyType>::isRuntimeLocalIdsGenerationRequired(
         workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd));
 }
+
+struct XeHPAndLaterDispatchWalkerTestMultiTileDevice : public XeHPAndLaterDispatchWalkerBasicTest {
+    void SetUp() override {
+        DebugManager.flags.CreateMultipleSubDevices.set(2u);
+
+        XeHPAndLaterDispatchWalkerBasicTest::SetUp();
+    }
+    void TearDown() override {
+        XeHPAndLaterDispatchWalkerBasicTest::TearDown();
+    }
+};
+
+struct KernelWithSingleSubdevicePreferences : public MockKernel {
+    using MockKernel::MockKernel;
+    bool isSingleSubdevicePreferred() const override { return singleSubdevicePreferred; }
+
+    bool singleSubdevicePreferred = true;
+};
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnSingleTile) {
+    if (!OSInterface::osEnableLocalMemory) {
+        GTEST_SKIP();
+    }
+
+    auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    size_t gws[] = {2, 1, 1};
+    size_t lws[] = {1, 1, 1};
+    auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver();
+    if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
+        commandStreamReceiver.createPreemptionAllocation();
+    }
+    KernelWithSingleSubdevicePreferences subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device);
+    subdeviceKernel.singleSubdevicePreferred = true;
+    cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
+
+    ClHardwareParse hwParser;
+    hwParser.parseCommands<FamilyType>(*cmdQ);
+    auto computeWalker = reinterpret_cast<typename FamilyType::COMPUTE_WALKER *>(hwParser.cmdWalker);
+    ASSERT_NE(nullptr, computeWalker);
+    EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
+    EXPECT_EQ(2u, computeWalker->getPartitionSize());
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatDoesntPreferSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnAllTiles) {
+    if (!OSInterface::osEnableLocalMemory) {
+        GTEST_SKIP();
+    }
+
+    auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    size_t gws[] = {2, 1, 1};
+    size_t lws[] = {1, 1, 1};
+    auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver();
+    if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
+        commandStreamReceiver.createPreemptionAllocation();
+    }
+    KernelWithSingleSubdevicePreferences subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device);
+    subdeviceKernel.singleSubdevicePreferred = false;
+    cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
+
+    ClHardwareParse hwParser;
+    hwParser.parseCommands<FamilyType>(*cmdQ);
+    auto computeWalker = reinterpret_cast<typename FamilyType::COMPUTE_WALKER *>(hwParser.cmdWalker);
+    ASSERT_NE(nullptr, computeWalker);
+    EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
+    EXPECT_EQ(1u, computeWalker->getPartitionSize());
+}
\ No newline at end of file
diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl
index fff61a5181..20063b0cf1 100644
--- a/shared/source/command_container/command_encoder_xehp_and_later.inl
+++ b/shared/source/command_container/command_encoder_xehp_and_later.inl
@@ -301,8 +301,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 
     PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
 
-    if ((args.partitionCount > 1 && !args.isCooperative) &&
-        !args.isInternal) {
+    if (args.partitionCount > 1 && !args.isInternal) {
         const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
         if (args.eventAddress != 0) {
             postSync.setOperation(POSTSYNC_DATA::OPERATION_WRITE_TIMESTAMP);
@@ -315,6 +314,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
                                                           !args.isKernelDispatchedFromImmediateCmdList,
                                                           false,
                                                           args.dcFlushEnable,
+                                                          args.isCooperative,
                                                           workPartitionAllocationGpuVa,
                                                           hwInfo);
     } else {
diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h
index 4c2ccb1b86..8547c5a2eb 100644
--- a/shared/source/command_container/implicit_scaling.h
+++ b/shared/source/command_container/implicit_scaling.h
@@ -53,6 +53,7 @@ struct ImplicitScalingDispatch {
                                  bool apiSelfCleanup,
                                  bool usesImages,
                                  bool dcFlush,
+                                 bool forceExecutionOnSingleTile,
                                  uint64_t workPartitionAllocationGpuVa,
                                  const HardwareInfo &hwInfo);
 
diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl
index 084af527ab..58a1f356a5 100644
--- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl
+++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl
@@ -22,7 +22,8 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar
                                                                 bool preferStaticPartitioning,
                                                                 bool staticPartitioning,
                                                                 bool useSecondaryBatchBuffer,
-                                                                bool dcFlush) {
+                                                                bool dcFlush,
+                                                                bool forceExecutionOnSingleTile) {
     WalkerPartition::WalkerPartitionArgs args = {};
 
     args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
@@ -30,6 +31,7 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar
     args.tileCount = tileCount;
     args.staticPartitioning = staticPartitioning;
     args.preferredStaticPartitioning = preferStaticPartitioning;
+    args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
 
     args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
     args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired();
@@ -76,6 +78,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
                                                                                       preferStaticPartitioning,
                                                                                       staticPartitioning,
                                                                                       false,
+                                                                                      false,
                                                                                       false);
 
     return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
@@ -90,6 +93,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
                                                           bool apiSelfCleanup,
                                                           bool usesImages,
                                                           bool dcFlush,
+                                                          bool forceExecutionOnSingleTile,
                                                           uint64_t workPartitionAllocationGpuVa,
                                                           const HardwareInfo &hwInfo) {
     uint32_t totalProgrammedSize = 0u;
@@ -106,7 +110,8 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
                                                                                       preferStaticPartitioning,
                                                                                       staticPartitioning,
                                                                                       useSecondaryBatchBuffer,
-                                                                                      dcFlush);
+                                                                                      dcFlush,
+                                                                                      forceExecutionOnSingleTile);
 
     auto dispatchCommandsSize = getSize(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
     void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
diff --git a/shared/source/command_container/walker_partition_interface.h b/shared/source/command_container/walker_partition_interface.h
index 673d969cdf..3de29af39c 100644
--- a/shared/source/command_container/walker_partition_interface.h
+++ b/shared/source/command_container/walker_partition_interface.h
@@ -31,6 +31,7 @@ struct WalkerPartitionArgs {
     bool usePostSync = false;
     bool pipeControlBeforeCleanupCrossTileSync = false;
     bool dcFlushEnable = false;
+    bool forceExecutionOnSingleTile = false;
 };
 
 constexpr uint32_t wparidCCSOffset = 0x221C;
diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h
index 5a28af787e..e1182571bd 100644
--- a/shared/source/command_container/walker_partition_xehp_and_later.h
+++ b/shared/source/command_container/walker_partition_xehp_and_later.h
@@ -480,7 +480,8 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
 template <typename GfxFamily>
 void programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
                               COMPUTE_WALKER<GfxFamily> *inputWalker,
-                              uint32_t partitionCount) {
+                              uint32_t partitionCount,
+                              bool forceExecutionOnSingleTile) {
     auto computeWalker = putCommand<COMPUTE_WALKER<GfxFamily>>(inputAddress, totalBytesProgrammed);
     COMPUTE_WALKER<GfxFamily> cmd = *inputWalker;
 
@@ -503,7 +504,11 @@ void programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramme
             workgroupCount = inputWalker->getThreadGroupIdZDimension();
         }
 
-        cmd.setPartitionSize((workgroupCount + partitionCount - 1u) / partitionCount);
+        if (forceExecutionOnSingleTile) {
+            cmd.setPartitionSize(workgroupCount);
+        } else {
+            cmd.setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount));
+        }
     }
     *computeWalker = cmd;
 }
@@ -614,7 +619,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
         args.secondaryBatchBuffer);
 
     // Walker section
-    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
+    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile);
 
     programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);
 
@@ -704,7 +709,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
     if (args.initializeWparidRegister) {
         programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
     }
-    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
+    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile);
 
     // Prepare for cleanup section
     if (args.emitSelfCleanup) {
diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
index 8292f6321a..46e770bb43 100644
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
@@ -1026,6 +1026,36 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesImplicitScaling,
     EXPECT_EQ(eventAddress, postSync.getDestinationAddress());
 }
 
+HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesImplicitScaling, givenCooperativeKernelWhenEncodingDispatchKernelThenExpectPartitionSizeEqualWorkgroupSize) {
+    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+    using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+
+    uint32_t dims[] = {16, 1, 1};
+    std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
+
+    bool requiresUncachedMocs = false;
+    bool isInternal = false;
+    bool isCooperative = true;
+
+    EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
+    dispatchArgs.isInternal = isInternal;
+    dispatchArgs.isCooperative = isCooperative;
+    dispatchArgs.partitionCount = 2;
+    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs, nullptr);
+
+    size_t containerUsedAfterBase = cmdContainer->getCommandStream()->getUsed();
+
+    GenCmdList partitionedWalkerList;
+    CmdParse<FamilyType>::parseCommandBuffer(partitionedWalkerList, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), containerUsedAfterBase);
+    auto itor = find<WALKER_TYPE *>(partitionedWalkerList.begin(), partitionedWalkerList.end());
+    ASSERT_NE(itor, partitionedWalkerList.end());
+
+    auto partitionWalkerCmd = genCmdCast<WALKER_TYPE *>(*itor);
+    EXPECT_EQ(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X, partitionWalkerCmd->getPartitionType());
+    uint32_t expectedPartitionSize = dims[0];
+    EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize());
+}
+
 struct CommandEncodeStatesDynamicImplicitScalingFixture : CommandEncodeStatesImplicitScalingFixture {
     void setUp() {
         DebugManager.flags.EnableStaticPartitioning.set(0);
diff --git a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp
index 1a1300ac56..ec44e063f9 100644
--- a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp
+++ b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp
@@ -30,7 +30,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCm
     expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
 
     uint32_t partitionCount = 0;
-    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, 0u, *defaultHwInfo);
+    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
+                                                          forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(2u, partitionCount);
@@ -72,7 +73,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndNoPartiti
     expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
 
     uint32_t partitionCount = 0;
-    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, false, false, false, dcFlushFlag, 0u, *defaultHwInfo);
+    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, false, false, false, dcFlushFlag,
+                                                          forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(1u, partitionCount);
@@ -115,7 +117,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndPartition
     expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
 
     uint32_t partitionCount = 0;
-    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag, 0u, *defaultHwInfo);
+    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
+                                                          forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(1u, partitionCount);
@@ -162,7 +165,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(2u, partitionCount);
@@ -214,7 +217,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenPa
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(2u, partitionCount);
@@ -268,7 +271,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -319,7 +322,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -356,7 +359,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -393,7 +396,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -441,7 +444,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -509,7 +512,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -569,7 +572,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -629,7 +632,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -696,7 +699,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -759,7 +762,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -824,7 +827,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -892,7 +895,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
@@ -959,7 +962,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
 
     uint32_t partitionCount = 0;
     ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, dcFlushFlag,
-                                                          workPartitionAllocationAddress, *defaultHwInfo);
+                                                          forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
     totalBytesProgrammed = commandStream.getUsed();
     EXPECT_EQ(expectedSize, totalBytesProgrammed);
     EXPECT_EQ(twoTile.count(), partitionCount);
diff --git a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp
index ec8b89fb13..8212b02f14 100644
--- a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp
+++ b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_2.cpp
@@ -400,7 +400,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
 
     walker.setPartitionType(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_X);
     void *walkerCommandAddress = cmdBufferAddress;
-    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u);
+    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, false);
     auto walkerCommand = genCmdCast<COMPUTE_WALKER<FamilyType> *>(walkerCommandAddress);
 
     ASSERT_NE(nullptr, walkerCommand);
@@ -411,7 +411,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
 
     walker.setPartitionType(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_Y);
     walkerCommandAddress = cmdBufferAddress;
-    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u);
+    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, false);
     walkerCommand = genCmdCast<COMPUTE_WALKER<FamilyType> *>(walkerCommandAddress);
 
     ASSERT_NE(nullptr, walkerCommand);
@@ -420,7 +420,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
 
     walker.setPartitionType(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_Z);
     walkerCommandAddress = cmdBufferAddress;
-    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u);
+    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, false);
     walkerCommand = genCmdCast<COMPUTE_WALKER<FamilyType> *>(walkerCommandAddress);
 
     ASSERT_NE(nullptr, walkerCommand);
@@ -430,7 +430,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
     //if we program with partition Count == 1 then do not trigger partition stuff
     walker.setPartitionType(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
     walkerCommandAddress = cmdBufferAddress;
-    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u);
+    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, false);
     walkerCommand = genCmdCast<COMPUTE_WALKER<FamilyType> *>(walkerCommandAddress);
 
     ASSERT_NE(nullptr, walkerCommand);
@@ -506,7 +506,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithDifferentWorkg
     EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType());
 }
 
-HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDisalbedMinimalPartitionSizeWhenCoomputePartitionSizeThenProperValueIsReturned) {
+HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDisabledMinimalPartitionSizeWhenComputePartitionSizeThenProperValueIsReturned) {
     WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
     walker = FamilyType::cmdInitGpgpuWalker;
     walker.setThreadGroupIdXDimension(64u);
@@ -1672,3 +1672,32 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenBarrierProgrammingWhenEm
 
     EXPECT_EQ(parsedOffset, expectedCommandUsedSize);
 }
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTileWhenProgramComputeWalkerThenWalkerIsProperlyProgrammed) {
+    WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
+    walker = FamilyType::cmdInitGpgpuWalker;
+    walker.setThreadGroupIdXDimension(32u); // 32 thread groups along X, the dimension selected for partitioning below
+    walker.setThreadGroupIdYDimension(1u);
+    walker.setThreadGroupIdZDimension(1u);
+
+    bool forceExecutionOnSingleTile = false; // first pass: flag off, partition count 2
+    walker.setPartitionType(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_X);
+    void *walkerCommandAddress = cmdBufferAddress; // address at which the programmed walker is expected to be found
+    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, forceExecutionOnSingleTile);
+    auto walkerCommand = genCmdCast<COMPUTE_WALKER<FamilyType> *>(walkerCommandAddress);
+
+    ASSERT_NE(nullptr, walkerCommand);
+    EXPECT_TRUE(walkerCommand->getWorkloadPartitionEnable());
+    EXPECT_EQ(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_X, walkerCommand->getPartitionType());
+    EXPECT_EQ(16u, walkerCommand->getPartitionSize()); // 32 X groups split across partition count 2
+
+    forceExecutionOnSingleTile = true; // second pass: same walker and partition count, flag on
+    walkerCommandAddress = cmdBufferAddress; // NOTE(review): assumes the previous call advanced cmdBufferAddress past the first walker - confirm
+    programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, forceExecutionOnSingleTile);
+    walkerCommand = genCmdCast<COMPUTE_WALKER<FamilyType> *>(walkerCommandAddress);
+
+    ASSERT_NE(nullptr, walkerCommand);
+    EXPECT_TRUE(walkerCommand->getWorkloadPartitionEnable()); // partitioning is still expected to be reported as enabled
+    EXPECT_EQ(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_X, walkerCommand->getPartitionType());
+    EXPECT_EQ(32u, walkerCommand->getPartitionSize()); // partition size now equals the full X dimension (32)
+}
diff --git a/shared/test/unit_test/fixtures/implicit_scaling_fixture.h b/shared/test/unit_test/fixtures/implicit_scaling_fixture.h
index 420c364794..2a70023087 100644
--- a/shared/test/unit_test/fixtures/implicit_scaling_fixture.h
+++ b/shared/test/unit_test/fixtures/implicit_scaling_fixture.h
@@ -34,6 +34,7 @@ struct ImplicitScalingFixture : public CommandEncodeStatesFixture {
     DeviceBitfield twoTile;
     void *alignedMemory = nullptr;
     bool dcFlushFlag = false;
+    bool forceExecutionOnSingleTileFlag = false;
 };
 
 using ImplicitScalingTests = Test<ImplicitScalingFixture>;