Mirror of https://github.com/intel/compute-runtime.git
Refactor and modularize walker partition code
Related-To: NEO-6244
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Committed by: Compute-Runtime-Automation
Parent: e82c2e4653
Commit: b65d8909e4
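The change replaces the long positional parameter lists of the WalkerPartition helpers with a single WalkerPartitionArgs structure. A minimal sketch of the resulting call pattern, assembled from the hunks below; the struct fields and helper names are taken from the diff, while the surrounding locals (partitionCount, tileCount, staticPartitioning, nativeCrossTileAtomicSync) are assumed to be in scope as in ImplicitScalingDispatch<GfxFamily>::getSize:

// Illustrative only: populate the new argument struct once...
WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.staticPartitioning = staticPartitioning;
// ...then the same struct drives both the size estimate and the command-buffer construction.
auto size = WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args);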
@@ -34,12 +34,55 @@ bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() {
return synchronizeBeforeExecution;
}

bool ImplicitScalingHelper::isSemaphoreProgrammingRequired() {
auto semaphoreProgrammingRequired = ImplicitScaling::semaphoreProgrammingRequired;
if (NEO::DebugManager.flags.SynchronizeWithSemaphores.get() == 1) {
semaphoreProgrammingRequired = true;
}
return semaphoreProgrammingRequired;
}

bool ImplicitScalingHelper::isCrossTileAtomicRequired() {
auto crossTileAtomicSynchronization = ImplicitScaling::crossTileAtomicSynchronization;
if (NEO::DebugManager.flags.UseCrossAtomicSynchronization.get() == 0) {
crossTileAtomicSynchronization = false;
}
return crossTileAtomicSynchronization;
}

bool ImplicitScalingHelper::useAtomicsForNativeCleanup() {
bool useAtomics = false;
int overrideUseAtomics = DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.get();
int overrideUseAtomics = DebugManager.flags.UseAtomicsForNativeSectionCleanup.get();
if (overrideUseAtomics != -1) {
useAtomics = !!(overrideUseAtomics);
}
return useAtomics;
}

bool ImplicitScalingHelper::programNativeCleanup(bool defaultNativeCleanup) {
int overrideProgramNativeCleanup = DebugManager.flags.ProgramNativeCleanup.get();
if (overrideProgramNativeCleanup != -1) {
defaultNativeCleanup = !!(overrideProgramNativeCleanup);
}
return defaultNativeCleanup;
}

bool ImplicitScalingHelper::initWparidRegister() {
bool initWparidRegister = true;
int overrideInitWparidRegister = DebugManager.flags.WparidRegisterProgramming.get();
if (overrideInitWparidRegister != -1) {
initWparidRegister = !!(overrideInitWparidRegister);
}
return initWparidRegister;
}

bool ImplicitScalingHelper::usePipeControl() {
bool usePipeControl = true;
int overrideUsePipeControl = DebugManager.flags.UsePipeControlAfterPartitionedWalker.get();
if (overrideUsePipeControl != -1) {
usePipeControl = !!(overrideUsePipeControl);
}
return usePipeControl;
}

} // namespace NEO
@@ -15,12 +15,22 @@ class LinearStream;

namespace ImplicitScaling {
extern bool apiSupport;
}
extern bool semaphoreProgrammingRequired;
extern bool crossTileAtomicSynchronization;

constexpr uint32_t partitionAddressOffsetDwords = 2u;
constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords;
} // namespace ImplicitScaling

struct ImplicitScalingHelper {
static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition);
static bool isSemaphoreProgrammingRequired();
static bool isCrossTileAtomicRequired();
static bool isSynchronizeBeforeExecutionRequired();
static bool useAtomicsForNativeCleanup();
static bool programNativeCleanup(bool defaultNativeCleanup);
static bool initWparidRegister();
static bool usePipeControl();
};

template <typename GfxFamily>
@@ -20,6 +20,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
bool staticPartitioning = false;
const uint32_t tileCount = static_cast<uint32_t>(devices.count());

const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
preferStaticPartitioning,
groupStart,
@@ -28,15 +29,21 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
&partitionType,
&staticPartitioning);
UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
WalkerPartition::WalkerPartitionArgs args = {};

auto synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(false,
16u,
synchronizeBeforeExecution,
nativeCrossTileAtomicSync,
staticPartitioning,
useAtomicsForNativeCleanup));
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.emitBatchBufferEnd = false;
args.staticPartitioning = staticPartitioning;

return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
}

template <typename GfxFamily>
@@ -54,36 +61,43 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS

bool staticPartitioning = false;
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
const bool synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();

WalkerPartition::WalkerPartitionArgs args = {};
args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.emitBatchBufferEnd = false;
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
args.staticPartitioning = staticPartitioning;

if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
&walkerCmd,
totalProgrammedSize,
partitionCount,
tileCount,
synchronizeBeforeExecution,
useSecondaryBatchBuffer,
nativeCrossTileAtomicSync,
workPartitionAllocationGpuVa,
useAtomicsForNativeCleanup);
args);
} else {
if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
if (partitionCount == 1u) {
walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
}
args.partitionCount = partitionCount;
}

WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
&walkerCmd, totalProgrammedSize,
partitionCount, tileCount,
false, synchronizeBeforeExecution, useSecondaryBatchBuffer,
nativeCrossTileAtomicSync,
useAtomicsForNativeCleanup);
&walkerCmd,
totalProgrammedSize,
args);
}
commandStream.getSpace(totalProgrammedSize);
}
@@ -18,6 +18,22 @@

namespace WalkerPartition {

struct WalkerPartitionArgs {
uint64_t workPartitionAllocationGpuVa = 0;
uint32_t partitionCount = 0;
uint32_t tileCount = 0;
bool emitBatchBufferEnd = false;
bool secondaryBatchBuffer = false;
bool synchronizeBeforeExecution = false;
bool crossTileAtomicSynchronization = false;
bool semaphoreProgrammingRequired = false;
bool staticPartitioning = false;
bool nativeCrossTileAtomicSync = false;
bool useAtomicsForNativeCleanup = false;
bool initializeWparidRegister = false;
bool usePipeControlStall = false;
};

template <typename GfxFamily>
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
template <typename GfxFamily>
@@ -73,22 +89,6 @@ Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
return commandToReturn;
}

bool inline isSemaphoreProgrammingRequired() {
auto semaphoreProgrammingRequired = false;
if (NEO::DebugManager.flags.ExperimentalSynchronizeWithSemaphores.get() == 1) {
semaphoreProgrammingRequired = true;
}
return semaphoreProgrammingRequired;
}

bool inline isCrossTileAtomicRequired() {
auto crossTileAtomicSynchronization = true;
if (NEO::DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.get() == 0) {
crossTileAtomicSynchronization = false;
}
return crossTileAtomicSynchronization;
}

template <typename GfxFamily>
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
bool preferStaticPartitioning,
@@ -349,25 +349,55 @@ void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProg
*storeDataImmediate = cmd;
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
return sizeof(MI_ATOMIC<GfxFamily>);
} else {
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
}
}

template <typename GfxFamily>
void programNativeCrossTileSyncControl(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t finalSyncTileCountField,
uint64_t address,
bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
programMiAtomic<GfxFamily>(inputAddress,
totalBytesProgrammed,
finalSyncTileCountField,
address,
false,
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
} else {
programStoreMemImmediateDword<GfxFamily>(inputAddress,
totalBytesProgrammed,
finalSyncTileCountField,
address,
0u);
}
}

template <typename GfxFamily>
uint64_t computeTilesSynchronizationWithAtomicsSectionSize() {
return sizeof(MI_ATOMIC<GfxFamily>) +
sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
}

template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
uint32_t &totalBytesProgrammed,
uint64_t atomicAddress,
uint32_t tileCount) {
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}

template <typename GfxFamily>
void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t &totalBytesProgrammed,
@@ -377,28 +407,18 @@ void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t tileCount,
bool useAtomicsForNativeCleanup) {
// Synchronize tiles, so the fields are not cleared while still in use
programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount);

for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
if (useAtomicsForNativeCleanup) {
programMiAtomic<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
false,
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
} else {
programStoreMemImmediateDword<GfxFamily>(inputAddress,
programNativeCrossTileSyncControl<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
0u);
}
useAtomicsForNativeCleanup);
}

//this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount);
}

template <typename GfxFamily>
@@ -412,15 +432,6 @@ void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
}
}

template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
uint32_t &totalBytesProgrammed,
uint64_t atomicAddress,
uint32_t tileCount) {
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

template <typename GfxFamily>
uint64_t computeWalkerSectionSize() {
return sizeof(BATCH_BUFFER_START<GfxFamily>) +
@@ -428,46 +439,30 @@ uint64_t computeWalkerSectionSize() {
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
return sizeof(MI_ATOMIC<GfxFamily>);
} else {
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
uint64_t size = 0u;

size += args.synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() : 0;
size += sizeof(LOAD_REGISTER_IMM<GfxFamily>); //predication mask
size += sizeof(MI_ATOMIC<GfxFamily>); //current id for partition
size += sizeof(LOAD_REGISTER_REG<GfxFamily>); //id into register
size += sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
size += (args.semaphoreProgrammingRequired ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount : 0u);
size += computeWalkerSectionSize<GfxFamily>();
size += args.usePipeControlStall ? sizeof(PIPE_CONTROL<GfxFamily>) : 0u;
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
2 * sizeof(MI_ATOMIC<GfxFamily>) +
2 * sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
}

template <typename GfxFamily>
uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
auto synchronizationCount = (synchronizeBeforeExecution) ? 2u : 1u;
if (!isCrossTileAtomicRequired() && !nativeCrossTileAtomicSync) {
synchronizationCount--;
if (args.nativeCrossTileAtomicSync) {
size += computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup);
}

return sizeof(LOAD_REGISTER_IMM<GfxFamily>) +
sizeof(MI_ATOMIC<GfxFamily>) * (1u + synchronizationCount) +
sizeof(LOAD_REGISTER_REG<GfxFamily>) +
sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
sizeof(BATCH_BUFFER_START<GfxFamily>) * 2 +
sizeof(PIPE_CONTROL<GfxFamily>) +
sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * synchronizationCount +
(isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u) +
computeWalkerSectionSize<GfxFamily>() +
(nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u);
return size;
}

template <typename GfxFamily>
uint64_t computeWalkerSectionStart(uint32_t partitionCount,
bool synchronizeBeforeExecution,
bool nativeCrossTileAtomicSync,
bool useAtomicsForNativeCleanup) {
return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup) -
uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
return computeControlSectionOffset<GfxFamily>(args) -
computeWalkerSectionSize<GfxFamily>();
}
@@ -537,26 +532,17 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
uint32_t &totalBytesProgrammed,
uint32_t partitionCount,
uint32_t tileCount,
bool emitBatchBufferEnd,
bool synchronizeBeforeExecution,
bool secondaryBatchBuffer,
bool nativeCrossTileAtomicSync,
bool useAtomicsForNativeCleanup) {
WalkerPartitionArgs &args) {
totalBytesProgrammed = 0u;
void *currentBatchBufferPointer = cpuPointer;

auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
if (synchronizeBeforeExecution) {
auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(args);
if (args.synchronizeBeforeExecution) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

//if all tiles hit the atomic, it means we may go further
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}

programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, partitionCount);
programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);

programMiAtomic<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
@@ -573,36 +559,32 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
gpuAddressOfAllocation +
computeWalkerSectionStart<GfxFamily>(partitionCount,
synchronizeBeforeExecution,
nativeCrossTileAtomicSync,
useAtomicsForNativeCleanup),
computeWalkerSectionStart<GfxFamily>(args),
true,
secondaryBatchBuffer);
args.secondaryBatchBuffer);

//disable predication to not noop subsequent commands.
programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);

if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
}

programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
if (args.usePipeControlStall) {
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
}

if (isSemaphoreProgrammingRequired()) {
if (args.semaphoreProgrammingRequired) {
auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
}
}

if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

//if all tiles hit the atomic, it means we may go further
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}

//this bb start goes to the end of partitioned command buffer
@@ -611,12 +593,12 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed,
gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
false,
secondaryBatchBuffer);
args.secondaryBatchBuffer);

//Walker section
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);

programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, secondaryBatchBuffer);
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);

auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
controlSection->partitionCount = 0u;
@@ -626,18 +608,18 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed += sizeof(BatchBufferControlData);
currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));

if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
dynamicPartitioningFieldsForCleanupCount,
tileCount,
useAtomicsForNativeCleanup);
args.tileCount,
args.useAtomicsForNativeCleanup);
}

if (emitBatchBufferEnd) {
if (args.emitBatchBufferEnd) {
auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
*batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
}
@@ -651,14 +633,28 @@ struct StaticPartitioningControlSection {
static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;

template <typename GfxFamily>
uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
const auto beforeExecutionSyncAtomicSize = synchronizeBeforeExecution ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
const auto afterExecutionSyncAtomicSize = (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
const auto afterExecutionSyncPostSyncSize = isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u;
const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u;
uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) {
const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
: 0u;
const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync)
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
: 0u;
const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount
: 0u;
const auto nativeCrossTileSyncSize = args.nativeCrossTileAtomicSync
? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup)
: 0u;
const auto wparidRegisterSize = args.initializeWparidRegister
? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
: 0u;
const auto pipeControlSize = args.usePipeControlStall
? sizeof(PIPE_CONTROL<GfxFamily>)
: 0u;
return beforeExecutionSyncAtomicSize +
sizeof(LOAD_REGISTER_MEM<GfxFamily>) +
sizeof(PIPE_CONTROL<GfxFamily>) +
wparidRegisterSize +
pipeControlSize +
sizeof(COMPUTE_WALKER<GfxFamily>) +
nativeCrossTileSyncSize +
afterExecutionSyncAtomicSize +
@@ -671,49 +667,48 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
uint32_t &totalBytesProgrammed,
uint32_t partitionCount,
uint32_t tileCount,
bool synchronizeBeforeExecution,
bool secondaryBatchBuffer,
bool nativeCrossTileAtomicSync,
uint64_t workPartitionAllocationGpuVa,
bool useAtomicsForNativeCleanup) {
WalkerPartitionArgs &args) {
totalBytesProgrammed = 0u;
void *currentBatchBufferPointer = cpuPointer;

// Get address of the control section
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);

// Synchronize tiles before walker
if (synchronizeBeforeExecution) {
if (args.synchronizeBeforeExecution) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}

// Load partition ID to wparid register and execute walker
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, workPartitionAllocationGpuVa, wparidCCSOffset);
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
if (args.initializeWparidRegister) {
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
}
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);

// Prepare for cleanup section
if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
}

programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
if (args.usePipeControlStall) {
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
}

// Synchronize tiles after walker
if (isSemaphoreProgrammingRequired()) {
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
if (args.semaphoreProgrammingRequired) {
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {

if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}

// Jump over the control section
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, secondaryBatchBuffer);
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);

// Control section
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
@@ -724,35 +719,31 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);

// Cleanup section
if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
tileCount,
useAtomicsForNativeCleanup);
args.tileCount,
args.useAtomicsForNativeCleanup);
}
}

template <typename GfxFamily>
uint64_t estimateSpaceRequiredInCommandBuffer(bool requiresBatchBufferEnd,
uint32_t partitionCount,
bool synchronizeBeforeExecution,
bool nativeCrossTileAtomicSync,
bool staticPartitioning,
bool useAtomicsForNativeCleanup) {
uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {

uint64_t size = {};
if (staticPartitioning) {
size += computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
if (args.staticPartitioning) {
size += computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
size += sizeof(StaticPartitioningControlSection);
size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
} else {
size += computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
size += computeControlSectionOffset<GfxFamily>(args);
size += sizeof(BatchBufferControlData);
size += requiresBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
}
return size;
}
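For orientation, a before/after sketch of one call site (the statically partitioned path in ImplicitScalingDispatch<GfxFamily>::dispatchCommands); the "before" argument order follows the removed signature above and the "after" call matches the new one, while the local variable names are only illustrative:

// Before: every option travelled as its own positional parameter.
// WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(cpuPointer, gpuAddress, &walkerCmd, totalProgrammedSize,
//     partitionCount, tileCount, synchronizeBeforeExecution, useSecondaryBatchBuffer,
//     nativeCrossTileAtomicSync, workPartitionAllocationGpuVa, useAtomicsForNativeCleanup);

// After: the options travel in WalkerPartitionArgs, so adding a toggle no longer touches every signature.
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(cpuPointer, gpuAddress, &walkerCmd, totalProgrammedSize, args);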
@@ -123,15 +123,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFENumberOfWalkers, -1, "Set Number of Walkers i
DECLARE_DEBUG_VARIABLE(int32_t, CFEMaximumNumberOfThreads, -1, "Set Maximum Number of Threads in CFE_STATE on XEHP, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch Control in CFE_STATE on XEHP, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distribution")
DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
DECLARE_DEBUG_VARIABLE(int32_t, ForceWorkgroupSize1x1x1, -1, "-1: default, 0: disable, 1: enable, force workgroup size 1x1x1 in builtins")
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSynchronizeWithSemaphores, -1, "Experimental implementation: 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalForceCrossAtomicSynchronization, -1, "Experimental implementation: 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
DECLARE_DEBUG_VARIABLE(int32_t, DisablePipeControlPrecedingPostSyncCommand, -1, "-1 default - disabled adding PIPE_CONTROL, 0 - disabled adding PIPE_CONTROL, 1 - enabled adding PIPE_CONTROL")
DECLARE_DEBUG_VARIABLE(int32_t, UseCachingPolicyForIndirectObjectHeap, -1, "Use selected caching policy for IOH, -1 - default, 0 - Uncached, 1 - L3 Caching, 2 - L1 Caching")
@@ -142,13 +138,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuPartialWrites, -1, "-1: default - 0
DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomicsInComputeMode, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics bit in STATE_COMPUTE_MODE, 1: program value 1 in MultiGpuAtomics bit in STATE_COMPUTE_MODE")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomics, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics controls 1: program value 1 in MultiGpuAtomics controls")
DECLARE_DEBUG_VARIABLE(int32_t, ForceBufferCompressionFormat, -1, "-1: default, >0: Format value")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHwGenerationLocalIds, -1, "-1: default, 0: disable, 1: enable : Enables generation of local ids on HW")
DECLARE_DEBUG_VARIABLE(int32_t, WalkerPartitionPreferHighestDimension, -1, "-1: default, 0: prefer biggest dimension, 1: prefer Z over Y over X if they divide partition count evenly")
DECLARE_DEBUG_VARIABLE(int32_t, SetMinimalPartitionSize, -1, "-1 default value set to 512 workgroups, 0 - disabled, >0 - minimal partition size in workgroups (should be power of 2)")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterTargetMemory, -1, "-1:default 0: overwrites to System 1: overwrites to Local")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterMocs, -1, "-1: default, >=0 SetGivenMocsInBlitterTransfers")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, OverridePostSyncMocs, -1, "-1: default, >=0 Override post sync mocs with value")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateVmBindExt, -1, "Use immediate bind extension to a new residency model on Linux (requires kernel support), -1: default (enabled with direct submission), 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, ForceExecutionTile, -1, "-1: default, 0+: given tile is chosen as submission, must be used with EnableWalkerPartition = 0.")
@@ -237,11 +231,21 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Ove
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands")
DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmissionController, -1, "Enable direct submission terminating after given timeout, -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerTimeout, -1, "Set direct submission controller timeout, -1: default 5 ms, >=0: timeout in ms")

/* IMPLICIT SCALING */
DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distribution")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWithSemaphores, -1, "-1: default (disabled), 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
DECLARE_DEBUG_VARIABLE(int32_t, UseCrossAtomicSynchronization, -1, "-1: default (enabled), 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, UseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramNativeCleanup, -1, "-1: default (API dependent), 0: Do not program native cleanup, 1: program native cleanup")
DECLARE_DEBUG_VARIABLE(int32_t, WparidRegisterProgramming, -1, "-1: default (enabled), 0: do not program wparid register, 1: programming wparid register")
DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlAfterPartitionedWalker, -1, "-1: default (enabled), 0: do not add PipeControl, 1: add PipeControl")

/*FEATURE FLAGS*/
DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(bool, EnableNV12, true, "Enables NV12 extension")
DECLARE_DEBUG_VARIABLE(bool, EnablePackedYuv, true, "Enables cl_packed_yuv extension")
DECLARE_DEBUG_VARIABLE(bool, EnableDeferredDeleter, true, "Enables async deleter")
@@ -305,8 +309,9 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicPipelineSelect, -1, "set SYSTOLI
DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOLIC MODE ENABLE in COMPUTE_WALKER cmd, -1:default, 0:disable, 1:enable")

/*EXPERIMENTAL TOGGLES*/
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalUseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")

/*DRIVER TOGGLES*/
DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")
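All of the relocated implicit-scaling flags above follow the same three-state convention that the new ImplicitScalingHelper methods implement in the first hunk of this commit: -1 keeps the built-in default, 0 forces the behavior off, 1 forces it on. A minimal restatement of that override pattern (illustrative helper, not part of the commit):

// -1 keeps the compiled-in default; any other value overrides it (0 = off, non-zero = on).
bool resolveToggle(int32_t debugValue, bool defaultValue) {
    if (debugValue != -1) {
        return debugValue != 0;
    }
    return defaultValue;
}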
@@ -1026,7 +1026,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
uint32_t expectedPartitionSize = (dims[0] + partitionCount - 1u) / partitionCount;
EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize());

auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(partitionCount, false, true, false);
WalkerPartition::WalkerPartitionArgs args = {};
args.initializeWparidRegister = true;
args.usePipeControlStall = true;
args.partitionCount = partitionCount;
args.nativeCrossTileAtomicSync = true;

auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(args);
uint64_t expectedCleanupGpuVa = cmdContainer->getCommandStream()->getGraphicsAllocation()->getGpuAddress() +
cleanupSectionOffset;
constexpr uint32_t expectedData = 0ull;
@@ -51,11 +51,85 @@ TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForNativeCle
}

TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) {
DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(0);
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}

TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectTrue) {
DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(1);
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsIsFalseWhenCheckingProgramNativeCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(false));
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramNativeCleanupThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(true));
}

TEST_F(ImplicitScalingTests, givenForceNotProgramNativeCleanupWhenDefaultNativeCleanupIsTrueThenExpectFalse) {
DebugManager.flags.ProgramNativeCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(true));
}

TEST_F(ImplicitScalingTests, givenForceProgramNativeCleanupWhenDefaultNativeCleanupIsFalseThenExpectTrue) {
DebugManager.flags.ProgramNativeCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(false));
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToProgramWparidRegisterThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenForceNotProgramWparidRegisterWhenCheckingRegisterProgramThenExpectFalse) {
DebugManager.flags.WparidRegisterProgramming.set(0);
EXPECT_FALSE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenForceProgramWparidRegisterWhenCheckingRegisterProgramThenExpectTrue) {
DebugManager.flags.WparidRegisterProgramming.set(1);
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToUsePipeControlThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenForceNotUsePipeControlWhenCheckingPipeControlUseThenExpectFalse) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
EXPECT_FALSE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenForceUsePipeControlWhenCheckingPipeControlUseThenExpectTrue) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(1);
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingSemaphoreUseThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenForceSemaphoreNotUseWhenCheckingSemaphoreUseThenExpectFalse) {
DebugManager.flags.SynchronizeWithSemaphores.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenForceSemaphoreUseWhenCheckingSemaphoreUseThenExpectTrue) {
DebugManager.flags.SynchronizeWithSemaphores.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

TEST_F(ImplicitScalingTests, givenForceDisableWhenCheckingCrossTileAtomicSyncThenExpectFalse) {
DebugManager.flags.UseCrossAtomicSynchronization.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

TEST_F(ImplicitScalingTests, givenForceEnableWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
DebugManager.flags.UseCrossAtomicSynchronization.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}
@@ -232,3 +232,111 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
ASSERT_NE(itorLrm, loadRegisterMemList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledWparidRegisterThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;

DebugManager.flags.WparidRegisterProgramming.set(0);

uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList loadRegisterMemList = hwParser.getCommandsList<MI_LOAD_REGISTER_MEM>();
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
EXPECT_EQ(itorLrm, loadRegisterMemList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);

uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);

uint64_t workPartitionAllocationAddress = 0x0;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(32);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}