Make partitioned post sync operations for partitioned workloads

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Zbigniew Zdanowicz 2021-09-03 11:42:31 +00:00 committed by Compute-Runtime-Automation
parent 86f8150dc7
commit 6b299a3ab0
22 changed files with 359 additions and 56 deletions


@@ -216,12 +216,6 @@ struct CommandList : _ze_command_list_handle_t {
TYPE_IMMEDIATE = 1u
};
CommandQueue *cmdQImmediate = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
Device *device = nullptr;
std::vector<Kernel *> printfFunctionContainer;
virtual ze_result_t executeCommandListImmediate(bool performMigration) = 0;
virtual ze_result_t initialize(Device *device, NEO::EngineGroupType engineGroupType, ze_command_list_flags_t flags) = 0;
virtual ~CommandList();
@@ -241,33 +235,41 @@ struct CommandList : _ze_command_list_handle_t {
return commandsToPatch;
}
bool isSyncModeQueue = false;
bool commandListSLMEnabled = false;
uint32_t commandListPerThreadScratchSize = 0u;
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
bool isFlushTaskSubmissionEnabled = false;
void makeResidentAndMigrate(bool);
void migrateSharedAllocations();
std::vector<Kernel *> printfFunctionContainer;
CommandQueue *cmdQImmediate = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
Device *device = nullptr;
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
uint32_t commandListPerThreadScratchSize = 0u;
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
uint32_t partitionCount = 1;
bool isFlushTaskSubmissionEnabled = false;
bool isSyncModeQueue = false;
bool commandListSLMEnabled = false;
protected:
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
NEO::EngineGroupType engineGroupType;
ze_command_list_flags_t flags = 0u;
UnifiedMemoryControls unifiedMemoryControls;
bool indirectAllocationsAllowed = false;
bool internalUsage = false;
bool containsCooperativeKernelsFlag = false;
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
bool containsStatelessUncachedResource = false;
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
NEO::StreamProperties requiredStreamState{};
NEO::StreamProperties finalStreamState{};
CommandsToPatch commandsToPatch{};
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
ze_command_list_flags_t flags = 0u;
UnifiedMemoryControls unifiedMemoryControls;
NEO::EngineGroupType engineGroupType;
bool indirectAllocationsAllowed = false;
bool internalUsage = false;
bool containsCooperativeKernelsFlag = false;
bool containsStatelessUncachedResource = false;
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);


@@ -108,7 +108,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
}
this->ownedPrivateAllocations.clear();
partitionCount = 1;
return ZE_RESULT_SUCCESS;
}


@@ -228,6 +228,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
partitionCount,
internalUsage,
isCooperative);
this->partitionCount = std::max(partitionCount, this->partitionCount);
if (hEvent) {
auto event = Event::fromHandle(hEvent);
if (partitionCount > 1) {

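Taken together with the reset() hunk above, the per-command-list bookkeeping is simple: appending a kernel widens partitionCount to the largest value any walker used, and reset() reverts it to 1. A minimal standalone model of that rule (illustrative only, not driver code):

#include <algorithm>
#include <cstdint>

// Illustrative model of the partition-count bookkeeping added in this commit:
// a command list keeps the maximum partition count over all appended kernels,
// and reset() reverts it to the default of 1.
struct CommandListPartitionState {
    uint32_t partitionCount = 1;

    void onKernelAppended(uint32_t kernelPartitionCount) {
        partitionCount = std::max(kernelPartitionCount, partitionCount);
    }

    void onReset() {
        partitionCount = 1;
    }
};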

@@ -106,10 +106,17 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout)
timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
}
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
if (*csr->getTagAddress() < taskCountToWait) {
return ZE_RESULT_NOT_READY;
if (partitionCount > 1) {
volatile uint32_t *pollAddress = csr->getTagAddress();
for (uint32_t i = 0; i < partitionCount; i++) {
csr->waitForCompletionWithTimeout(pollAddress, enableTimeout, timeoutMicroseconds, this->taskCount);
pollAddress += addressOffsetDwords;
}
} else {
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
if (*csr->getTagAddress() < taskCountToWait) {
return ZE_RESULT_NOT_READY;
}
}
postSyncOperations();
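When a workload is split across partitions, each partition writes its own completion value, so host-side synchronization has to poll one slot per partition instead of the single tag. A sketch of the completed-check under the 8-byte stride used here (addressOffsetDwords = 2, defined on CommandQueueImp below):

#include <cstdint>

// Sketch: partition i publishes its task count at tagAddress + i * 2 dwords.
// Synchronization is complete only once every slot reaches taskCountToWait.
bool allPartitionsComplete(volatile uint32_t *tagAddress, uint32_t partitionCount,
                           uint32_t taskCountToWait) {
    constexpr uint32_t addressOffsetDwords = 2u;
    for (uint32_t i = 0; i < partitionCount; i++) {
        if (*tagAddress < taskCountToWait) {
            return false;
        }
        tagAddress += addressOffsetDwords;
    }
    return true;
}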


@@ -57,6 +57,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
protected:
NEO::PreemptionMode commandQueuePreemptionMode = NEO::PreemptionMode::Initial;
uint32_t partitionCount = 1;
bool preemptionCmdSyncProgramming = true;
bool commandQueueDebugCmdsProgrammed = false;
bool isCopyOnlyCommandQueue = false;


@@ -10,6 +10,7 @@
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/built_ins/sip.h"
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
@@ -72,6 +73,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
using MI_LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
auto lockCSR = csr->obtainUniqueOwnership();
auto anyCommandListWithCooperativeKernels = false;
@@ -177,6 +181,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
heapContainer.push_back(element);
}
}
partitionCount = std::max(partitionCount, commandList->partitionCount);
}
size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof(MI_BATCH_BUFFER_START);
@@ -240,6 +246,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
}
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
if (partitionCount > 1) {
linearStreamSizeEstimate += sizeof(MI_LOAD_REGISTER_MEM) + sizeof(MI_LOAD_REGISTER_IMM);
}
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
size_t padding = alignedSize - linearStreamSizeEstimate;
reserveLinearStreamSize(alignedSize);
@@ -399,6 +409,17 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
commandQueuePreemptionMode = statePreemption;
if (partitionCount > 1) {
uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress();
NEO::EncodeSetMMIO<GfxFamily>::encodeMEM(child,
NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
workPartitionAddress);
NEO::EncodeSetMMIO<GfxFamily>::encodeIMM(child,
NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
addressOffset,
true);
}
if (hFence) {
csr->makeResident(fence->getAllocation());
if (isCopyOnlyCommandQueue) {
@@ -407,6 +428,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(child, fence->getGpuAddress(), Fence::STATE_SIGNALED, args);
} else {
NEO::PipeControlArgs args(true);
if (partitionCount > 1) {
args.workloadPartitionOffset = true;
fence->setPartitionCount(partitionCount);
}
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
fence->getGpuAddress(),
@@ -539,6 +564,9 @@ void CommandQueueHw<gfxCoreFamily>::dispatchTaskCountWrite(NEO::LinearStream &co
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, gpuAddress, taskCountToWrite, args);
} else {
NEO::PipeControlArgs args(true);
if (partitionCount > 1) {
args.workloadPartitionOffset = true;
}
args.notifyEnable = csr->isUsedNotifyEnableForPostSync();
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStream,


@@ -62,6 +62,9 @@ struct CommandQueueImp : public CommandQueue {
MemoryConstants::cacheLineSize +
NEO::CSRequirements::csOverfetchSize;
static constexpr uint32_t addressOffsetDwords = 2u;
static constexpr uint32_t addressOffset = sizeof(uint32_t) * addressOffsetDwords;
CommandQueueImp() = delete;
CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc);
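These two constants fix the post-sync layout: two dwords, i.e. 8 bytes, between consecutive partition slots. A worked example of the resulting addresses (values illustrative):

#include <cstdint>

// With addressOffset = sizeof(uint32_t) * 2 = 8 bytes, partition i writes its
// post sync value at base + i * 8; for base 0x1000 and four partitions the
// slots are 0x1000, 0x1008, 0x1010 and 0x1018.
constexpr uint32_t addressOffsetDwords = 2u;
constexpr uint32_t addressOffset = sizeof(uint32_t) * addressOffsetDwords;

constexpr uint64_t partitionPostSyncAddress(uint64_t base, uint32_t partitionId) {
    return base + static_cast<uint64_t>(partitionId) * addressOffset;
}
static_assert(partitionPostSyncAddress(0x1000u, 3u) == 0x1018u, "8-byte stride");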


@@ -37,9 +37,15 @@ ze_result_t FenceImp::queryStatus() {
csr->downloadAllocations();
}
uint64_t *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
void *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
uint32_t queryVal = Fence::STATE_CLEARED;
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
for (uint32_t i = 0; i < partitionCount; i++) {
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), hostAddr, sizeof(uint32_t));
if (queryVal == Fence::STATE_CLEARED) {
break;
}
hostAddr = ptrOffset(hostAddr, CommandQueueImp::addressOffset);
}
return queryVal == Fence::STATE_CLEARED ? ZE_RESULT_NOT_READY : ZE_RESULT_SUCCESS;
}
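The status query now succeeds only when every partition slot has left STATE_CLEARED; the loop bails out on the first slot that has not signaled yet. A compact model of that rule (illustrative, not the driver code):

#include <cstdint>

// Model: the fence counts as signaled only if none of the partition slots
// still holds STATE_CLEARED; slots are addressOffset (8) bytes apart.
bool fenceSignaled(const uint32_t *hostAddr, uint32_t partitionCount,
                   uint32_t stateCleared) {
    constexpr uint32_t strideDwords = 2u; // addressOffset / sizeof(uint32_t)
    for (uint32_t i = 0; i < partitionCount; i++) {
        if (hostAddr[i * strideDwords] == stateCleared) {
            return false;
        }
    }
    return true;
}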


@@ -47,8 +47,13 @@ struct Fence : _ze_fence_handle_t {
return allocation->getGpuAddress();
}
void setPartitionCount(uint32_t newPartitionCount) {
partitionCount = newPartitionCount;
}
protected:
NEO::GraphicsAllocation *allocation = nullptr;
uint32_t partitionCount = 1;
};
struct FenceImp : public Fence {


@@ -27,8 +27,10 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
using BaseClass::printfFunctionContainer;
using BaseClass::submitBatchBuffer;
using BaseClass::synchronizeByPollingForTaskCount;
using BaseClass::taskCount;
using CommandQueue::commandQueuePreemptionMode;
using CommandQueue::internalUsage;
using CommandQueue::partitionCount;
WhiteBox(Device *device, NEO::CommandStreamReceiver *csr,
const ze_command_queue_desc_t *desc);
@@ -85,6 +87,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using BaseClass::commandStream;
using BaseClass::printfFunctionContainer;
using L0::CommandQueue::internalUsage;
using L0::CommandQueue::partitionCount;
using L0::CommandQueue::preemptionCmdSyncProgramming;
using L0::CommandQueueImp::csr;


@@ -845,5 +845,22 @@ HWTEST2_F(CommandListCreate, whenContainsCooperativeKernelsIsCalledThenCorrectVa
}
}
HWTEST_F(CommandListCreate, whenCommandListIsResetThenPartitionCountIsRevertedToOne) {
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily,
device,
NEO::EngineGroupType::Compute,
0u,
returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
commandList->partitionCount = 2;
returnValue = commandList->reset();
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_EQ(1u, commandList->partitionCount);
}
} // namespace ult
} // namespace L0


@@ -1529,6 +1529,12 @@ struct SynchronizeCsr : public NEO::UltCommandStreamReceiver<GfxFamily> {
tagAddress = new uint32_t;
}
bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
enableTimeoutSet = enableTimeout;
waitForComplitionCalledTimes++;
return true;
}
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
enableTimeoutSet = enableTimeout;
waitForComplitionCalledTimes++;
@@ -1623,6 +1629,32 @@ HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchro
L0::CommandQueue::fromHandle(commandQueue)->destroy();
}
HWTEST_F(CommandQueueSynchronizeTest, givenMultiplePartitionCountWhenCallingSynchronizeThenExpectTheSameNumberCsrSynchronizeCalls) {
auto csr = std::unique_ptr<SynchronizeCsr<FamilyType>>(new SynchronizeCsr<FamilyType>(*device->getNEODevice()->getExecutionEnvironment(),
device->getNEODevice()->getDeviceBitfield()));
csr->setupContext(*device->getNEODevice()->getDefaultEngine().osContext);
const ze_command_queue_desc_t desc{};
ze_result_t returnValue;
auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
device,
csr.get(),
&desc,
false,
false,
returnValue));
EXPECT_EQ(returnValue, ZE_RESULT_SUCCESS);
ASSERT_NE(nullptr, commandQueue);
commandQueue->partitionCount = 2;
uint64_t timeout = std::numeric_limits<uint64_t>::max();
commandQueue->synchronize(timeout);
EXPECT_EQ(2u, csr->waitForComplitionCalledTimes);
L0::CommandQueue::fromHandle(commandQueue)->destroy();
}
struct MemoryManagerCommandQueueCreateNegativeTest : public NEO::MockMemoryManager {
MemoryManagerCommandQueueCreateNegativeTest(NEO::ExecutionEnvironment &executionEnvironment) : NEO::MockMemoryManager(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment)) {}
NEO::GraphicsAllocation *allocateGraphicsMemoryWithProperties(const NEO::AllocationProperties &properties) override {


@@ -53,6 +53,42 @@ struct CommandQueueExecuteCommandLists : public Test<DeviceFixture> {
ze_command_list_handle_t commandLists[numCommandLists];
};
struct MultiDeviceCommandQueueExecuteCommandLists : public Test<MultiDeviceFixture> {
void SetUp() override {
DebugManager.flags.EnableWalkerPartition.set(1);
numRootDevices = 1u;
MultiDeviceFixture::SetUp();
uint32_t deviceCount = 1;
ze_device_handle_t deviceHandle;
driverHandle->getDevice(&deviceCount, &deviceHandle);
device = Device::fromHandle(deviceHandle);
ASSERT_NE(nullptr, device);
ze_result_t returnValue;
commandLists[0] = CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)->toHandle();
ASSERT_NE(nullptr, commandLists[0]);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
commandLists[1] = CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)->toHandle();
ASSERT_NE(nullptr, commandLists[1]);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
}
void TearDown() override {
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(commandLists[i]);
commandList->destroy();
}
MultiDeviceFixture::TearDown();
}
L0::Device *device = nullptr;
const static uint32_t numCommandLists = 2;
ze_command_list_handle_t commandLists[numCommandLists];
};
HWTEST_F(CommandQueueExecuteCommandLists, whenASecondLevelBatchBufferPerCommandListAddedThenProperSizeExpected) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
@@ -763,5 +799,95 @@ HWTEST_F(CommandQueueExecuteCommandListSWTagsTests, givenEnableSWTagsAndCommandL
EXPECT_TRUE(tagFound);
}
HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCountWhenExecutingCmdListThenExpectMmioProgrammingAndCorrectEstimation, IsAtLeastXeHpCore) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
using PARSE = typename FamilyType::PARSE;
ze_command_queue_desc_t desc{};
desc.ordinal = 0u;
desc.index = 0u;
desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
ze_result_t returnValue;
auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
device,
device->getNEODevice()->getDefaultEngine().commandStreamReceiver,
&desc,
false,
false,
returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ze_fence_desc_t fenceDesc{};
auto fence = whitebox_cast(Fence::create(commandQueue, &fenceDesc));
ASSERT_NE(nullptr, fence);
ze_fence_handle_t fenceHandle = fence->toHandle();
ASSERT_NE(nullptr, commandQueue->commandStream);
// 1st execute call initializes the pipeline
auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto usedSpaceBefore = commandQueue->commandStream->getUsed();
result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto usedSpaceAfter = commandQueue->commandStream->getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
size_t cmdBufferSizeWithoutMmioProgramming = usedSpaceAfter - usedSpaceBefore;
auto workPartitionAddress = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(commandLists[i]);
commandList->partitionCount = 2;
}
usedSpaceBefore = commandQueue->commandStream->getUsed();
result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
usedSpaceAfter = commandQueue->commandStream->getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
size_t cmdBufferSizeWithMmioProgramming = usedSpaceAfter - usedSpaceBefore;
size_t expectedSizeWithMmioProgramming = cmdBufferSizeWithoutMmioProgramming + sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_LOAD_REGISTER_MEM);
EXPECT_GE(expectedSizeWithMmioProgramming, cmdBufferSizeWithMmioProgramming);
GenCmdList cmdList;
ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), usedSpaceBefore), usedSpaceAfter));
auto itorLri = find<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itorLri);
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itorLrm);
auto loadRegisterImm = static_cast<MI_LOAD_REGISTER_IMM *>(*itorLri);
EXPECT_EQ(0x23B4u, loadRegisterImm->getRegisterOffset());
EXPECT_EQ(8u, loadRegisterImm->getDataDword());
auto loadRegisterMem = static_cast<MI_LOAD_REGISTER_MEM *>(*itorLrm);
EXPECT_EQ(0x221Cu, loadRegisterMem->getRegisterAddress());
EXPECT_EQ(workPartitionAddress, loadRegisterMem->getMemoryAddress());
auto pipeControlList = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint32_t foundPostSyncPipeControl = 0u;
for (size_t i = 0; i < pipeControlList.size(); i++) {
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*pipeControlList[i]);
if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
foundPostSyncPipeControl++;
}
}
EXPECT_EQ(2u, foundPostSyncPipeControl);
fence->destroy();
commandQueue->destroy();
}
} // namespace ult
} // namespace L0


@@ -405,3 +405,42 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HwHelperTestXeHPAndLater, givenHwHelperWhenGettingB
EXPECT_EQ(messageExtDescriptor.getBindlessSurfaceOffsetToPatch(), value);
EXPECT_EQ(0x200u, value);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, PipeControlHelperTestsXeHPAndLater, givenPostSyncPipeControlWhenSettingWorkloadPartitionFlagThenExpectPipeControlFlagSet) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
uint8_t buffer[128] = {};
LinearStream stream(buffer, sizeof(buffer));
HardwareInfo hardwareInfo = *defaultHwInfo;
uint64_t gpuAddress = 0xBADA550;
uint64_t data = 0xABCDEF;
PipeControlArgs args;
args.workloadPartitionOffset = true;
MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
stream,
POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
gpuAddress,
data,
hardwareInfo,
args);
GenCmdList cmdList;
FamilyType::PARSE::parseCommandBuffer(cmdList, stream.getCpuBase(), stream.getUsed());
auto pipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
bool foundPostSyncPipeControl = false;
for (size_t i = 0; i < pipeControls.size(); i++) {
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*pipeControls[i]);
if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_EQ(static_cast<uint32_t>(gpuAddress), pipeControl->getAddress());
EXPECT_EQ(data, pipeControl->getImmediateData());
EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
foundPostSyncPipeControl = true;
break;
}
}
EXPECT_TRUE(foundPostSyncPipeControl);
}


@@ -195,11 +195,13 @@ struct EncodeSetMMIO {
static const size_t sizeREG = sizeof(MI_LOAD_REGISTER_REG);
static void encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap);
static void encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address);
static void encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset);
static void encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap);
static void encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address);
static void encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset);
static bool isRemapApplicable(uint32_t offset);
static void remapOffset(MI_LOAD_REGISTER_MEM *pMiLoadReg);
static void remapOffset(MI_LOAD_REGISTER_REG *pMiLoadReg);


@@ -297,30 +297,45 @@ void EncodeMath<Family>::bitwiseOr(CommandContainer &container,
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) {
LriHelper<Family>::program(container.getCommandStream(),
EncodeSetMMIO<Family>::encodeIMM(*container.getCommandStream(), offset, data, remap);
}
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) {
EncodeSetMMIO<Family>::encodeMEM(*container.getCommandStream(), offset, address);
}
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) {
EncodeSetMMIO<Family>::encodeREG(*container.getCommandStream(), dstOffset, srcOffset);
}
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap) {
LriHelper<Family>::program(&cmdStream,
offset,
data,
remap);
}
template <typename Family>
void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) {
void EncodeSetMMIO<Family>::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address) {
MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem;
cmd.setRegisterAddress(offset);
cmd.setMemoryAddress(address);
remapOffset(&cmd);
auto buffer = container.getCommandStream()->getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
*buffer = cmd;
}
template <typename Family>
void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) {
void EncodeSetMMIO<Family>::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset) {
MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg;
cmd.setSourceRegisterAddress(srcOffset);
cmd.setDestinationRegisterAddress(dstOffset);
remapOffset(&cmd);
auto buffer = container.getCommandStream()->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_REG>();
*buffer = cmd;
}
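The CommandContainer entry points now simply forward to LinearStream overloads, so code that only holds a raw stream (such as the command queue above) can encode register loads directly. A hedged usage sketch; `stream`, `gpuAddress`, and the register offsets are illustrative assumptions, not taken from the commit:

// Usage sketch for the new overloads, given some NEO::LinearStream &stream:
NEO::EncodeSetMMIO<FamilyType>::encodeIMM(stream, 0x2600u, 0x1u, false); // MI_LOAD_REGISTER_IMM
NEO::EncodeSetMMIO<FamilyType>::encodeMEM(stream, 0x2600u, gpuAddress);  // MI_LOAD_REGISTER_MEM
NEO::EncodeSetMMIO<FamilyType>::encodeREG(stream, 0x2600u, 0x2604u);     // MI_LOAD_REGISTER_REG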
@@ -508,8 +523,8 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
return false;
}
template <typename GfxFamily>
void EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
@@ -706,12 +721,12 @@ void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(CommandContainer
*buffer = cmd;
}
template <typename GfxFamily>
void EncodeMiFlushDW<GfxFamily>::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
template <typename Family>
void EncodeMiFlushDW<Family>::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
programMiFlushDwWA(commandStream);
auto miFlushDwCmd = commandStream.getSpaceForCmd<MI_FLUSH_DW>();
MI_FLUSH_DW miFlush = GfxFamily::cmdInitMiFlushDw;
MI_FLUSH_DW miFlush = Family::cmdInitMiFlushDw;
if (args.commandWithPostSync) {
auto postSyncType = args.timeStampOperation ? MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_TIMESTAMP_REGISTER : MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD;
miFlush.setPostSyncOperation(postSyncType);
@@ -724,16 +739,16 @@ void EncodeMiFlushDW<GfxFamily>::programMiFlushDw(LinearStream &commandStream, u
*miFlushDwCmd = miFlush;
}
template <typename GfxFamily>
size_t EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() {
return sizeof(typename GfxFamily::MI_FLUSH_DW) + EncodeMiFlushDW<GfxFamily>::getMiFlushDwWaSize();
template <typename Family>
size_t EncodeMiFlushDW<Family>::getMiFlushDwCmdSizeForDataWrite() {
return sizeof(typename Family::MI_FLUSH_DW) + EncodeMiFlushDW<Family>::getMiFlushDwWaSize();
}
template <typename GfxFamily>
inline void EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {}
template <typename Family>
inline void EncodeMemoryPrefetch<Family>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {}
template <typename GfxFamily>
inline size_t EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(size_t size) { return 0u; }
template <typename Family>
inline size_t EncodeMemoryPrefetch<Family>::getSizeForMemoryPrefetch(size_t size) { return 0u; }
template <typename Family>
void EncodeMiArbCheck<Family>::program(LinearStream &commandStream) {


@@ -586,9 +586,9 @@ bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
return false;
}
template <typename GfxFamily>
void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
template <typename Family>
void EncodeSurfaceState<Family>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
Gmm *gmm = allocation ? allocation->getDefaultGmm() : nullptr;
uint32_t compressionFormat = 0;
@@ -627,7 +627,7 @@ void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *sur
surfaceState->setDisableSupportForMultiGpuPartialWrites(!!DebugManager.flags.ForceMultiGpuPartialWrites.get());
}
if (EncodeSurfaceState<GfxFamily>::isAuxModeEnabled(surfaceState, gmm)) {
if (EncodeSurfaceState<Family>::isAuxModeEnabled(surfaceState, gmm)) {
auto resourceFormat = gmm->gmmResourceInfo->getResourceFormat();
compressionFormat = gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(resourceFormat);


@@ -42,4 +42,12 @@ struct ImplicitScalingDispatch {
uint64_t workPartitionAllocationGpuVa);
};
template <typename GfxFamily>
struct PartitionRegisters {
enum {
wparidCCSOffset = 0x221C,
addressOffsetCCSOffset = 0x23B4
};
};
} // namespace NEO
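These are the CCS register offsets that executeCommandLists programs when partitionCount > 1: the workload partition ID register at 0x221C is loaded from the work partition allocation via MI_LOAD_REGISTER_MEM, and the address offset register at 0x23B4 receives the 8-byte post sync stride via MI_LOAD_REGISTER_IMM, which is exactly what the queue test above asserts. For reference, the call pattern from the queue hunk:

uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress();
NEO::EncodeSetMMIO<GfxFamily>::encodeMEM(child,
                                         NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
                                         workPartitionAddress);
NEO::EncodeSetMMIO<GfxFamily>::encodeIMM(child,
                                         NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
                                         addressOffset, // 8 bytes, CommandQueueImp::addressOffset
                                         true);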


@@ -257,6 +257,10 @@ void CommandStreamReceiver::cleanupResources() {
}
bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
return waitForCompletionWithTimeout(getTagAddress(), enableTimeout, timeoutMicroseconds, taskCountToWait);
}
bool CommandStreamReceiver::waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
std::chrono::high_resolution_clock::time_point time1, time2;
int64_t timeDiff = 0;
@@ -272,8 +276,8 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
}
time1 = std::chrono::high_resolution_clock::now();
while (*getTagAddress() < taskCountToWait && timeDiff <= timeoutMicroseconds) {
if (WaitUtils::waitFunction(getTagAddress(), taskCountToWait)) {
while (*pollAddress < taskCountToWait && timeDiff <= timeoutMicroseconds) {
if (WaitUtils::waitFunction(pollAddress, taskCountToWait)) {
break;
}
@@ -282,7 +286,8 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
timeDiff = std::chrono::duration_cast<std::chrono::microseconds>(time2 - time1).count();
}
}
if (*getTagAddress() >= taskCountToWait) {
if (*pollAddress >= taskCountToWait) {
return true;
}
return false;
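The original tag-address entry point now delegates to this poll-address overload, so the same timeout machinery can wait on any partition slot. A sketch of waiting on partition i, with csr and the timeout values assumed from the surrounding code:

// Sketch: wait on partition i's completion slot via the new overload
// (2 dwords per slot, matching CommandQueueImp::addressOffsetDwords).
volatile uint32_t *pollAddress = csr->getTagAddress() + i * 2u;
csr->waitForCompletionWithTimeout(pollAddress, enableTimeout, timeoutMicroseconds, taskCountToWait);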


@@ -158,6 +158,7 @@ class CommandStreamReceiver {
virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
MOCKABLE_VIRTUAL bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
virtual void downloadAllocations(){};
void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
@@ -292,7 +293,7 @@ class CommandStreamReceiver {
LinearStream commandStream;
// offset for debug pause state must stay beyond the tag writes; partitioned post sync writes span multiple dwords, so keep it a full cache line away to avoid overwrites
const uint64_t debugPauseStateAddressOffset = 8;
const uint64_t debugPauseStateAddressOffset = MemoryConstants::cacheLineSize;
uint64_t totalMemoryUsed = 0u;
volatile uint32_t *tagAddress = nullptr;


@@ -21,6 +21,7 @@ struct PipeControlArgsBase {
bool tlbInvalidation = false;
bool compressionControlSurfaceCcsFlush = false;
bool notifyEnable = false;
bool workloadPartitionOffset = false;
protected:
PipeControlArgsBase() = default;


@@ -186,6 +186,7 @@ template <>
void MemorySynchronizationCommands<Family>::setPipeControlExtraProperties(PIPE_CONTROL &pipeControl, PipeControlArgs &args) {
pipeControl.setHdcPipelineFlush(args.hdcPipelineFlush);
pipeControl.setCompressionControlSurfaceCcsFlush(args.compressionControlSurfaceCcsFlush);
pipeControl.setWorkloadPartitionIdOffsetEnable(args.workloadPartitionOffset);
if (DebugManager.flags.FlushAllCaches.get()) {
pipeControl.setHdcPipelineFlush(true);
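setPipeControlExtraProperties() is where the new flag reaches hardware state: it sets the PIPE_CONTROL's workload partition ID offset enable bit for the gfx family this specialization covers (the test above exercises XE_HP_CORE and later). The caller-side pattern, mirroring the queue's task count write earlier in this commit:

NEO::PipeControlArgs args(true);
if (partitionCount > 1) {
    args.workloadPartitionOffset = true; // post sync address gains the per-partition offset
}
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
    commandStream,
    POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
    gpuAddress, taskCountToWrite, hwInfo, args);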