feature(ocl) use tags to synchronize multi root device events

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2023-01-12 09:59:50 +00:00 · 2023-01-12 09:59:50 +00:00 · 16bc84e27d
parent fecb52ac49
commit 16bc84e27d
59 changed files with 644 additions and 193 deletions
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@ -391,7 +391,8 @@ class CommandQueueHw : public CommandQueue {
                        EventsRequest &eventsRequest,
                        EventBuilder &externalEventBuilder,
                        std::unique_ptr<PrintfHandler> &&printfHandler,
-                        CommandStreamReceiver *bcsCsr);
+                        CommandStreamReceiver *bcsCsr,
+                        TagNodeBase *multiRootDeviceSyncNode);

    CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces,
                                                size_t surfaceCount,
@ -422,7 +423,7 @@ class CommandQueueHw : public CommandQueue {
                                                 TimestampPacketDependencies &timestampPacketDependencies,
                                                 const EventsRequest &eventsRequest,
                                                 LinearStream *commandStream,
-                                                 uint32_t commandType, bool queueBlocked);
+                                                 uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync);
    void submitCacheFlush(Surface **surfaces,
                          size_t numSurfaces,
                          LinearStream *commandStream,
@ -433,6 +434,8 @@ class CommandQueueHw : public CommandQueue {
    bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override;

    MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const;
+    void processSignalMultiRootDeviceNode(LinearStream *commandStream,
+                                          TagNodeBase *node);

  protected:
    MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
@ -473,7 +476,7 @@ class CommandQueueHw : public CommandQueue {
            blockedCommandsData = std::make_unique<KernelOperation>(commandStream, *gpgpuCsr.getInternalAllocationStorage());
        } else {
            commandStream = &getCommandStream<GfxFamily, commandType>(*this, csrDependencies, profilingRequired, perfCountersRequired,
-                                                                      blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0);
+                                                                      blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0, eventsRequest.outEvent);
        }
        return commandStream;
    }
--- a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl
+++ b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2022 Intel Corporation
+ * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -45,7 +45,7 @@ bool CommandQueueHw<Family>::isCacheFlushCommand(uint32_t commandType) const {
 }

 template <>
-LinearStream &getCommandStream<Family, CL_COMMAND_RESOURCE_BARRIER>(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) {
+LinearStream &getCommandStream<Family, CL_COMMAND_RESOURCE_BARRIER>(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) {
    size_t expectedSizeCS = 0;
    [[maybe_unused]] bool usePostSync = false;
    if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@ -177,7 +177,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    BlitPropertiesContainer blitPropertiesContainer;

    if (this->context->getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, computeCommandStreamReceiver);
    }

    const bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
@ -226,7 +226,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    }

    if (this->context->getRootDeviceIndices().size() > 1) {
-        TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
+        TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<GfxFamily>(commandStream, csrDeps);
    }

    if (enqueueWithBlitAuxTranslation) {
@ -280,6 +280,17 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    } else if (isMarkerWithPostSyncWrite) {
        processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps);
    }
+    TagNodeBase *multiRootEventSyncStamp = nullptr;
+    if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
+        multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode();
+        if (!blockQueue) {
+            this->getGpgpuCommandStreamReceiver().makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation());
+        }
+        processSignalMultiRootDeviceNode(&commandStream, multiRootEventSyncStamp);
+        if (CL_COMMAND_MARKER == commandType) {
+            flushDependenciesForNonKernelCommand = true;
+        }
+    }

    CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
    const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
@ -382,7 +393,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                       eventsRequest,
                       eventBuilder,
                       std::move(printfHandler),
-                       nullptr);
+                       nullptr,
+                       multiRootEventSyncStamp);
    }

    if (deferredTimestampPackets.get()) {
@ -497,7 +509,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
                                                                        const MultiDispatchInfo &multiDispatchInfo,
                                                                        TimestampPacketDependencies &timestampPacketDependencies,
                                                                        const EventsRequest &eventsRequest, LinearStream *commandStream,
-                                                                        uint32_t commandType, bool queueBlocked) {
+                                                                        uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync) {
    auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);

    auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver,
@ -510,7 +522,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
        blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
        blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
    }
-
+    blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync;
    auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
    blitProperties.outputTimestampPacket = currentTimestampPacketNode;

@ -616,7 +628,20 @@ void CommandQueueHw<GfxFamily>::processDispatchForMarker(CommandQueue &commandQu
    HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
    getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation());
 }
-
+template <typename GfxFamily>
+void CommandQueueHw<GfxFamily>::processSignalMultiRootDeviceNode(LinearStream *commandStream,
+                                                                 TagNodeBase *node) {
+    const auto &hwInfo = getDevice().getHardwareInfo();
+    PipeControlArgs args;
+    args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, hwInfo);
+    MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
+        *commandStream,
+        PostSyncMode::ImmediateData,
+        node->getGpuAddress() + node->getContextEndOffset(),
+        std::numeric_limits<uint64_t>::max(),
+        hwInfo,
+        args);
+}
 template <typename GfxFamily>
 void CommandQueueHw<GfxFamily>::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue,
                                                                            LinearStream *commandStream,
@ -901,7 +926,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
    EventsRequest &eventsRequest,
    EventBuilder &externalEventBuilder,
    std::unique_ptr<PrintfHandler> &&printfHandler,
-    CommandStreamReceiver *bcsCsr) {
+    CommandStreamReceiver *bcsCsr,
+    TagNodeBase *multiRootDeviceSyncNode) {

    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

@ -972,7 +998,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
                                                         std::move(printfHandler),
                                                         preemptionMode,
                                                         multiDispatchInfo.peekMainKernel(),
-                                                         (uint32_t)multiDispatchInfo.size());
+                                                         (uint32_t)multiDispatchInfo.size(),
+                                                         multiRootDeviceSyncNode);
    }
    if (storeTimestampPackets) {
        command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
@ -1274,10 +1301,14 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
    }

    TimestampPacketDependencies timestampPacketDependencies;
+    TagNodeBase *multiRootEventSyncStamp = nullptr;
    BlitPropertiesContainer blitPropertiesContainer;
    CsrDependencies csrDeps;

    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
+    if (this->context->getRootDeviceIndices().size() > 1) {
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, bcsCsr);
+    }
    auto allocator = bcsCsr.getTimestampPacketAllocator();

    if (!blockQueue) {
@ -1304,6 +1335,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
    if (eventBuilder.getEvent()) {
        eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
    }
+    if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
+        multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode();
+        bcsCsr.makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation());
+    }

    CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};

@ -1320,7 +1355,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
    }

    blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                    eventsRequest, gpgpuCommandStream, cmdType, blockQueue));
+                                                                    eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp));

    if (!blockQueue) {
        completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
@ -1347,7 +1382,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
    updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());

    if (blockQueue) {
-        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
+        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);

        if (gpgpuSubmission) {
            if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@ -88,7 +88,7 @@ class GpgpuWalkerHelper {
 template <typename GfxFamily>
 struct EnqueueOperation {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
-    static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList);
+    static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent);
    static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo);
    static size_t getSizeRequiredForTimestampPacketWrite();
    static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue);
@ -101,8 +101,8 @@ struct EnqueueOperation {
 template <typename GfxFamily, uint32_t eventType>
 LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace,
                               bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo,
-                               Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) {
-    size_t expectedSizeCS = EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList);
+                               Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) {
+    size_t expectedSizeCS = EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList, outEvent);
    return commandQueue.getCS(expectedSizeCS);
 }

--- a/opencl/source/command_queue/gpgpu_walker_base.inl
+++ b/opencl/source/command_queue/gpgpu_walker_base.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -165,7 +165,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(cons
 }

 template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist) {
+size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, cl_event *outEvent) {
    size_t expectedSizeCS = 0;
    auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
    auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper();
@ -218,8 +218,14 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
    if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) {
        expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
    }
-
-    expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(csrDeps);
+    expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(csrDeps);
+    if (outEvent) {
+        auto pEvent = castToObjectOrAbort<Event>(*outEvent);
+        if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) {
+            expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(hwInfo, false);
+        }
+    }
+    expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);

    return expectedSizeCS;
 }
--- a/opencl/source/command_queue/hardware_interface.h
+++ b/opencl/source/command_queue/hardware_interface.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -31,6 +31,7 @@ struct HardwareInterfaceWalkerArgs {
    size_t localWorkSizes[3] = {};
    TagNodeBase *hwTimeStamps = nullptr;
    TagNodeBase *hwPerfCounter = nullptr;
+    TagNodeBase *multiRootDeviceEventStamp = nullptr;
    TimestampPacketDependencies *timestampPacketDependencies = nullptr;
    TimestampPacketContainer *currentTimestampPacketNodes = nullptr;
    const Vec3<size_t> *numberOfWorkgroups = nullptr;
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -133,6 +133,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
                                    walkerArgs.currentTimestampPacketNodes);

    walkerArgs.currentDispatchIndex = 0;
+
    for (auto &dispatchInfo : multiDispatchInfo) {
        dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
        walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);
--- a/opencl/source/context/context.cpp
+++ b/opencl/source/context/context.cpp
@ -49,7 +49,9 @@ Context::Context(

 Context::~Context() {
    gtpinNotifyContextDestroy((cl_context)this);
-
+    if (multiRootDeviceTimestampPacketAllocator.get() != nullptr) {
+        multiRootDeviceTimestampPacketAllocator.reset();
+    }
    if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled(this)) {
        smallBufferPoolAllocator.releaseSmallBufferPool();
    }
@ -558,5 +560,15 @@ void Context::BufferPoolAllocator::releaseSmallBufferPool() {
    delete this->mainStorage;
    this->mainStorage = nullptr;
 }
+TagAllocatorBase *Context::getMultiRootDeviceTimestampPacketAllocator() {
+    return multiRootDeviceTimestampPacketAllocator.get();
+}
+void Context::setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr<TagAllocatorBase> &allocator) {
+    multiRootDeviceTimestampPacketAllocator = std::move(allocator);
+}
+
+std::unique_lock<std::mutex> Context::obtainOwnershipForMultiRootDeviceAllocator() {
+    return std::unique_lock<std::mutex>(multiRootDeviceAllocatorMtx);
+}

 } // namespace NEO
--- a/opencl/source/context/context.h
+++ b/opencl/source/context/context.h
@ -37,6 +37,7 @@ class SharingFunctions;
 class SVMAllocsManager;
 class Program;
 class Platform;
+class TagAllocatorBase;

 template <>
 struct OpenCLObjectMapper<_cl_context> {
@ -223,6 +224,9 @@ class Context : public BaseObject<_cl_context> {
    BufferPoolAllocator &getBufferPoolAllocator() {
        return this->smallBufferPoolAllocator;
    }
+    TagAllocatorBase *getMultiRootDeviceTimestampPacketAllocator();
+    std::unique_lock<std::mutex> obtainOwnershipForMultiRootDeviceAllocator();
+    void setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr<TagAllocatorBase> &allocator);

  protected:
    struct BuiltInKernel {
@ -263,6 +267,8 @@ class Context : public BaseObject<_cl_context> {
    uint32_t maxRootDeviceIndex = std::numeric_limits<uint32_t>::max();
    cl_bool preferD3dSharedResources = 0u;
    ContextType contextType = ContextType::CONTEXT_TYPE_DEFAULT;
+    std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator;
+    std::mutex multiRootDeviceAllocatorMtx;

    bool interopUserSync = false;
    bool resolvesRequiredInKernels = false;
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@ -131,6 +131,9 @@ Event::~Event() {
        if (timeStampNode != nullptr) {
            timeStampNode->returnTag();
        }
+        if (multiRootTimeStampSyncNode != nullptr) {
+            multiRootTimeStampSyncNode->returnTag();
+        }
        if (perfCounterNode != nullptr) {
            cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->getQueryHandleRef());
            perfCounterNode->getQueryHandleRef() = {};
@ -883,7 +886,6 @@ TagNodeBase *Event::getHwTimeStampNode() {
 }

 TagNodeBase *Event::getHwPerfCounterNode() {
-
    if (!perfCounterNode && cmdQueue->getPerfCounters()) {
        const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters()));
        perfCounterNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventPerfCountAllocator(gpuReportSize)->getTag();
@ -891,11 +893,27 @@ TagNodeBase *Event::getHwPerfCounterNode() {
    return perfCounterNode;
 }

+TagNodeBase *Event::getMultiRootTimestampSyncNode() {
+    auto lock = getContext()->obtainOwnershipForMultiRootDeviceAllocator();
+    if (getContext()->getMultiRootDeviceTimestampPacketAllocator() == nullptr) {
+        auto allocator = cmdQueue->getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(getContext()->getRootDeviceIndices());
+        getContext()->setMultiRootDeviceTimestampPacketAllocator(allocator);
+    }
+    lock.unlock();
+    if (multiRootDeviceTimestampPacketContainer.get() == nullptr) {
+        multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    }
+    multiRootTimeStampSyncNode = getContext()->getMultiRootDeviceTimestampPacketAllocator()->getTag();
+    multiRootDeviceTimestampPacketContainer->add(multiRootTimeStampSyncNode);
+    return multiRootTimeStampSyncNode;
+}
+
 void Event::addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer) {
    timestampPacketContainer->assignAndIncrementNodesRefCounts(inputTimestampPacketContainer);
 }

 TimestampPacketContainer *Event::getTimestampPacketNodes() const { return timestampPacketContainer.get(); }
+TimestampPacketContainer *Event::getMultiRootDeviceTimestampPacketNodes() const { return multiRootDeviceTimestampPacketContainer.get(); }

 bool Event::checkUserEventDependencies(cl_uint numEventsInWaitList, const cl_event *eventWaitList) {
    bool userEventsDependencies = false;
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@ -115,6 +115,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {

    void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer);
    TimestampPacketContainer *getTimestampPacketNodes() const;
+    TimestampPacketContainer *getMultiRootDeviceTimestampPacketNodes() const;

    bool isPerfCountersEnabled() const {
        return perfCountersEnabled;
@ -129,6 +130,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
    }

    TagNodeBase *getHwPerfCounterNode();
+    TagNodeBase *getMultiRootTimestampSyncNode();

    std::unique_ptr<FlushStampTracker> flushStamp;
    std::atomic<TaskCountType> taskLevel;
@ -384,8 +386,10 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
    bool perfCountersEnabled;
    TagNodeBase *timeStampNode = nullptr;
    TagNodeBase *perfCounterNode = nullptr;
+    TagNodeBase *multiRootTimeStampSyncNode = nullptr;
    std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
    // number of events this event depends on
+    std::unique_ptr<TimestampPacketContainer> multiRootDeviceTimestampPacketContainer;
    std::atomic<int> parentCount;
    // event parents
    std::vector<Event *> parentEvents;
--- a/opencl/source/gen11/command_queue_gen11.cpp
+++ b/opencl/source/gen11/command_queue_gen11.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 Intel Corporation
+ * Copyright (C) 2019-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/source/gen12lp/command_queue_gen12lp.cpp
+++ b/opencl/source/gen12lp/command_queue_gen12lp.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 Intel Corporation
+ * Copyright (C) 2019-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/source/gen8/command_queue_gen8.cpp
+++ b/opencl/source/gen8/command_queue_gen8.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/source/gen9/command_queue_gen9.cpp
+++ b/opencl/source/gen9/command_queue_gen9.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/source/helpers/properties_helper.cpp
+++ b/opencl/source/helpers/properties_helper.cpp
@ -26,7 +26,6 @@ namespace NEO {
 void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
    auto csrOwnership = dependentCsr.obtainUniqueOwnership();
    dependentCsr.updateTagFromWait();
-    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
 }

 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
@ -60,6 +59,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
                if (productHelper.isDcFlushAllowed()) {
                    if (!dependentCsr.isLatestTaskCountFlushed()) {
                        flushDependentCsr(dependentCsr, csrDeps);
+                        // csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
                        currentCsr.makeResident(*dependentCsr.getTagAllocation());
                    }
                }
@ -68,23 +68,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
    }
 }

-void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
+void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
    for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
        auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
        if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) {
            continue;
        }
-
        if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
+            auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes();
+            if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) {
+                continue;
+            }
            auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
            if (!dependentCsr.isLatestTaskCountFlushed()) {
                flushDependentCsr(dependentCsr, csrDeps);
-            } else {
-                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
            }
-
-            auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
-            currentCsr.getResidencyAllocations().push_back(graphicsAllocation);
+            csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer);
        }
    }
 }
--- a/opencl/source/helpers/properties_helper.h
+++ b/opencl/source/helpers/properties_helper.h
@ -25,7 +25,7 @@ struct EventsRequest {
        : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {}

    void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const;
-    void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
+    void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
    void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const;

    cl_uint numEventsInWaitList;
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@ -118,10 +118,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate

 CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                                           bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
+                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount,
+                                           TagNodeBase *multiRootDeviceSyncNode)
    : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
      commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel),
-      kernelCount(kernelCount), preemptionMode(preemptionMode) {
+      kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) {
    UNRECOVERABLE_IF(nullptr == this->kernel);
    kernel->incRefInternal();
 }
@ -163,6 +164,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
        printfHandler->makeResident(commandStreamReceiver);
    }
    makeTimestampPacketsResident(commandStreamReceiver);
+    if (multiRootDeviceSyncNode != nullptr) {
+        commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation());
+    }

    if (kernelOperation->blitPropertiesContainer.size() > 0) {
        CsrDependencies csrDeps;
@ -214,7 +218,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
        false);                                                                           // hasRelaxedOrderingDependencies

    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
    }

    const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
@ -307,7 +311,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() {
    blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];

    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr);
+        eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr);
    }

    const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
@ -389,7 +393,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term
        false);                                                                // hasRelaxedOrderingDependencies

    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
    }

    const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
--- a/opencl/source/helpers/task_information.h
+++ b/opencl/source/helpers/task_information.h
@ -127,7 +127,7 @@ class CommandComputeKernel : public Command {
  public:
    CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                         bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount);
+                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode);

    ~CommandComputeKernel() override;

@ -146,6 +146,7 @@ class CommandComputeKernel : public Command {
    Kernel *kernel;
    uint32_t kernelCount;
    PreemptionMode preemptionMode;
+    TagNodeBase *multiRootDeviceSyncNode;
 };

 class CommandWithoutKernel : public Command {
--- a/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp
+++ b/opencl/source/xe_hp_core/command_queue_xe_hp_core.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2022 Intel Corporation
+ * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp
+++ b/opencl/source/xe_hpc_core/command_queue_xe_hpc_core.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2022 Intel Corporation
+ * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp
+++ b/opencl/source/xe_hpg_core/command_queue_xe_hpg_core.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2022 Intel Corporation
+ * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
--- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -19,6 +19,7 @@
 #include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
 #include "opencl/test/unit_test/fixtures/buffer_fixture.h"
 #include "opencl/test/unit_test/fixtures/image_fixture.h"
+#include "opencl/test/unit_test/helpers/cl_hw_parse.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
@ -193,6 +194,7 @@ class MockCommandStreamReceiverWithFailingFlushBatchedSubmission : public MockCo
 template <typename GfxFamily>
 struct MockCommandQueueHwWithOverwrittenCsr : public CommandQueueHw<GfxFamily> {
    using CommandQueueHw<GfxFamily>::CommandQueueHw;
+    using CommandQueueHw<GfxFamily>::timestampPacketContainer;
    MockCommandStreamReceiverWithFailingFlushBatchedSubmission *csr;
    CommandStreamReceiver &getGpgpuCommandStreamReceiver() const override { return *csr; }
 };
--- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp
@ -21,6 +21,7 @@
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
+#include "opencl/test/unit_test/mocks/mock_mdi.h"

 using namespace NEO;

@ -970,4 +971,4 @@ HWTEST_F(CommandQueueHwTest, GivenBuiltinKernelWhenBuiltinDispatchInfoBuilderIsP
    EXPECT_EQ(builder.paramsToUse.elws.x, dispatchInfo->getEnqueuedWorkgroupSize().x);
    EXPECT_EQ(builder.paramsToUse.offset.x, dispatchInfo->getOffset().x);
    EXPECT_EQ(builder.paramsToUse.kernel, dispatchInfo->getKernel());
-}
+}
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@ -31,6 +31,7 @@
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_buffer.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
+#include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_mdi.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021-2022 Intel Corporation
+ * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -557,11 +557,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp
    MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector<Kernel *>({kernel1.mockKernel, kernel2.mockKernel}));

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    size_t sizeWithDisabled = cmdQ.requestedCmdStreamSize;

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    size_t sizeWithEnabled = cmdQ.requestedCmdStreamSize;

    size_t additionalSize = 0u;
@ -669,7 +669,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenAutoLocal
    EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer());

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false,
-                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false);
+                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr);
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
    expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize);
    EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS);
@ -738,7 +738,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
    EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer());

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false,
-                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false);
+                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr);
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
    expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize);
    EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS);
--- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
@ -234,7 +234,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg
    Surface *surfaces[] = {nullptr};
    mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                             blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr);
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr, nullptr);
    EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue);
 }

@ -267,7 +267,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl
    Surface *surfaces[] = {nullptr};
    mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                             blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation());
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr);
    EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue);
    EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation);
    EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation);
@ -351,7 +351,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK

    timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
    BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
+                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr);

    BlitPropertiesContainer blitPropertiesContainer;
    blitPropertiesContainer.push_back(blitProperties);
@ -395,7 +395,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA
    mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true, bcsCsr);
    timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
    BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
+                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr);
    BlitPropertiesContainer blitPropertiesContainer;
    blitPropertiesContainer.push_back(blitProperties);

@ -441,7 +441,7 @@ HWTEST_F(DispatchFlagsTests, givenMockKernelWhenSettingAdditionalKernelExecInfoT
    std::vector<Surface *> v;

    pKernel->setAdditionalKernelExecInfo(123u);
-    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1));
+    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr));
    cmd->submit(1u, false);
    EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, 123u);

--- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp
@ -54,7 +54,7 @@ HWTEST2_F(DispatchFlagsTests, whenSubmittingKernelWithAdditionalKernelExecInfoTh
    std::vector<Surface *> v;

    pKernel->setAdditionalKernelExecInfo(AdditionalKernelExecInfo::DisableOverdispatch);
-    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1));
+    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr));
    cmd->submit(1u, false);
    EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, AdditionalKernelExecInfo::DisableOverdispatch);

--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@ -2003,10 +2003,10 @@ HWTEST_F(PauseOnGpuTests, givenGpuScratchWriteEnabledWhenEstimatingCommandStream
    dispatchInfo.setKernel(mockKernel.mockKernel);
    multiDispatchInfo.push(dispatchInfo);

-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    DebugManager.flags.GpuScratchRegWriteAfterWalker.set(1);

-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);

    EXPECT_EQ(baseCommandStreamSize + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM), extendedCommandStreamSize);
 }
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
@ -1014,8 +1014,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW
    dispatchInfo.setKernel(mockKernel.mockKernel);
    multiDispatchInfo.push(dispatchInfo);

-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr);

    EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size + MemorySynchronizationCommands<FamilyType>::getSizeForSingleBarrier(false), extendedCommandStreamSize);
 }
@ -1033,8 +1033,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMulti
    dispatchInfo.setKernel(mockKernel.mockKernel);
    multiDispatchInfo.push(dispatchInfo);

-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr);

    EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size + ImplicitScalingDispatch<FamilyType>::getBarrierSize(csr.peekHwInfo(), false, false), extendedCommandStreamSize);
 }
@ -1047,8 +1047,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithWait
    dispatchInfo.setKernel(mockKernel.mockKernel);
    multiDispatchInfo.push(dispatchInfo);

-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true, nullptr);

    EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size, extendedCommandStreamSize);
 }
--- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -23,6 +23,7 @@
 #include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h"
 #include "opencl/test/unit_test/fixtures/image_fixture.h"
 #include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_event.h"

 using namespace NEO;

@ -96,7 +97,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenFillingBufferThenHeapsAndCommandBufferCo
    auto usedAfterSSH = ssh.getUsed();

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_FILL_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -149,7 +150,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenCopyingBufferThenHeapsAndCommandBufferCo
    auto usedAfterSSH = ssh.getUsed();

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -203,7 +204,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferNonBlockingThenHeapsAndComm
    auto usedAfterSSH = ssh.getUsed();

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -258,7 +259,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferBlockingThenThenHeapsAndCom
    auto usedAfterSSH = ssh.getUsed();

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -313,7 +314,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferNonBlockingThenHeapsAndComm
    auto usedAfterSSH = ssh.getUsed();

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -365,7 +366,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand
    auto usedAfterSSH = ssh.getUsed();

    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -380,6 +381,68 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand
    EXPECT_GE(expectedSizeSSH, usedAfterSSH - usedBeforeSSH);
 }

+HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForSingleDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) {
+    UltClDeviceFactory deviceFactory{1, 0};
+    DebugManager.flags.EnableMultiRootDeviceContexts.set(true);
+
+    cl_device_id devices[] = {deviceFactory.rootDevices[0]};
+
+    MockContext pContext(ClDeviceVector(devices, 1));
+    MockKernelWithInternals mockKernel(*pContext.getDevices()[0]);
+    DispatchInfo dispatchInfo;
+    MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
+    dispatchInfo.setKernel(mockKernel.mockKernel);
+    multiDispatchInfo.push(dispatchInfo);
+    auto event = std::make_unique<MockEvent<Event>>(&pContext, nullptr, 0, 0, 0);
+    cl_event clEvent = event.get();
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent);
+
+    EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize);
+}
+
+HWTEST_F(GetSizeRequiredBufferTest, GivenUserEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) {
+    UltClDeviceFactory deviceFactory{2, 0};
+    DebugManager.flags.EnableMultiRootDeviceContexts.set(true);
+
+    cl_device_id devices[] = {deviceFactory.rootDevices[0],
+                              deviceFactory.rootDevices[1]};
+
+    MockContext pContext(ClDeviceVector(devices, 2));
+    MockKernelWithInternals mockKernel(*pContext.getDevices()[0]);
+    DispatchInfo dispatchInfo;
+    MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
+    dispatchInfo.setKernel(mockKernel.mockKernel);
+    multiDispatchInfo.push(dispatchInfo);
+    auto userEvent1 = std::make_unique<UserEvent>(&pContext);
+    cl_event clEvent = userEvent1.get();
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent);
+
+    EXPECT_EQ(baseCommandStreamSize, extendedCommandStreamSize);
+}
+
+HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsAdded) {
+    UltClDeviceFactory deviceFactory{2, 0};
+    DebugManager.flags.EnableMultiRootDeviceContexts.set(true);
+
+    cl_device_id devices[] = {deviceFactory.rootDevices[0],
+                              deviceFactory.rootDevices[1]};
+
+    MockContext pContext(ClDeviceVector(devices, 2));
+    MockKernelWithInternals mockKernel(*pContext.getDevices()[0]);
+    DispatchInfo dispatchInfo;
+    MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
+    dispatchInfo.setKernel(mockKernel.mockKernel);
+    multiDispatchInfo.push(dispatchInfo);
+    auto event = std::make_unique<MockEvent<Event>>(&pContext, nullptr, 0, 0, 0);
+    cl_event clEvent = event.get();
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &clEvent);
+
+    EXPECT_EQ(baseCommandStreamSize + MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pContext.getDevices()[0]->getHardwareInfo(), false), extendedCommandStreamSize);
+}
+
 HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSizeIsComputedThenItIsProperlyAligned) {
    auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer,
                                                                            pCmdQ->getClDevice());
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
@ -1903,7 +1903,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke
    blockedCommandsData->setHeaps(dsh, ioh, ssh);

    std::vector<Surface *> surfaces;
-    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1));
+    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr));
    event->submitCommand(false);

    EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired);
@ -1948,7 +1948,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenInitializeBc
    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());

    std::vector<Surface *> surfaces;
-    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1));
+    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr));
    event->submitCommand(false);
    EXPECT_FALSE(pCmdQ->isCsrLocked);
 }
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
@ -7,6 +7,7 @@

 #include "shared/source/command_stream/wait_status.h"
 #include "shared/test/common/mocks/mock_command_stream_receiver.h"
+#include "shared/test/common/mocks/mock_timestamp_container.h"
 #include "shared/test/common/mocks/ult_device_factory.h"
 #include "shared/test/common/test_macros/hw_test.h"

@ -14,6 +15,7 @@
 #include "opencl/source/event/user_event.h"
 #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
 #include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
 #include "opencl/test/unit_test/test_macros/test_checks_ocl.h"
@ -45,12 +47,18 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu
    MockGraphicsAllocation svmAlloc(svmPtr, svmSize);

    Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    auto node1 = event1.getMultiRootTimestampSyncNode();
    Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
    Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    auto node3 = event3.getMultiRootTimestampSyncNode();
    Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    auto node4 = event4.getMultiRootTimestampSyncNode();
    Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    auto node5 = event5.getMultiRootTimestampSyncNode();
    UserEvent userEvent1(&pCmdQ1->getContext());
+    userEvent1.getMultiRootTimestampSyncNode();
    UserEvent userEvent2(&pCmdQ2->getContext());
+    userEvent2.getMultiRootTimestampSyncNode();

    userEvent1.setStatus(CL_COMPLETE);
    userEvent2.setStatus(CL_COMPLETE);
@ -87,12 +95,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu
        EXPECT_EQ(2u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());

        auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
    }

    {
@ -115,12 +123,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu
        EXPECT_EQ(2u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());

        auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
    }
    alignedFree(svmPtr);
 }
@ -147,17 +155,24 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
    cl_device_id devices[] = {device1, device2, device3};

    auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 3), false);
-
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(context->getRootDeviceIndices(), device1->getExecutionEnvironment()->memoryManager.get(), 10u);
+    std::unique_ptr<TagAllocatorBase> uniquePtr(mockTagAllocator.release());
+    context->setMultiRootDeviceTimestampPacketAllocator(uniquePtr);
    auto pCmdQ1 = context->getSpecialQueue(1u);
    auto pCmdQ2 = context->getSpecialQueue(2u);
    auto pCmdQ3 = context->getSpecialQueue(3u);

    Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    auto node1 = event1.getMultiRootTimestampSyncNode();
    Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
    Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    auto node3 = event3.getMultiRootTimestampSyncNode();
    Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    auto node4 = event4.getMultiRootTimestampSyncNode();
    Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21);
+    auto node5 = event5.getMultiRootTimestampSyncNode();
    Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    auto node6 = event6.getMultiRootTimestampSyncNode();
    UserEvent userEvent1(&pCmdQ1->getContext());
    UserEvent userEvent2(&pCmdQ2->getContext());

@ -190,16 +205,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
        EXPECT_EQ(3u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());

        auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());

        auto semaphoreCmd2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[2]));
-        EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node6->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress());
    }

    {
@ -215,16 +230,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
        EXPECT_EQ(3u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());

        auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());

        auto semaphoreCmd2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[2]));
-        EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress());
    }

    {
@ -249,8 +264,8 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
        EXPECT_EQ(1u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
    }
 }

@ -286,11 +301,16 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    event1.getMultiRootTimestampSyncNode();
    Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
    Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
+    event3.getMultiRootTimestampSyncNode();
    Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    event4.getMultiRootTimestampSyncNode();
    Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    event5.getMultiRootTimestampSyncNode();
    Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    event6.getMultiRootTimestampSyncNode();
    UserEvent userEvent1(&pCmdQ1->getContext());
    UserEvent userEvent2(&pCmdQ2->getContext());

@ -316,10 +336,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro

        EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
        CsrDependencies csrDeps;
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver());
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver());

-        EXPECT_EQ(0u, csrDeps.taskCountContainer.size());
-        EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
+        EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size());
+        EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<FamilyType>(csrDeps));
    }

    {
@ -342,10 +362,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro

        EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
        CsrDependencies csrDeps;
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());

-        EXPECT_EQ(3u, csrDeps.taskCountContainer.size());
-        EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
+        EXPECT_EQ(3u, csrDeps.multiRootTimeStampSyncContainer.size());
+        EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<FamilyType>(csrDeps));
    }
 }

@ -376,6 +396,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW

    cl_event outputEvent2{};

+    auto currentCsUsedCmdq1 = pCmdQ1->getCS(0).getUsed();
    pCmdQ2->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
                              1,
                              &outputEvent1,
@ -399,14 +420,12 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
        nullptr);
    {
        HardwareParse csHwParser;
-        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
+        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0), currentCsUsedCmdq1);
        auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());

        EXPECT_EQ(0u, semaphores.size());
    }
    userEvent1.setStatus(CL_COMPLETE);
-    event1->release();
-    event2->release();
    pCmdQ1->finish();
    pCmdQ2->finish();
    {
@ -417,7 +436,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
        EXPECT_EQ(1u, semaphores.size());
        auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress());
    }
    {
        HardwareParse csHwParser;
@ -426,9 +445,11 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW

        EXPECT_EQ(1u, semaphores.size());
        auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(0u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress());
    }
+    event1->release();
+    event2->release();
    buffer->release();
 }

@ -458,14 +479,14 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
    char hostPtr[MemoryConstants::pageSize]{};

    cl_event outputEvent2{};
-
+    auto currentCsUsed = pCmdQ1->getCS(0).getUsed();
    pCmdQ1->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
                              1,
                              &outputEvent1,
                              &outputEvent2);
    {
        HardwareParse csHwParser;
-        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
+        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0), currentCsUsed);
        auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());

        EXPECT_EQ(0u, semaphores.size());
@ -482,7 +503,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
        nullptr);
    {
        HardwareParse csHwParser;
-        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
+        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0), currentCsUsed);
        auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());

        EXPECT_EQ(0u, semaphores.size());
@ -590,9 +611,6 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
        EXPECT_EQ(0u, semaphores.size());
    }
    userEvent1.setStatus(CL_COMPLETE);
-    event1->release();
-    event2->release();
-    event3->release();
    pCmdQ1->finish();
    pCmdQ2->finish();

@ -604,7 +622,8 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
        EXPECT_EQ(1u, semaphores.size());
        auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
+        auto node = event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0);
+        EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd->getSemaphoreGraphicsAddress());
    }
    {
        HardwareParse csHwParser;
@ -620,8 +639,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW

        EXPECT_EQ(2u, semaphores.size());
        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(0u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        auto node = event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0);
+        EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd0->getSemaphoreGraphicsAddress());
    }
    {
        HardwareParse csHwParser;
@ -630,6 +650,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW

        EXPECT_LE(1u, semaphores.size());
    }
+    event1->release();
+    event2->release();
+    event3->release();
    buffer->release();
    pCmdQ1->release();
    pCmdQ2->release();
@ -961,3 +984,73 @@ HWTEST_F(BcsCrossDeviceMigrationTests, givenBufferWithMultiStorageWhenEnqueueRea

    EXPECT_EQ(buffer.get(), cmdQueue->migrateMultiGraphicsAllocationsReceivedOperationParams.srcMemObj);
 }
+
+HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) {
+    Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
+    Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
+    Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    UserEvent userEvent1(&pCmdQ1->getContext());
+    UserEvent userEvent2(&pCmdQ2->getContext());
+
+    userEvent1.setStatus(CL_COMPLETE);
+    userEvent2.setStatus(CL_COMPLETE);
+    {
+        cl_event eventWaitList[] =
+            {
+                &event1,
+                &event2,
+                &event3,
+                &event4,
+                &event5,
+                &event6,
+                &userEvent1,
+            };
+        cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
+
+        EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
+        CsrDependencies csrDeps;
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
+
+        EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size());
+    }
+}
+HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeContainersThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) {
+
+    MockEvent<Event> event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    event1.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer());
+    MockEvent<Event> event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
+    MockEvent<Event> event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    event3.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer());
+    MockEvent<Event> event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    event4.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer());
+    MockEvent<Event> event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    event5.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer());
+    MockEvent<Event> event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    event6.multiRootDeviceTimestampPacketContainer.reset(new TimestampPacketContainer());
+    UserEvent userEvent1(&pCmdQ1->getContext());
+
+    userEvent1.setStatus(CL_COMPLETE);
+
+    {
+        cl_event eventWaitList[] =
+            {
+                &event1,
+                &event2,
+                &event3,
+                &event4,
+                &event5,
+                &event6,
+                &userEvent1,
+            };
+        cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
+
+        EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
+        CsrDependencies csrDeps;
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
+
+        EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size());
+    }
+}
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp
@ -11,6 +11,7 @@
 #include "shared/source/command_stream/wait_status.h"
 #include "shared/source/helpers/constants.h"
 #include "shared/source/helpers/logical_state_helper.h"
+#include "shared/source/os_interface/device_factory.h"
 #include "shared/source/os_interface/hw_info_config.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/engine_descriptor_helper.h"
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp
@ -1791,4 +1791,4 @@ HWTEST_F(BcsTests, givenHostPtrToImageWhenBlitBufferIsCalledThenBlitCmdIsFound)
    hwParser.parseCommands<FamilyType>(csr.commandStream, 0);
    auto cmdIterator = find<typename FamilyType::XY_BLOCK_COPY_BLT *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
    EXPECT_NE(hwParser.cmdList.end(), cmdIterator);
-}
+}
--- a/opencl/test/unit_test/event/event_builder_tests.cpp
+++ b/opencl/test/unit_test/event/event_builder_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -79,7 +79,7 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) {
      public:
        using CommandComputeKernel::eventsWaitlist;
        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
    };

    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
@ -129,7 +129,7 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA
      public:
        using CommandComputeKernel::eventsWaitlist;
        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
    };

    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@ -485,7 +485,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandFalseWhenSubmittingCommandsThenRefA

    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    v.push_back(bufferSurf);
-    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr);
    event.setCommand(std::unique_ptr<Command>(cmd));

    auto taskLevelBefore = csr.peekTaskLevel();
@ -528,7 +528,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandTrueWhenSubmittingCommandsThenRefAp
    NullSurface *surface = new NullSurface;
    v.push_back(surface);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr);
    event.setCommand(std::unique_ptr<Command>(cmd));

    auto taskLevelBefore = csr.peekTaskLevel();
@ -579,7 +579,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut

    std::vector<Surface *> v;
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr);
    event.setCommand(std::unique_ptr<Command>(cmd));

    event.submitCommand(false);
@ -631,7 +631,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnCmdQueueWaitFunctionAndBlockedKernelWit

    std::vector<Surface *> v;
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr);
    event.setCommand(std::unique_ptr<Command>(cmd));

    event.submitCommand(false);
@ -680,7 +680,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnPrintingEnqueueOutputAndBlockedKernelWi

    std::vector<Surface *> v;
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr);
    event.setCommand(std::unique_ptr<Command>(cmd));

    event.submitCommand(false);
@ -1169,7 +1169,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) {
      public:
        using CommandComputeKernel::eventsWaitlist;
        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
    };
    class MockEvent : public Event {
      public:
@ -1750,7 +1750,7 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl
    blockedCommandsData->setHeaps(dsh, ioh, ssh);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    std::vector<Surface *> v;
-    auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr);
    event->setCommand(std::unique_ptr<Command>(cmd));

    FlushStamp expectedFlushStamp = 0;
@ -1893,3 +1893,35 @@ TEST(EventTimestampTest, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabled
        EXPECT_TRUE(event.isWaitForTimestampsEnabled());
    }
 }
+TEST(MultiRootEvent, givenContextWithMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsNotCreated) {
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext context{};
+    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
+    auto allocator = cmdQ.getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(context.getRootDeviceIndices());
+    auto allocatorPtr = allocator.get();
+    context.setMultiRootDeviceTimestampPacketAllocator(allocator);
+    MockEvent<Event> event{&cmdQ, CL_COMMAND_MARKER, 0, 0};
+    event.getMultiRootTimestampSyncNode();
+    EXPECT_EQ(allocatorPtr, context.getMultiRootDeviceTimestampPacketAllocator());
+}
+TEST(MultiRootEvent, givenContextWithoutMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsCreated) {
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext context{};
+    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
+    EXPECT_EQ(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr);
+    MockEvent<Event> event{&cmdQ, CL_COMMAND_MARKER, 0, 0};
+    event.getMultiRootTimestampSyncNode();
+    EXPECT_NE(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr);
+}
+TEST(MultiRootEvent, givenEventWithTagWhenEventGetsNewTagThenNewTagContainerIsNotCreated) {
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext context{};
+    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
+    MockEvent<Event> event{&cmdQ, CL_COMMAND_MARKER, 0, 0};
+    EXPECT_EQ(event.getMultiRootDeviceTimestampPacketNodes(), nullptr);
+    event.getMultiRootTimestampSyncNode();
+    auto containerPtr = event.getMultiRootDeviceTimestampPacketNodes();
+    EXPECT_NE(containerPtr, nullptr);
+    event.getMultiRootTimestampSyncNode();
+    EXPECT_EQ(containerPtr, event.getMultiRootDeviceTimestampPacketNodes());
+}
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@ -2435,7 +2435,7 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed

    gtpinNotifyKernelSubmit(kernel.mockMultiDeviceKernel, mockCmdQ.get());

-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr));
    CompletionStamp stamp = command->submit(20, false);

    ASSERT_EQ(1u, kernelExecQueue.size());
--- a/opencl/test/unit_test/helpers/task_information_tests.cpp
+++ b/opencl/test/unit_test/helpers/task_information_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -154,7 +154,7 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL
      public:
        using CommandComputeKernel::eventsWaitlist;
        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
    };

    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
@ -291,7 +291,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD
    for (auto &surface : surfaces) {
        requiresCoherency |= surface->IsCoherent;
    }
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr));
    command->submit(20, false);

    EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode);
@ -339,7 +339,7 @@ HWTEST_F(DispatchFlagsTests, givenClCommandCopyImageWhenSubmitThenFlushTextureCa
    for (auto &surface : surfaces) {
        requiresCoherency |= surface->IsCoherent;
    }
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1, nullptr));
    command->submit(20, false);

    EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode);
@ -425,7 +425,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD
    bool flushDC = false;
    bool slmUsed = false;
    bool ndRangeKernel = false;
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr));
    command->submit(20, false);

    EXPECT_TRUE(mockCsr->passedDispatchFlags.epilogueRequired);
--- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp
+++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -34,11 +34,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl
    auto mockCmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), nullptr);

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize;

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;

    auto extendedSize = sizeWithDisabled + sizeof(typename FamilyType::PIPE_CONTROL);
@ -52,7 +52,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false,
-                                                            false, multiDispatchInfo, nullptr, 0, false, false);
+                                                            false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize;

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
@ -82,7 +82,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat
    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(
        csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);

-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;

    size_t sizeForNodeDependency = 0;
@ -143,7 +143,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr
    auto mockCmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), nullptr);

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize;

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
@ -172,7 +172,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr
    CsrDependencies csrDeps;
    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);

-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;

    size_t sizeForNodeDependency = 0;
--- a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 Intel Corporation
+ * Copyright (C) 2019-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -212,7 +212,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd
    {
        EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));

-        initialSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false);
+        initialSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr);
    }

    {
@ -226,7 +226,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd
        ultCsr.multiOsContextCapable = false;
        EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));

-        sizeWithCacheFlush = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false);
+        sizeWithCacheFlush = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr);
    }

    EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush);
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@ -504,6 +504,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
    using BaseClass::CommandStreamReceiver;

    TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; }
+    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr<TagAllocatorBase>(nullptr); }

    SubmissionStatus flushTagUpdate() override { return SubmissionStatus::SUCCESS; };
    void updateTagFromWait() override{};
--- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
@ -766,11 +766,11 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati
    auto &hwInfo = cmdQ->getDevice().getHardwareInfo();

    auto readBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, csrDependencies, false, false,
-                                                                                   true, *cmdQ, multiDispatchInfo, false, false);
+                                                                                   true, *cmdQ, multiDispatchInfo, false, false, nullptr);
    auto writeBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, csrDependencies, false, false,
-                                                                                    true, *cmdQ, multiDispatchInfo, false, false);
+                                                                                    true, *cmdQ, multiDispatchInfo, false, false, nullptr);
    auto copyBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false,
-                                                                                   true, *cmdQ, multiDispatchInfo, false, false);
+                                                                                   true, *cmdQ, multiDispatchInfo, false, false, nullptr);
    auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>();

    if (cmdQ->isCacheFlushForBcsRequired()) {
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@ -249,6 +249,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
    using BaseClass::latestSentEnqueueType;
    using BaseClass::obtainCommandStream;
    using BaseClass::obtainNewTimestampPacketNodes;
+    using BaseClass::processDispatchForKernels;
    using BaseClass::requiresCacheFlushAfterWalker;
    using BaseClass::throttle;
    using BaseClass::timestampPacketContainer;
--- a/opencl/test/unit_test/mocks/mock_event.h
+++ b/opencl/test/unit_test/mocks/mock_event.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -39,6 +39,7 @@ struct MockEvent : public BaseEventType {
    using Event::calculateSubmitTimestampData;
    using Event::isWaitForTimestampsEnabled;
    using Event::magic;
+    using Event::multiRootDeviceTimestampPacketContainer;
    using Event::queueTimeStamp;
    using Event::submitTimeStamp;
    using Event::timestampPacketContainer;
--- a/opencl/test/unit_test/profiling/profiling_tests.cpp
+++ b/opencl/test/unit_test/profiling/profiling_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -71,13 +71,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor

    MultiDispatchInfo multiDispatchInfo(&kernel);
    auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(), true, false, false,
-                                                                                               multiDispatchInfo, nullptr, 0, false, false);
+                                                                                               multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel, {});
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize);

    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, CsrDependencies(), true, false, false,
-                                                                            multiDispatchInfo, nullptr, 0, false, false);
+                                                                            multiDispatchInfo, nullptr, 0, false, false, nullptr);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel, {});
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
@ -93,13 +93,13 @@ HWTEST_F(ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithNoKerne
    MultiDispatchInfo multiDispatchInfo(nullptr);
    auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, CsrDependencies(),
                                                                                                        true, false, false,
-                                                                                                        multiDispatchInfo, nullptr, 0, false, false);
+                                                                                                        multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr, {});
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);

    auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, CsrDependencies(), true,
-                                                                                false, false, multiDispatchInfo, nullptr, 0, false, false);
+                                                                                false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr, {});
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);
@ -121,9 +121,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor
    multiDispatchInfo.push(dispatchInfo);
    multiDispatchInfo.push(dispatchInfo);
    auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, CsrDependencies(), true, false, false,
-                                                                            multiDispatchInfo, nullptr, 0, false, false);
+                                                                            multiDispatchInfo, nullptr, 0, false, false, nullptr);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
 }
@ -741,13 +741,13 @@ HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCount
    MultiDispatchInfo multiDispatchInfo(nullptr);
    auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, CsrDependencies(),
                                                                                                        true, true, false, multiDispatchInfo,
-                                                                                                        nullptr, 0, false, false);
+                                                                                                        nullptr, 0, false, false, nullptr);
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr, {});
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);

    auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, CsrDependencies(), true, true, false,
-                                                                                multiDispatchInfo, nullptr, 0, false, false);
+                                                                                multiDispatchInfo, nullptr, 0, false, false, nullptr);
    expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr, {});
    EXPECT_GE(expectedSizeCS, requiredSize);
    EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);
--- a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp
+++ b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp
@ -243,10 +243,14 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent
    MockGraphicsAllocation svmAlloc(svmPtr, svmSize);

    Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    auto node1 = event1.getMultiRootTimestampSyncNode();
    Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
    Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    auto node3 = event3.getMultiRootTimestampSyncNode();
    Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    auto node4 = event4.getMultiRootTimestampSyncNode();
    Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    auto node5 = event5.getMultiRootTimestampSyncNode();
    UserEvent userEvent1(&pCmdQ1->getContext());
    UserEvent userEvent2(&pCmdQ2->getContext());

@ -285,12 +289,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent
        EXPECT_EQ(3u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());

        auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
    }

    {
@ -313,12 +317,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent
        EXPECT_EQ(3u, semaphores.size());

        auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());

        auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
    }
    alignedFree(svmPtr);
 }
--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@ -221,6 +221,7 @@ class CommandStreamReceiver {
    TagAllocatorBase *getEventTsAllocator();
    TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize);
    virtual TagAllocatorBase *getTimestampPacketAllocator() = 0;
+    virtual std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) = 0;

    virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation);

--- a/shared/source/command_stream/command_stream_receiver_hw.h
+++ b/shared/source/command_stream/command_stream_receiver_hw.h
@ -130,6 +130,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
    GraphicsAllocation *getClearColorAllocation() override;

    TagAllocatorBase *getTimestampPacketAllocator() override;
+    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override;

    void postInitFlagsSetup() override;
    void programActivePartitionConfig(LinearStream &csr);
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@ -406,7 +406,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    auto commandStreamStartCSR = commandStreamCSR.getUsed();

    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
-    TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);

    programActivePartitionConfigFlushTask(commandStreamCSR);
    programEngineModeCommands(commandStreamCSR, dispatchFlags);
@ -980,7 +980,7 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
    }

    size += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(dispatchFlags.csrDependencies);
-    size += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(dispatchFlags.csrDependencies);
+    size += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(dispatchFlags.csrDependencies);

    size += EncodeKernelArgsBuffer<GfxFamily>::getKernelArgsBufferCmdsSize(kernelArgsBufferAllocation, logicalStateHelper.get());

@ -1196,7 +1196,7 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert

    for (auto &blitProperties : blitPropertiesContainer) {
        TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
-        TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
+        TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);

        BlitCommandsHelper<GfxFamily>::encodeWa(commandStream, blitProperties, latestSentBcsWaValue);

@ -1229,6 +1229,12 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert
        if (blitProperties.clearColorAllocation) {
            makeResident(*blitProperties.clearColorAllocation);
        }
+        if (blitProperties.multiRootDeviceEventSync != nullptr) {
+            MiFlushArgs args;
+            args.commandWithPostSync = true;
+            args.notifyEnable = isUsedNotifyEnableForPostSync();
+            EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, blitProperties.multiRootDeviceEventSync->getGpuAddress() + blitProperties.multiRootDeviceEventSync->getContextEndOffset(), std::numeric_limits<uint64_t>::max(), args, hwInfo);
+        }
    }

    BlitCommandsHelper<GfxFamily>::programGlobalSequencerFlush(commandStream);
@ -1245,7 +1251,6 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert

        MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), false, peekHwInfo());
    }
-
    if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) {
        BlitCommandsHelper<GfxFamily>::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(),
                                                                  DebugPauseState::waitingForUserEndConfirmation,
@ -1522,6 +1527,11 @@ TagAllocatorBase *CommandStreamReceiverHw<GfxFamily>::getTimestampPacketAllocato
    return timestampPacketAllocator.get();
 }

+template <typename GfxFamily>
+std::unique_ptr<TagAllocatorBase> CommandStreamReceiverHw<GfxFamily>::createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) {
+    auto &gfxCoreHelper = getGfxCoreHelper();
+    return gfxCoreHelper.createTimestampPacketAllocator(rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), getType(), osContext->getDeviceBitfield());
+}
 template <typename GfxFamily>
 void CommandStreamReceiverHw<GfxFamily>::postInitFlagsSetup() {
    useNewResourceImplicitFlush = checkPlatformSupportsNewResourceImplicitFlush();
--- a/shared/source/command_stream/csr_deps.h
+++ b/shared/source/command_stream/csr_deps.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 Intel Corporation
+ * Copyright (C) 2020-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -22,7 +22,7 @@ class CsrDependencies {
        All
    };

-    StackVec<std::pair<TaskCountType, uint64_t>, 32> taskCountContainer;
+    StackVec<TimestampPacketContainer *, 32> multiRootTimeStampSyncContainer;
    StackVec<TimestampPacketContainer *, 32> timestampPacketContainer;

    void makeResident(CommandStreamReceiver &commandStreamReceiver) const;
--- a/shared/source/helpers/blit_commands_helper.cpp
+++ b/shared/source/helpers/blit_commands_helper.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 Intel Corporation
+ * Copyright (C) 2019-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -50,6 +50,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants:
        BlitterConstants::BlitDirection::HostPtrToImage == blitDirection) {
        return {
            nullptr,                       // outputTimestampPacket
+            nullptr,                       // multiRootDeviceEventSync
            blitDirection,                 // blitDirection
            {},                            // csrDependencies
            AuxTranslationDirection::None, // auxTranslationDirection
@ -73,6 +74,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants:
    } else {
        return {
            nullptr,                       // outputTimestampPacket
+            nullptr,                       // multiRootDeviceEventSync
            blitDirection,                 // blitDirection
            {},                            // csrDependencies
            AuxTranslationDirection::None, // auxTranslationDirection
@ -104,6 +106,7 @@ BlitProperties BlitProperties::constructPropertiesForCopy(GraphicsAllocation *ds

    return {
        nullptr,                                         // outputTimestampPacket
+        nullptr,                                         // multiRootDeviceEventSync
        BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection
        {},                                              // csrDependencies
        AuxTranslationDirection::None,                   // auxTranslationDirection
@ -128,6 +131,7 @@ BlitProperties BlitProperties::constructPropertiesForAuxTranslation(AuxTranslati
    auto allocationSize = allocation->getUnderlyingBufferSize();
    return {
        nullptr,                                         // outputTimestampPacket
+        nullptr,                                         // multiRootDeviceEventSync
        BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection
        {},                                              // csrDependencies
        auxTranslationDirection,                         // auxTranslationDirection
--- a/shared/source/helpers/blit_commands_helper.h
+++ b/shared/source/helpers/blit_commands_helper.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2022 Intel Corporation
+ * Copyright (C) 2019-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -61,6 +61,7 @@ struct BlitProperties {
                                                   CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr);

    TagNodeBase *outputTimestampPacket = nullptr;
+    TagNodeBase *multiRootDeviceEventSync = nullptr;
    BlitterConstants::BlitDirection blitDirection = BlitterConstants::BlitDirection::BufferToHostPtr;
    CsrDependencies csrDependencies;
    AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;
--- a/shared/source/helpers/blit_commands_helper_base.inl
+++ b/shared/source/helpers/blit_commands_helper_base.inl
@ -127,7 +127,7 @@ size_t BlitCommandsHelper<GfxFamily>::estimateBlitCommandSize(const Vec3<size_t>

    sizePerBlit += estimatePostBlitCommandSize();
    return TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDependencies) +
-           TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(csrDependencies) +
+           TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(csrDependencies) +
           (sizePerBlit * nBlits) +
           timestampCmdSize +
           estimatePreBlitCommandSize();
@ -143,6 +143,9 @@ size_t BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(const BlitPropert
        auto isImage = blitProperties.isImageOperation();
        size += BlitCommandsHelper<GfxFamily>::estimateBlitCommandSize(blitProperties.copySize, blitProperties.csrDependencies, updateTimestampPacket,
                                                                       profilingEnabled, isImage, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed);
+        if (blitProperties.multiRootDeviceEventSync != nullptr) {
+            size += EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite();
+        }
    }
    size += BlitCommandsHelper<GfxFamily>::getWaCmdsSize(blitPropertiesContainer);
    size += 2 * MemorySynchronizationCommands<GfxFamily>::getSizeForAdditonalSynchronization(*rootDeviceEnvironment.getHardwareInfo());
--- a/shared/source/helpers/timestamp_packet.h
+++ b/shared/source/helpers/timestamp_packet.h
@ -144,17 +144,11 @@ struct TimestampPacketHelper {
    }

    template <typename GfxFamily>
-    static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
-        auto &taskCountContainer = csrDependencies.taskCountContainer;
-
-        for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) {
-            using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
-            using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
-
-            EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream,
-                                                                  static_cast<uint64_t>(tagAddressPreviousRootDevice),
-                                                                  static_cast<uint32_t>(taskCountPreviousRootDevice),
-                                                                  COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+    static void programCsrDependenciesForForMultiRootDeviceSyncContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
+        for (auto timestampPacketContainer : csrDependencies.multiRootTimeStampSyncContainer) {
+            for (auto &node : timestampPacketContainer->peekNodes()) {
+                TimestampPacketHelper::programSemaphore<GfxFamily>(cmdStream, *node);
+            }
        }
    }

@ -217,8 +211,8 @@ struct TimestampPacketHelper {
    }

    template <typename GfxFamily>
-    static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) {
-        return csrDependencies.taskCountContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
+    static size_t getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(const CsrDependencies &csrDependencies) {
+        return csrDependencies.multiRootTimeStampSyncContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
    }
 };

--- a/shared/test/common/mocks/mock_command_stream_receiver.h
+++ b/shared/test/common/mocks/mock_command_stream_receiver.h
@ -18,6 +18,7 @@
 #include "shared/source/memory_manager/graphics_allocation.h"
 #include "shared/source/memory_manager/surface.h"
 #include "shared/source/os_interface/os_context.h"
+#include "shared/source/utilities/tag_allocator.h"
 #include "shared/test/common/helpers/dispatch_flags_helper.h"

 #include <optional>
@ -94,6 +95,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
    };

    TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; }
+    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr<TagAllocatorBase>(nullptr); }

    CompletionStamp flushTask(
        LinearStream &commandStream,
--- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
+++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
@ -15,10 +15,13 @@
 #include "shared/source/helpers/api_specific_config.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/source/memory_manager/surface.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/device_factory.h"
 #include "shared/source/os_interface/hw_info_config.h"
 #include "shared/source/os_interface/os_interface.h"
 #include "shared/source/utilities/tag_allocator.h"
+#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
+#include "shared/test/common/cmd_parse/hw_parse.h"
 #include "shared/test/common/fixtures/command_stream_receiver_fixture.inl"
 #include "shared/test/common/fixtures/device_fixture.h"
 #include "shared/test/common/helpers/batch_buffer_helper.h"
@ -32,6 +35,7 @@
 #include "shared/test/common/mocks/mock_execution_environment.h"
 #include "shared/test/common/mocks/mock_internal_allocation_storage.h"
 #include "shared/test/common/mocks/mock_memory_manager.h"
+#include "shared/test/common/mocks/mock_timestamp_container.h"
 #include "shared/test/common/mocks/ult_device_factory.h"
 #include "shared/test/common/test_macros/hw_test.h"
 #include "shared/test/common/test_macros/test_checks_shared.h"
@ -2463,3 +2467,89 @@ HWTEST_F(CommandStreamReceiverHwTest, givenVariousCsrModeWhenGettingTbxModeThenE
    ultCsr.commandStreamReceiverType = CommandStreamReceiverType::CSR_TBX_WITH_AUB;
    EXPECT_TRUE(ultCsr.isTbxMode());
 }
+
+HWTEST_F(CommandStreamReceiverHwTest, GivenTwoRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForTwoDevicesCreated) {
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), true, 2u);
+    auto devices = DeviceFactory::createDevices(*executionEnvironment.release());
+    const RootDeviceIndicesContainer indices = {0u, 1u};
+    auto csr = devices[0]->getDefaultEngine().commandStreamReceiver;
+    auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices);
+    class MockTagAllocatorBase : public TagAllocatorBase {
+      public:
+        using TagAllocatorBase::maxRootDeviceIndex;
+    };
+    EXPECT_EQ(reinterpret_cast<MockTagAllocatorBase *>(allocator.get())->maxRootDeviceIndex, 1u);
+}
+HWTEST_F(CommandStreamReceiverHwTest, GivenFiveRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForFiveDevicesCreated) {
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), true, 4u);
+    auto devices = DeviceFactory::createDevices(*executionEnvironment.release());
+    const RootDeviceIndicesContainer indices = {0u, 1u, 2u, 3u};
+    auto csr = devices[0]->getDefaultEngine().commandStreamReceiver;
+    auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices);
+    class MockTagAllocatorBase : public TagAllocatorBase {
+      public:
+        using TagAllocatorBase::maxRootDeviceIndex;
+    };
+    EXPECT_EQ(reinterpret_cast<MockTagAllocatorBase *>(allocator.get())->maxRootDeviceIndex, 3u);
+}
+HWTEST_F(CommandStreamReceiverHwTest, givenMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushAdded) {
+    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
+    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
+
+    auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr,
+                                                                          commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr,
+                                                                          commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(),
+                                                                          commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0,
+                                                                          0, 0, 0, 0, 0, 0, 0);
+    auto tag = mockTagAllocator->getTag();
+    blitProperties.multiRootDeviceEventSync = tag;
+
+    BlitPropertiesContainer container;
+    container.push_back(blitProperties);
+    commandStreamReceiver.flushBcsTask(container, true, false, *pDevice);
+    HardwareParse hwParser;
+    hwParser.parseCommands<FamilyType>(commandStreamReceiver.commandStream, 0);
+
+    auto cmdIterator = find<typename FamilyType::MI_FLUSH_DW *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
+    bool nodeAddressFound = false;
+    while (cmdIterator != hwParser.cmdList.end()) {
+        auto flush = genCmdCast<MI_FLUSH_DW *>(*cmdIterator);
+        if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) {
+            nodeAddressFound = true;
+            break;
+        }
+        cmdIterator = find<typename FamilyType::MI_FLUSH_DW *>(++cmdIterator, hwParser.cmdList.end());
+    }
+    EXPECT_TRUE(nodeAddressFound);
+}
+HWTEST_F(CommandStreamReceiverHwTest, givenNullPtrAsMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushNotAdded) {
+    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
+    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
+
+    auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr,
+                                                                          commandStreamReceiver, commandStreamReceiver.getTagAllocation(), nullptr,
+                                                                          commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(),
+                                                                          commandStreamReceiver.getTagAllocation()->getGpuAddress(), 0,
+                                                                          0, 0, 0, 0, 0, 0, 0);
+    auto tag = mockTagAllocator->getTag();
+
+    BlitPropertiesContainer container;
+    container.push_back(blitProperties);
+    commandStreamReceiver.flushBcsTask(container, true, false, *pDevice);
+    HardwareParse hwParser;
+    hwParser.parseCommands<FamilyType>(commandStreamReceiver.commandStream, 0);
+
+    auto cmdIterator = find<typename FamilyType::MI_FLUSH_DW *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
+    bool nodeAddressFound = false;
+    while (cmdIterator != hwParser.cmdList.end()) {
+        auto flush = genCmdCast<MI_FLUSH_DW *>(*cmdIterator);
+        if (flush->getDestinationAddress() == tag->getGpuAddress() + tag->getContextEndOffset()) {
+            nodeAddressFound = true;
+            break;
+        }
+        cmdIterator = find<typename FamilyType::MI_FLUSH_DW *>(++cmdIterator, hwParser.cmdList.end());
+    }
+    EXPECT_FALSE(nodeAddressFound);
+}
--- a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp
+++ b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp
@ -15,6 +15,7 @@
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/mocks/mock_graphics_allocation.h"
+#include "shared/test/common/mocks/mock_timestamp_container.h"
 #include "shared/test/common/mocks/ult_device_factory.h"
 #include "shared/test/common/test_macros/test_checks_shared.h"

@ -663,3 +664,25 @@ HWTEST2_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlu
    auto cmdIterator = find<typename FamilyType::MI_FLUSH_DW *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
    ASSERT_EQ(hwParser.cmdList.end(), cmdIterator);
 }
+
+HWTEST_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlushDwIsProgramed) {
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
+    auto tag = mockTagAllocator->getTag();
+    BlitProperties blitProperties{};
+    blitProperties.copySize = {1, 1, 1};
+    BlitPropertiesContainer blitPropertiesContainer1;
+    blitPropertiesContainer1.push_back(blitProperties);
+    blitPropertiesContainer1.push_back(blitProperties);
+    blitPropertiesContainer1.push_back(blitProperties);
+
+    auto estimatedSizeWithoutNode = BlitCommandsHelper<FamilyType>::estimateBlitCommandsSize(
+        blitPropertiesContainer1, false, true, false, pDevice->getRootDeviceEnvironment());
+    blitProperties.multiRootDeviceEventSync = tag;
+    BlitPropertiesContainer blitPropertiesContainer2;
+    blitPropertiesContainer2.push_back(blitProperties);
+    blitPropertiesContainer2.push_back(blitProperties);
+    blitPropertiesContainer2.push_back(blitProperties);
+    auto estimatedSizeWithNode = BlitCommandsHelper<FamilyType>::estimateBlitCommandsSize(
+        blitPropertiesContainer2, false, true, false, pDevice->getRootDeviceEnvironment());
+    EXPECT_NE(estimatedSizeWithoutNode, estimatedSizeWithNode);
+}
--- a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp
+++ b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022 Intel Corporation
+ * Copyright (C) 2022-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -301,3 +301,35 @@ HWTEST_F(DeviceTimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacke

    EXPECT_FALSE(tag->canBeReleased());
 }
+
+using TimestampPacketHelperTests = Test<DeviceFixture>;
+
+HWTEST_F(TimestampPacketHelperTests, givenTagNodesInMultiRootSyncContainerWhenProgramingDependensiecThenSemaforesAreProgrammed) {
+    StackVec<char, 4096> buffer(4096);
+    LinearStream cmdStream(buffer.begin(), buffer.size());
+    CsrDependencies deps;
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(0, pDevice->getMemoryManager());
+    TimestampPacketContainer container = {};
+    container.add(mockTagAllocator->getTag());
+    deps.multiRootTimeStampSyncContainer.push_back(&container);
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<FamilyType>(cmdStream, deps);
+    EXPECT_EQ(cmdStream.getUsed(), sizeof(typename FamilyType::MI_SEMAPHORE_WAIT));
+}
+
+HWTEST_F(TimestampPacketHelperTests, givenEmptyContainerMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) {
+    StackVec<char, 4096> buffer(4096);
+    LinearStream cmdStream(buffer.begin(), buffer.size());
+    CsrDependencies deps;
+    TimestampPacketContainer container = {};
+    deps.multiRootTimeStampSyncContainer.push_back(&container);
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<FamilyType>(cmdStream, deps);
+    EXPECT_EQ(cmdStream.getUsed(), 0u);
+}
+
+HWTEST_F(TimestampPacketHelperTests, givenEmptyMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) {
+    StackVec<char, 4096> buffer(4096);
+    LinearStream cmdStream(buffer.begin(), buffer.size());
+    CsrDependencies deps;
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<FamilyType>(cmdStream, deps);
+    EXPECT_EQ(cmdStream.getUsed(), 0u);
+}