diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp
index f94a55679c..1bbdfd00b3 100644
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -104,7 +104,9 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
             timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
             deferredTimestampPackets = std::make_unique<TimestampPacketContainer>();
         }
-
+        if (context && context->getRootDeviceIndices().size() > 1) {
+            deferredMultiRootSyncNodes = std::make_unique<TimestampPacketContainer>();
+        }
         auto deferCmdQBcsInitialization = hwInfo.featureTable.ftrBcsInfo.count() > 1u;
 
         if (DebugManager.flags.DeferCmdQBcsInitialization.get() != -1) {
@@ -1248,6 +1250,10 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri
     if (deferredTimestampPackets) {
         deferredTimestampPackets->swapNodes(nodesToRelease);
     }
+    TimestampPacketContainer multiRootSyncNodesToRelease;
+    if (deferredMultiRootSyncNodes.get()) {
+        deferredMultiRootSyncNodes->swapNodes(multiRootSyncNodesToRelease);
+    }
 
     waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
 
diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h
index 8d9a210b00..c55d0544cd 100644
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -439,6 +439,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     bool requiresCacheFlushAfterWalker = false;
 
     std::unique_ptr<TimestampPacketContainer> deferredTimestampPackets;
+    std::unique_ptr<TimestampPacketContainer> deferredMultiRootSyncNodes;
     std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
 
     struct BcsTimestampPacketContainers {
diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h
index 5679c2f616..d9225c3727 100644
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -391,7 +391,8 @@ class CommandQueueHw : public CommandQueue {
                         EventsRequest &eventsRequest,
                         EventBuilder &externalEventBuilder,
                         std::unique_ptr<PrintfHandler> &&printfHandler,
-                        CommandStreamReceiver *bcsCsr);
+                        CommandStreamReceiver *bcsCsr,
+                        TagNodeBase *multiRootDeviceSyncNode);
 
     CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces,
                                                 size_t surfaceCount,
@@ -422,7 +423,7 @@ class CommandQueueHw : public CommandQueue {
                                                  TimestampPacketDependencies &timestampPacketDependencies,
                                                  const EventsRequest &eventsRequest,
                                                  LinearStream *commandStream,
-                                                 uint32_t commandType, bool queueBlocked);
+                                                 uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync);
     void submitCacheFlush(Surface **surfaces,
                           size_t numSurfaces,
                           LinearStream *commandStream,
@@ -433,6 +434,8 @@ class CommandQueueHw : public CommandQueue {
     bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override;
 
     MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const;
+    MOCKABLE_VIRTUAL void processSignalMultiRootDeviceNode(LinearStream *commandStream,
+                                                           TagNodeBase *node);
 
   protected:
     MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
@@ -473,7 +476,7 @@ class CommandQueueHw : public CommandQueue {
             blockedCommandsData = std::make_unique<KernelOperation>(commandStream, *gpgpuCsr.getInternalAllocationStorage());
         } else {
             commandStream = &getCommandStream<GfxFamily, commandType>(*this, csrDependencies, profilingRequired, perfCountersRequired,
-                                                                      blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0);
+                                                                      blitEnqueue, multiDispatchInfo, surfaces, numSurfaces, isMarkerWithProfiling, eventsRequest.numEventsInWaitList > 0, eventsRequest.outEvent);
         }
         return commandStream;
     }
diff --git a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl
index f2f9bb590a..27b00dd488 100644
--- a/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl
+++ b/opencl/source/command_queue/command_queue_hw_xehp_and_later.inl
@@ -45,7 +45,7 @@ bool CommandQueueHw<Family>::isCacheFlushCommand(uint32_t commandType) const {
 }
 
 template <>
-LinearStream &getCommandStream<Family, CL_COMMAND_RESOURCE_BARRIER>(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) {
+LinearStream &getCommandStream<Family, CL_COMMAND_RESOURCE_BARRIER>(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo, Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) {
     size_t expectedSizeCS = 0;
     [[maybe_unused]] bool usePostSync = false;
     if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 554f70cbcd..a9bf384746 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -45,6 +45,7 @@
 #include <new>
 
 namespace NEO {
+struct RootDeviceEnvironment;
 
 template <typename GfxFamily>
 template <uint32_t commandType, size_t surfaceCount>
@@ -178,7 +179,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     BlitPropertiesContainer blitPropertiesContainer;
 
     if (this->context->getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, computeCommandStreamReceiver);
     }
 
     const bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
@@ -227,7 +228,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     }
 
     if (this->context->getRootDeviceIndices().size() > 1) {
-        TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
+        TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<GfxFamily>(commandStream, csrDeps);
     }
 
     if (enqueueWithBlitAuxTranslation) {
@@ -281,6 +282,18 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     } else if (isMarkerWithPostSyncWrite) {
         processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps);
     }
+    TagNodeBase *multiRootEventSyncStamp = nullptr;
+    if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1 &&
+        !(multiDispatchInfo.empty() && CL_COMMAND_MARKER != commandType)) {
+        multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode();
+        if (!blockQueue) {
+            this->getGpgpuCommandStreamReceiver().makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation());
+        }
+        processSignalMultiRootDeviceNode(&commandStream, multiRootEventSyncStamp);
+        if (CL_COMMAND_MARKER == commandType) {
+            flushDependenciesForNonKernelCommand = true;
+        }
+    }
 
     CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
     const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
@@ -383,13 +396,17 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                        eventsRequest,
                        eventBuilder,
                        std::move(printfHandler),
-                       nullptr);
+                       nullptr,
+                       multiRootEventSyncStamp);
     }
 
     if (deferredTimestampPackets.get()) {
         timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
         csrDeps.copyNodesToNewContainer(*deferredTimestampPackets);
     }
+    if (deferredMultiRootSyncNodes.get()) {
+        csrDeps.copyRootDeviceSyncNodesToNewContainer(*deferredMultiRootSyncNodes);
+    }
 
     commandStreamReceiverOwnership.unlock();
     queueOwnership.unlock();
@@ -498,7 +515,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
                                                                         const MultiDispatchInfo &multiDispatchInfo,
                                                                         TimestampPacketDependencies &timestampPacketDependencies,
                                                                         const EventsRequest &eventsRequest, LinearStream *commandStream,
-                                                                        uint32_t commandType, bool queueBlocked) {
+                                                                        uint32_t commandType, bool queueBlocked, TagNodeBase *multiRootDeviceEventSync) {
     auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);
 
     auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver,
@@ -511,7 +528,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
         blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
         blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
     }
-
+    blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync;
     auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
     blitProperties.outputTimestampPacket = currentTimestampPacketNode;
 
@@ -617,7 +634,20 @@ void CommandQueueHw<GfxFamily>::processDispatchForMarker(CommandQueue &commandQu
     HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
     getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation());
 }
-
+template <typename GfxFamily>
+void CommandQueueHw<GfxFamily>::processSignalMultiRootDeviceNode(LinearStream *commandStream,
+                                                                 TagNodeBase *node) {
+    // Emits a barrier with an ImmediateData post-sync write of the all-ones 64-bit value
+    // to the node's context-end address; dcFlushEnable is whatever getDcFlushEnable(true, ...) reports.
+    PipeControlArgs barrierArgs;
+    barrierArgs.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, device->getRootDeviceEnvironment());
+    const auto signalAddress = node->getGpuAddress() + node->getContextEndOffset();
+    const auto signalValue = std::numeric_limits<uint64_t>::max();
+    MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
+        *commandStream, PostSyncMode::ImmediateData,
+        signalAddress, signalValue,
+        getDevice().getHardwareInfo(), barrierArgs);
+}
 template <typename GfxFamily>
 void CommandQueueHw<GfxFamily>::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue,
                                                                             LinearStream *commandStream,
@@ -903,7 +933,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
     EventsRequest &eventsRequest,
     EventBuilder &externalEventBuilder,
     std::unique_ptr<PrintfHandler> &&printfHandler,
-    CommandStreamReceiver *bcsCsr) {
+    CommandStreamReceiver *bcsCsr,
+    TagNodeBase *multiRootDeviceSyncNode) {
 
     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
 
@@ -974,7 +1005,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
                                                          std::move(printfHandler),
                                                          preemptionMode,
                                                          multiDispatchInfo.peekMainKernel(),
-                                                         (uint32_t)multiDispatchInfo.size());
+                                                         (uint32_t)multiDispatchInfo.size(),
+                                                         multiRootDeviceSyncNode);
     }
     if (storeTimestampPackets) {
         command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
@@ -1281,10 +1313,14 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
     }
 
     TimestampPacketDependencies timestampPacketDependencies;
+    TagNodeBase *multiRootEventSyncStamp = nullptr;
     BlitPropertiesContainer blitPropertiesContainer;
     CsrDependencies csrDeps;
 
     eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
+    if (this->context->getRootDeviceIndices().size() > 1) {
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, bcsCsr);
+    }
     auto allocator = bcsCsr.getTimestampPacketAllocator();
 
     if (!blockQueue) {
@@ -1311,6 +1347,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
     if (eventBuilder.getEvent()) {
         eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
     }
+    if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
+        multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode();
+        bcsCsr.makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation());
+    }
 
     CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
 
@@ -1327,7 +1367,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
     }
 
     blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                    eventsRequest, gpgpuCommandStream, cmdType, blockQueue));
+                                                                    eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp));
 
     if (!blockQueue) {
         completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
@@ -1354,7 +1394,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
     updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
 
     if (blockQueue) {
-        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
+        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
 
         if (gpgpuSubmission) {
             if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
@@ -1365,6 +1405,9 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
 
     timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
     csrDeps.copyNodesToNewContainer(*deferredTimestampPackets);
+    if (deferredMultiRootSyncNodes.get()) {
+        csrDeps.copyRootDeviceSyncNodesToNewContainer(*deferredMultiRootSyncNodes);
+    }
     if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
         commandStreamReceiverOwnership.unlock();
     }
diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h
index e22858c851..a7ae002f44 100644
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@@ -88,7 +88,7 @@ class GpgpuWalkerHelper {
 template <typename GfxFamily>
 struct EnqueueOperation {
     using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
-    static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList);
+    static size_t getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent);
     static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo);
     static size_t getSizeRequiredForTimestampPacketWrite();
     static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue);
@@ -101,8 +101,8 @@ struct EnqueueOperation {
 template <typename GfxFamily, uint32_t eventType>
 LinearStream &getCommandStream(CommandQueue &commandQueue, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace,
                                bool reservePerfCounterCmdsSpace, bool blitEnqueue, const MultiDispatchInfo &multiDispatchInfo,
-                               Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList) {
-    size_t expectedSizeCS = EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList);
+                               Surface **surfaces, size_t numSurfaces, bool isMarkerWithProfiling, bool eventsInWaitList, cl_event *outEvent) {
+    size_t expectedSizeCS = EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(eventType, csrDeps, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, blitEnqueue, commandQueue, multiDispatchInfo, isMarkerWithProfiling, eventsInWaitList, outEvent); // outEvent is only forwarded; the size estimate reserves an extra post-sync barrier when the output event's context spans more than one root device
     return commandQueue.getCS(expectedSizeCS);
 }
 
diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl
index 66b6096611..6780ac7f57 100644
--- a/opencl/source/command_queue/gpgpu_walker_base.inl
+++ b/opencl/source/command_queue/gpgpu_walker_base.inl
@@ -166,7 +166,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(cons
 }
 
 template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist) {
+size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, cl_event *outEvent) {
     size_t expectedSizeCS = 0;
     auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
     auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper();
@@ -219,8 +219,14 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
     if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) {
         expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
     }
-
-    expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(csrDeps);
+    expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(csrDeps);
+    if (outEvent) {
+        auto pEvent = castToObjectOrAbort<Event>(*outEvent);
+        if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) {
+            expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(hwInfo, false);
+        }
+    }
+    expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);
 
     return expectedSizeCS;
 }
diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h
index 92c59afcc3..be8ae196c2 100644
--- a/opencl/source/command_queue/hardware_interface.h
+++ b/opencl/source/command_queue/hardware_interface.h
@@ -33,6 +33,7 @@ struct HardwareInterfaceWalkerArgs {
     size_t localWorkSizes[3] = {};
     TagNodeBase *hwTimeStamps = nullptr;
     TagNodeBase *hwPerfCounter = nullptr;
+    TagNodeBase *multiRootDeviceEventStamp = nullptr;
     TimestampPacketDependencies *timestampPacketDependencies = nullptr;
     TimestampPacketContainer *currentTimestampPacketNodes = nullptr;
     const Vec3<size_t> *numberOfWorkgroups = nullptr;
diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl
index 8f61426f56..ac19321ea9 100644
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@@ -133,6 +133,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
                                     walkerArgs.currentTimestampPacketNodes);
 
     walkerArgs.currentDispatchIndex = 0;
+
     for (auto &dispatchInfo : multiDispatchInfo) {
         dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
         walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);
diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp
index a725a72e65..7f0236f940 100644
--- a/opencl/source/context/context.cpp
+++ b/opencl/source/context/context.cpp
@@ -19,6 +19,7 @@
 #include "shared/source/memory_manager/memory_manager.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/utilities/heap_allocator.h"
+#include "shared/source/utilities/tag_allocator.h"
 
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/command_queue/command_queue.h"
@@ -49,7 +50,9 @@ Context::Context(
 
 Context::~Context() {
     gtpinNotifyContextDestroy((cl_context)this);
-
+    if (multiRootDeviceTimestampPacketAllocator.get() != nullptr) {
+        multiRootDeviceTimestampPacketAllocator.reset();
+    }
     if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled(this)) {
         smallBufferPoolAllocator.releaseSmallBufferPool();
     }
@@ -564,5 +567,15 @@ void Context::BufferPoolAllocator::releaseSmallBufferPool() {
     delete this->mainStorage;
     this->mainStorage = nullptr;
 }
+TagAllocatorBase *Context::getMultiRootDeviceTimestampPacketAllocator() {
+    return multiRootDeviceTimestampPacketAllocator.get(); // non-owning pointer; null while no allocator has been installed
+}
+void Context::setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr<TagAllocatorBase> &allocator) {
+    multiRootDeviceTimestampPacketAllocator = std::move(allocator); // takes over ownership, so the caller's unique_ptr is left empty
+}
+
+std::unique_lock<std::mutex> Context::obtainOwnershipForMultiRootDeviceAllocator() {
+    return std::unique_lock<std::mutex>(multiRootDeviceAllocatorMtx); // the mutex is locked on construction and released when the returned object is unlocked or destroyed
+}
 
 } // namespace NEO
diff --git a/opencl/source/context/context.h b/opencl/source/context/context.h
index f06597683a..45d359ecb5 100644
--- a/opencl/source/context/context.h
+++ b/opencl/source/context/context.h
@@ -37,6 +37,7 @@ class SharingFunctions;
 class SVMAllocsManager;
 class Program;
 class Platform;
+class TagAllocatorBase;
 
 template <>
 struct OpenCLObjectMapper<_cl_context> {
@@ -223,6 +224,9 @@ class Context : public BaseObject<_cl_context> {
     BufferPoolAllocator &getBufferPoolAllocator() {
         return this->smallBufferPoolAllocator;
     }
+    TagAllocatorBase *getMultiRootDeviceTimestampPacketAllocator();
+    std::unique_lock<std::mutex> obtainOwnershipForMultiRootDeviceAllocator();
+    void setMultiRootDeviceTimestampPacketAllocator(std::unique_ptr<TagAllocatorBase> &allocator);
 
   protected:
     struct BuiltInKernel {
@@ -263,6 +267,8 @@ class Context : public BaseObject<_cl_context> {
     uint32_t maxRootDeviceIndex = std::numeric_limits<uint32_t>::max();
     cl_bool preferD3dSharedResources = 0u;
     ContextType contextType = ContextType::CONTEXT_TYPE_DEFAULT;
+    std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator;
+    std::mutex multiRootDeviceAllocatorMtx;
 
     bool interopUserSync = false;
     bool resolvesRequiredInKernels = false;
diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp
index 56c3c1647a..c257518913 100644
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -884,7 +884,6 @@ TagNodeBase *Event::getHwTimeStampNode() {
 }
 
 TagNodeBase *Event::getHwPerfCounterNode() {
-
     if (!perfCounterNode && cmdQueue->getPerfCounters()) {
         const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters()));
         perfCounterNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventPerfCountAllocator(gpuReportSize)->getTag();
@@ -892,11 +891,27 @@ TagNodeBase *Event::getHwPerfCounterNode() {
     return perfCounterNode;
 }
 
+TagNodeBase *Event::getMultiRootTimestampSyncNode() { // requests a tag from the context-wide allocator, records it in this event's container and returns it
+    auto lock = getContext()->obtainOwnershipForMultiRootDeviceAllocator(); // guards the lazy creation of the context's allocator below
+    if (getContext()->getMultiRootDeviceTimestampPacketAllocator() == nullptr) {
+        auto allocator = cmdQueue->getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(getContext()->getRootDeviceIndices()); // created through this queue's GPGPU CSR for the context's root device indices
+        getContext()->setMultiRootDeviceTimestampPacketAllocator(allocator); // the setter moves from its argument, so 'allocator' is empty afterwards
+    }
+    lock.unlock(); // released before the tag request: getTag() below runs without the context mutex held
+    if (multiRootDeviceTimestampPacketContainer.get() == nullptr) {
+        multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>(); // container is created on first use
+    }
+    multiRootTimeStampSyncNode = getContext()->getMultiRootDeviceTimestampPacketAllocator()->getTag(); // NOTE(review): every call requests another tag and overwrites the stored pointer - confirm callers invoke this once per event
+    multiRootDeviceTimestampPacketContainer->add(multiRootTimeStampSyncNode);
+    return multiRootTimeStampSyncNode;
+}
+
 void Event::addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer) {
     timestampPacketContainer->assignAndIncrementNodesRefCounts(inputTimestampPacketContainer);
 }
 
 TimestampPacketContainer *Event::getTimestampPacketNodes() const { return timestampPacketContainer.get(); }
+TimestampPacketContainer *Event::getMultiRootDeviceTimestampPacketNodes() const { return multiRootDeviceTimestampPacketContainer.get(); } // non-owning; nullptr while the container has not been created
 
 bool Event::checkUserEventDependencies(cl_uint numEventsInWaitList, const cl_event *eventWaitList) {
     bool userEventsDependencies = false;
diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h
index a502fe415d..dd7ed18471 100644
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@@ -115,6 +115,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
 
     void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer);
     TimestampPacketContainer *getTimestampPacketNodes() const;
+    TimestampPacketContainer *getMultiRootDeviceTimestampPacketNodes() const;
 
     bool isPerfCountersEnabled() const {
         return perfCountersEnabled;
@@ -129,6 +130,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     }
 
     TagNodeBase *getHwPerfCounterNode();
+    TagNodeBase *getMultiRootTimestampSyncNode();
 
     std::unique_ptr<FlushStampTracker> flushStamp;
     std::atomic<TaskCountType> taskLevel;
@@ -384,8 +386,10 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     bool perfCountersEnabled;
     TagNodeBase *timeStampNode = nullptr;
     TagNodeBase *perfCounterNode = nullptr;
+    TagNodeBase *multiRootTimeStampSyncNode = nullptr;
     std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
     // number of events this event depends on
+    std::unique_ptr<TimestampPacketContainer> multiRootDeviceTimestampPacketContainer;
     std::atomic<int> parentCount;
     // event parents
     std::vector<Event *> parentEvents;
diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp
index 52dcaa6b1a..e34b1ffb00 100644
--- a/opencl/source/helpers/properties_helper.cpp
+++ b/opencl/source/helpers/properties_helper.cpp
@@ -26,7 +26,6 @@ namespace NEO {
 void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
     auto csrOwnership = dependentCsr.obtainUniqueOwnership();
     dependentCsr.updateTagFromWait();
-    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
 }
 
 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
@@ -68,23 +67,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
     }
 }
 
-void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
+void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const { // collects multi-root sync containers of wait-list events whose queue is on a different root device than currentCsr
     for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
         auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
         if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) {
             continue;
         }
-
         if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
+            auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes(); // non-owning pointer into the event; may be nullptr
+            if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) { // the event holds no multi-root sync node, so no dependency is recorded and the producing CSR is not flushed
+                continue;
+            }
             auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
             if (!dependentCsr.isLatestTaskCountFlushed()) {
                 flushDependentCsr(dependentCsr, csrDeps);
-            } else {
-                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
             }
-
-            auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
-            currentCsr.getResidencyAllocations().push_back(graphicsAllocation);
+            csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer); // stores the pointer only; the container stays owned by the event
         }
     }
 }
diff --git a/opencl/source/helpers/properties_helper.h b/opencl/source/helpers/properties_helper.h
index 37b2c6564b..5a653b18e5 100644
--- a/opencl/source/helpers/properties_helper.h
+++ b/opencl/source/helpers/properties_helper.h
@@ -25,7 +25,7 @@ struct EventsRequest {
         : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {}
 
     void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const;
-    void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
+    void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
     void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const;
 
     cl_uint numEventsInWaitList;
diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp
index e91ff96ec8..75dbcf5661 100644
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -120,10 +120,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate
 
 CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                                            bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
+                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount,
+                                           TagNodeBase *multiRootDeviceSyncNode) // may be nullptr: submit() null-checks it and the updated tests pass nullptr
     : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
       commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel),
-      kernelCount(kernelCount), preemptionMode(preemptionMode) {
+      kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) { // the raw pointer is copied as-is into the member
     UNRECOVERABLE_IF(nullptr == this->kernel);
     kernel->incRefInternal();
 }
@@ -165,6 +166,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
         printfHandler->makeResident(commandStreamReceiver);
     }
     makeTimestampPacketsResident(commandStreamReceiver);
+    if (multiRootDeviceSyncNode != nullptr) {
+        commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation());
+    }
 
     if (kernelOperation->blitPropertiesContainer.size() > 0) {
         CsrDependencies csrDeps;
@@ -217,7 +221,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
         false);                                                                           // stateCacheInvalidation
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
     const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
@@ -310,7 +314,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() {
     blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr);
+        eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr);
     }
 
     const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
@@ -393,7 +397,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term
         false);                                                                // stateCacheInvalidation
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
     const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h
index 68f378c693..c3ebd01b6c 100644
--- a/opencl/source/helpers/task_information.h
+++ b/opencl/source/helpers/task_information.h
@@ -127,7 +127,7 @@ class CommandComputeKernel : public Command {
   public:
     CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                          bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount);
+                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode);
 
     ~CommandComputeKernel() override;
 
@@ -146,6 +146,7 @@ class CommandComputeKernel : public Command {
     Kernel *kernel;
     uint32_t kernelCount;
     PreemptionMode preemptionMode;
+    TagNodeBase *multiRootDeviceSyncNode;
 };
 
 class CommandWithoutKernel : public Command {
diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
index fa064de0cd..c369349555 100644
--- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp
@@ -19,6 +19,7 @@
 #include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
 #include "opencl/test/unit_test/fixtures/buffer_fixture.h"
 #include "opencl/test/unit_test/fixtures/image_fixture.h"
+#include "opencl/test/unit_test/helpers/cl_hw_parse.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
@@ -193,6 +194,7 @@ class MockCommandStreamReceiverWithFailingFlushBatchedSubmission : public MockCo
 template <typename GfxFamily>
 struct MockCommandQueueHwWithOverwrittenCsr : public CommandQueueHw<GfxFamily> {
     using CommandQueueHw<GfxFamily>::CommandQueueHw;
+    using CommandQueueHw<GfxFamily>::timestampPacketContainer; // re-declares the base member in this struct's public section so tests can reach it through the mock
     MockCommandStreamReceiverWithFailingFlushBatchedSubmission *csr;
     CommandStreamReceiver &getGpgpuCommandStreamReceiver() const override { return *csr; }
 };
diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp
index 2ea8f48fed..7fd8c44ef5 100644
--- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp
@@ -23,6 +23,7 @@
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
+#include "opencl/test/unit_test/mocks/mock_mdi.h"
 
 using namespace NEO;
 
@@ -972,4 +973,4 @@ HWTEST_F(CommandQueueHwTest, GivenBuiltinKernelWhenBuiltinDispatchInfoBuilderIsP
     EXPECT_EQ(builder.paramsToUse.elws.x, dispatchInfo->getEnqueuedWorkgroupSize().x);
     EXPECT_EQ(builder.paramsToUse.offset.x, dispatchInfo->getOffset().x);
     EXPECT_EQ(builder.paramsToUse.kernel, dispatchInfo->getKernel());
-}
+}
diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
index c43df32c18..a902334eb6 100644
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@@ -31,6 +31,7 @@
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_buffer.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
+#include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_mdi.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
index 3b36aa347a..dbc921929b 100644
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp
@@ -557,11 +557,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp
     MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector<Kernel *>({kernel1.mockKernel, kernel2.mockKernel}));
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     size_t sizeWithDisabled = cmdQ.requestedCmdStreamSize;
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     size_t sizeWithEnabled = cmdQ.requestedCmdStreamSize;
 
     size_t additionalSize = 0u;
@@ -669,7 +669,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenAutoLocal
     EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer());
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false,
-                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false);
+                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr);
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
     expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize);
     EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS);
@@ -738,7 +738,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
     EXPECT_EQ((uint32_t)(expectedKernelStartOffset), idd.getKernelStartPointer());
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, CsrDependencies(), false, false,
-                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false);
+                                                                               false, *cmdQ.get(), multiDispatchInfo, false, false, nullptr);
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
     expectedSizeCS = alignUp(expectedSizeCS, MemoryConstants::cacheLineSize);
     EXPECT_GE(expectedSizeCS, usedAfterCS - usedBeforeCS);
diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
index edd2fb1601..f06bff02e4 100644
--- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
@@ -24,6 +24,7 @@
 #include "opencl/source/mem_obj/buffer.h"
 #include "opencl/test/unit_test/fixtures/dispatch_flags_fixture.h"
 #include "opencl/test/unit_test/fixtures/enqueue_handler_fixture.h"
+#include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
@@ -234,7 +235,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg
     Surface *surfaces[] = {nullptr};
     mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                              blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr);
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr, nullptr);
     EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue);
 }
 
@@ -267,7 +268,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl
     Surface *surfaces[] = {nullptr};
     mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                              blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation());
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr);
     EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue);
     EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation);
     EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation);
@@ -351,7 +352,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK
 
     timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
     BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
+                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr);
 
     BlitPropertiesContainer blitPropertiesContainer;
     blitPropertiesContainer.push_back(blitProperties);
@@ -390,7 +391,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern
     CsrDependencies csrDeps;
 
     BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
+                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr);
 
     BlitPropertiesContainer blitPropertiesContainer;
     blitPropertiesContainer.push_back(blitProperties);
@@ -432,7 +433,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA
     mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true, bcsCsr);
     timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
     BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
-                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
+                                                                            eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr);
     BlitPropertiesContainer blitPropertiesContainer;
     blitPropertiesContainer.push_back(blitProperties);
 
@@ -478,7 +479,7 @@ HWTEST_F(DispatchFlagsTests, givenMockKernelWhenSettingAdditionalKernelExecInfoT
     std::vector<Surface *> v;
 
     pKernel->setAdditionalKernelExecInfo(123u);
-    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1));
+    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr));
     cmd->submit(1u, false);
     EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, 123u);
 
@@ -541,4 +542,41 @@ HWTEST_F(EnqueueHandlerTest, givenTimestampPacketWriteDisabledAndCommandWithCach
     EXPECT_EQ(nullptr, container);
     clReleaseEvent(event);
 }
-} // namespace NEO
+
+template <typename FamilyType>
+class MockCommandQueueWithProcessSignal : public MockCommandQueueHw<FamilyType> { // mock queue whose processSignalMultiRootDeviceNode override only counts invocations
+    using MockCommandQueueHw<FamilyType>::MockCommandQueueHw; // inherit constructors; their accessibility follows the base class, not this private section
+
+  public:
+    void processSignalMultiRootDeviceNode(LinearStream *commandStream,
+                                          TagNodeBase *node) override {
+        processSignalMultiRootDeviceNodeCalled++; // both arguments are ignored; the base implementation is not called
+    }
+    uint32_t processSignalMultiRootDeviceNodeCalled = 0; // number of times the override above has run
+};
+
+using EnqueueHandlerMultiRootSync = MultiRootDeviceFixture;
+
+HWTEST_F(EnqueueHandlerMultiRootSync, givenOutEventInMultiRootContextWhenEnqueuehandlerForMapOperationCalledThenMultiRootTagIsNotSignaled) {
+    auto mockCmdQ = std::make_unique<MockCommandQueueWithProcessSignal<FamilyType>>(context.get(), device1, nullptr); // queue on the fixture's context and device1, using the call-counting mock
+    auto event = std::make_unique<MockEvent<Event>>(context.get(), nullptr, 0, 0, 0);
+    cl_event clEvent = event.get();
+
+    MultiDispatchInfo multiDispatch;
+    mockCmdQ->template enqueueHandler<CL_COMMAND_SVM_MAP>(nullptr, 0, false, multiDispatch, 0, nullptr, &clEvent); // NOTE(review): &clEvent is presumably the out-event slot — confirm against enqueueHandler's signature
+    EXPECT_EQ(mockCmdQ->processSignalMultiRootDeviceNodeCalled, 0u); // SVM map: the signal hook must not have been invoked
+    clReleaseEvent(clEvent); // NOTE(review): presumably releases the event written back by enqueueHandler, not the MockEvent still owned by `event` — confirm
+}
+
+HWTEST_F(EnqueueHandlerMultiRootSync, givenOutEventInMultiRootContextWhenEnqueuehandlerForMarkerOperationCalledThenMultiRootTagIsSignaled) {
+    auto mockCmdQ = std::make_unique<MockCommandQueueWithProcessSignal<FamilyType>>(context.get(), device1, nullptr); // queue on the fixture's context and device1, using the call-counting mock
+    auto event = std::make_unique<MockEvent<Event>>(context.get(), nullptr, 0, 0, 0);
+    cl_event clEvent = event.get();
+
+    MultiDispatchInfo multiDispatch;
+    mockCmdQ->template enqueueHandler<CL_COMMAND_MARKER>(nullptr, 0, false, multiDispatch, 0, nullptr, &clEvent); // NOTE(review): &clEvent is presumably the out-event slot — confirm against enqueueHandler's signature
+    EXPECT_EQ(mockCmdQ->processSignalMultiRootDeviceNodeCalled, 1u); // marker: the signal hook must have been invoked exactly once
+    clReleaseEvent(clEvent); // NOTE(review): presumably releases the event written back by enqueueHandler, not the MockEvent still owned by `event` — confirm
+}
+
+} // namespace NEO
diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp
index f51f97a02b..f3d47baf7a 100644
--- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp
@@ -54,7 +54,7 @@ HWTEST2_F(DispatchFlagsTests, whenSubmittingKernelWithAdditionalKernelExecInfoTh
     std::vector<Surface *> v;
 
     pKernel->setAdditionalKernelExecInfo(AdditionalKernelExecInfo::DisableOverdispatch);
-    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1));
+    std::unique_ptr<CommandComputeKernel> cmd(new CommandComputeKernel(*mockCmdQ.get(), blockedCommandsData, v, false, false, false, std::move(printfHandler), PreemptionMode::Disabled, pKernel, 1, nullptr));
     cmd->submit(1u, false);
     EXPECT_EQ(mockCsr->passedDispatchFlags.additionalKernelExecInfo, AdditionalKernelExecInfo::DisableOverdispatch);
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
index 2565649421..390343b2be 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@@ -2005,10 +2005,10 @@ HWTEST_F(PauseOnGpuTests, givenGpuScratchWriteEnabledWhenEstimatingCommandStream
     dispatchInfo.setKernel(mockKernel.mockKernel);
     multiDispatchInfo.push(dispatchInfo);
 
-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     DebugManager.flags.GpuScratchRegWriteAfterWalker.set(1);
 
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
 
     EXPECT_EQ(baseCommandStreamSize + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM), extendedCommandStreamSize);
 }
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
index 49871b6595..39cf18a236 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
@@ -1014,8 +1014,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW
     dispatchInfo.setKernel(mockKernel.mockKernel);
     multiDispatchInfo.push(dispatchInfo);
 
-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr);
 
     EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size + MemorySynchronizationCommands<FamilyType>::getSizeForSingleBarrier(false), extendedCommandStreamSize);
 }
@@ -1033,8 +1033,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMulti
     dispatchInfo.setKernel(mockKernel.mockKernel);
     multiDispatchInfo.push(dispatchInfo);
 
-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false, nullptr);
 
     EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size + ImplicitScalingDispatch<FamilyType>::getBarrierSize(csr.peekHwInfo(), false, false), extendedCommandStreamSize);
 }
@@ -1047,8 +1047,8 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithWait
     dispatchInfo.setKernel(mockKernel.mockKernel);
     multiDispatchInfo.push(dispatchInfo);
 
-    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
-    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true);
+    auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, true, nullptr);
 
     EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size, extendedCommandStreamSize);
 }
diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
index 3587e5e578..f0d5895991 100644
--- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
@@ -23,6 +23,7 @@
 #include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h"
 #include "opencl/test/unit_test/fixtures/image_fixture.h"
 #include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_event.h"
 
 using namespace NEO;
 
@@ -96,7 +97,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenFillingBufferThenHeapsAndCommandBufferCo
     auto usedAfterSSH = ssh.getUsed();
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_FILL_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
     auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -149,7 +150,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenCopyingBufferThenHeapsAndCommandBufferCo
     auto usedAfterSSH = ssh.getUsed();
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
     auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -203,7 +204,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferNonBlockingThenHeapsAndComm
     auto usedAfterSSH = ssh.getUsed();
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
     auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -258,7 +259,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenReadingBufferBlockingThenThenHeapsAndCom
     auto usedAfterSSH = ssh.getUsed();
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
     auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -313,7 +314,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferNonBlockingThenHeapsAndComm
     auto usedAfterSSH = ssh.getUsed();
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
     auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -365,7 +366,7 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand
     auto usedAfterSSH = ssh.getUsed();
 
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, CsrDependencies(), false, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredDSH(multiDispatchInfo);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredIOH(multiDispatchInfo);
     auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -380,6 +381,68 @@ HWTEST_F(GetSizeRequiredBufferTest, WhenWritingBufferBlockingThenHeapsAndCommand
     EXPECT_GE(expectedSizeSSH, usedAfterSSH - usedBeforeSSH);
 }
 
+HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForSingleDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) {
+    // Context built from a single root device: passing an output event must not change the estimated command stream size.
+    UltClDeviceFactory deviceFactory{1, 0};
+    DebugManager.flags.EnableMultiRootDeviceContexts.set(true); // NOTE(review): no state restore is visible in this test - confirm the fixture resets debug flags
+    cl_device_id rootDeviceIds[] = {deviceFactory.rootDevices[0]};
+    MockContext singleRootContext(ClDeviceVector(rootDeviceIds, 1));
+
+    MockKernelWithInternals mockKernel(*singleRootContext.getDevices()[0]);
+    DispatchInfo dispatchInfo;
+    dispatchInfo.setKernel(mockKernel.mockKernel);
+    MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
+    multiDispatchInfo.push(dispatchInfo);
+
+    auto outputEvent = std::make_unique<MockEvent<Event>>(&singleRootContext, nullptr, 0, 0, 0);
+    cl_event outputEventHandle = outputEvent.get();
+    auto sizeWithoutEvent = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto sizeWithEvent = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &outputEventHandle);
+    EXPECT_EQ(sizeWithoutEvent, sizeWithEvent);
+}
+
+HWTEST_F(GetSizeRequiredBufferTest, GivenUserEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsNotAdded) {
+    // Context built from two root devices: a user event passed as the output event must not change the estimated size.
+    UltClDeviceFactory deviceFactory{2, 0};
+    DebugManager.flags.EnableMultiRootDeviceContexts.set(true); // NOTE(review): no state restore is visible in this test - confirm the fixture resets debug flags
+    cl_device_id rootDeviceIds[] = {deviceFactory.rootDevices[0], deviceFactory.rootDevices[1]};
+    MockContext multiRootContext(ClDeviceVector(rootDeviceIds, 2));
+
+    MockKernelWithInternals mockKernel(*multiRootContext.getDevices()[0]);
+    DispatchInfo dispatchInfo;
+    dispatchInfo.setKernel(mockKernel.mockKernel);
+    MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
+    multiDispatchInfo.push(dispatchInfo);
+
+    auto userEvent = std::make_unique<UserEvent>(&multiRootContext);
+    cl_event userEventHandle = userEvent.get();
+    auto sizeWithoutEvent = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto sizeWithEvent = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &userEventHandle);
+
+    EXPECT_EQ(sizeWithoutEvent, sizeWithEvent);
+}
+
+HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForMultiDeviceContextWhenCalculatingCSSizeThenExtraPipeControlIsAdded) {
+    // Context built from two root devices: a non-user output event must grow the estimate by one barrier with post-sync operation.
+    UltClDeviceFactory deviceFactory{2, 0};
+    DebugManager.flags.EnableMultiRootDeviceContexts.set(true); // NOTE(review): no state restore is visible in this test - confirm the fixture resets debug flags
+    cl_device_id rootDeviceIds[] = {deviceFactory.rootDevices[0], deviceFactory.rootDevices[1]};
+    MockContext multiRootContext(ClDeviceVector(rootDeviceIds, 2));
+
+    MockKernelWithInternals mockKernel(*multiRootContext.getDevices()[0]);
+    DispatchInfo dispatchInfo;
+    dispatchInfo.setKernel(mockKernel.mockKernel);
+    MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
+    multiDispatchInfo.push(dispatchInfo);
+
+    auto outputEvent = std::make_unique<MockEvent<Event>>(&multiRootContext, nullptr, 0, 0, 0);
+    cl_event outputEventHandle = outputEvent.get();
+    auto sizeWithoutEvent = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
+    auto sizeWithEvent = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, &outputEventHandle);
+    auto barrierSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(multiRootContext.getDevices()[0]->getHardwareInfo(), false);
+    EXPECT_EQ(sizeWithoutEvent + barrierSize, sizeWithEvent);
+}
+
 HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSizeIsComputedThenItIsProperlyAligned) {
     auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer,
                                                                             pCmdQ->getClDevice());
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
index 1a3c2e8517..10a032bbf4 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
@@ -1904,7 +1904,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke
     blockedCommandsData->setHeaps(dsh, ioh, ssh);
 
     std::vector<Surface *> surfaces;
-    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1));
+    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr));
     event->submitCommand(false);
 
     EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired);
@@ -1949,7 +1949,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenInitializeBc
     auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
 
     std::vector<Surface *> surfaces;
-    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1));
+    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1, nullptr));
     event->submitCommand(false);
     EXPECT_FALSE(pCmdQ->isCsrLocked);
 }
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
index e04eac63b8..b9d282d5e1 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
@@ -8,6 +8,7 @@
 #include "shared/source/command_stream/wait_status.h"
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/test/common/mocks/mock_command_stream_receiver.h"
+#include "shared/test/common/mocks/mock_timestamp_container.h"
 #include "shared/test/common/mocks/ult_device_factory.h"
 #include "shared/test/common/test_macros/hw_test.h"
 
@@ -15,6 +16,7 @@
 #include "opencl/source/event/user_event.h"
 #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
 #include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_event.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
 #include "opencl/test/unit_test/test_macros/test_checks_ocl.h"
@@ -46,12 +48,18 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu
     MockGraphicsAllocation svmAlloc(svmPtr, svmSize);
 
     Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    auto node1 = event1.getMultiRootTimestampSyncNode();
     Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
     Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    auto node3 = event3.getMultiRootTimestampSyncNode();
     Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    auto node4 = event4.getMultiRootTimestampSyncNode();
     Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    auto node5 = event5.getMultiRootTimestampSyncNode();
     UserEvent userEvent1(&pCmdQ1->getContext());
+    userEvent1.getMultiRootTimestampSyncNode();
     UserEvent userEvent2(&pCmdQ2->getContext());
+    userEvent2.getMultiRootTimestampSyncNode();
 
     userEvent1.setStatus(CL_COMPLETE);
     userEvent2.setStatus(CL_COMPLETE);
@@ -88,12 +96,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu
         EXPECT_EQ(2u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
     }
 
     {
@@ -116,12 +124,12 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMu
         EXPECT_EQ(2u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
     }
     alignedFree(svmPtr);
 }
@@ -148,17 +156,24 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
     cl_device_id devices[] = {device1, device2, device3};
 
     auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 3), false);
-
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(context->getRootDeviceIndices(), device1->getExecutionEnvironment()->memoryManager.get(), 10u);
+    std::unique_ptr<TagAllocatorBase> uniquePtr(mockTagAllocator.release());
+    context->setMultiRootDeviceTimestampPacketAllocator(uniquePtr);
     auto pCmdQ1 = context->getSpecialQueue(1u);
     auto pCmdQ2 = context->getSpecialQueue(2u);
     auto pCmdQ3 = context->getSpecialQueue(3u);
 
     Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    auto node1 = event1.getMultiRootTimestampSyncNode();
     Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
     Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    auto node3 = event3.getMultiRootTimestampSyncNode();
     Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    auto node4 = event4.getMultiRootTimestampSyncNode();
     Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21);
+    auto node5 = event5.getMultiRootTimestampSyncNode();
     Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    auto node6 = event6.getMultiRootTimestampSyncNode();
     UserEvent userEvent1(&pCmdQ1->getContext());
     UserEvent userEvent2(&pCmdQ2->getContext());
 
@@ -191,16 +206,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
         EXPECT_EQ(3u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[2]));
-        EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node6->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress());
     }
 
     {
@@ -216,16 +231,16 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
         EXPECT_EQ(3u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[2]));
-        EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ3->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd2->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd2->getSemaphoreGraphicsAddress());
     }
 
     {
@@ -250,8 +265,8 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
         EXPECT_EQ(1u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
     }
 }
 
@@ -287,11 +302,16 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro
     using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
 
     Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    event1.getMultiRootTimestampSyncNode();
     Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
     Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
+    event3.getMultiRootTimestampSyncNode();
     Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    event4.getMultiRootTimestampSyncNode();
     Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    event5.getMultiRootTimestampSyncNode();
     Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    event6.getMultiRootTimestampSyncNode();
     UserEvent userEvent1(&pCmdQ1->getContext());
     UserEvent userEvent2(&pCmdQ2->getContext());
 
@@ -317,10 +337,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro
 
         EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
         CsrDependencies csrDeps;
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver());
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ1->getGpgpuCommandStreamReceiver());
 
-        EXPECT_EQ(0u, csrDeps.taskCountContainer.size());
-        EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
+        EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size());
+        EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<FamilyType>(csrDeps));
     }
 
     {
@@ -343,10 +363,10 @@ HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnviro
 
         EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
         CsrDependencies csrDeps;
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
 
-        EXPECT_EQ(3u, csrDeps.taskCountContainer.size());
-        EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
+        EXPECT_EQ(3u, csrDeps.multiRootTimeStampSyncContainer.size());
+        EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<FamilyType>(csrDeps));
     }
 }
 
@@ -377,6 +397,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
 
     cl_event outputEvent2{};
 
+    auto currentCsUsedCmdq1 = pCmdQ1->getCS(0).getUsed();
     pCmdQ2->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
                               1,
                               &outputEvent1,
@@ -400,14 +421,12 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
         nullptr);
     {
         HardwareParse csHwParser;
-        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
+        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0), currentCsUsedCmdq1);
         auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
 
         EXPECT_EQ(0u, semaphores.size());
     }
     userEvent1.setStatus(CL_COMPLETE);
-    event1->release();
-    event2->release();
     pCmdQ1->finish();
     pCmdQ2->finish();
     {
@@ -418,7 +437,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
         EXPECT_EQ(1u, semaphores.size());
         auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
         EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress());
     }
     {
         HardwareParse csHwParser;
@@ -427,9 +446,11 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
 
         EXPECT_EQ(1u, semaphores.size());
         auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(0u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0)->getContextEndAddress(0u)), semaphoreCmd->getSemaphoreGraphicsAddress());
     }
+    event1->release();
+    event2->release();
     buffer->release();
 }
 
@@ -459,14 +480,14 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
     char hostPtr[MemoryConstants::pageSize]{};
 
     cl_event outputEvent2{};
-
+    auto currentCsUsed = pCmdQ1->getCS(0).getUsed();
     pCmdQ1->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
                               1,
                               &outputEvent1,
                               &outputEvent2);
     {
         HardwareParse csHwParser;
-        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
+        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0), currentCsUsed);
         auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
 
         EXPECT_EQ(0u, semaphores.size());
@@ -483,7 +504,7 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
         nullptr);
     {
         HardwareParse csHwParser;
-        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
+        csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0), currentCsUsed);
         auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
 
         EXPECT_EQ(0u, semaphores.size());
@@ -591,9 +612,6 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
         EXPECT_EQ(0u, semaphores.size());
     }
     userEvent1.setStatus(CL_COMPLETE);
-    event1->release();
-    event2->release();
-    event3->release();
     pCmdQ1->finish();
     pCmdQ2->finish();
 
@@ -605,7 +623,8 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
         EXPECT_EQ(1u, semaphores.size());
         auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
         EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
+        auto node = event2->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0);
+        EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd->getSemaphoreGraphicsAddress());
     }
     {
         HardwareParse csHwParser;
@@ -621,8 +640,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
 
         EXPECT_EQ(2u, semaphores.size());
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(0u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        auto node = event1->getMultiRootDeviceTimestampPacketNodes()->peekNodes().at(0);
+        EXPECT_EQ(node->getGpuAddress() + node->getContextEndOffset(), semaphoreCmd0->getSemaphoreGraphicsAddress());
     }
     {
         HardwareParse csHwParser;
@@ -631,6 +651,9 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
 
         EXPECT_LE(1u, semaphores.size());
     }
+    event1->release();
+    event2->release();
+    event3->release();
     buffer->release();
     pCmdQ1->release();
     pCmdQ2->release();
@@ -962,3 +985,73 @@ HWTEST_F(BcsCrossDeviceMigrationTests, givenBufferWithMultiStorageWhenEnqueueRea
 
     EXPECT_EQ(buffer.get(), cmdQueue->migrateMultiGraphicsAllocationsReceivedOperationParams.srcMemObj);
 }
+
+HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) {
+    // No event below has a multi-root sync node requested on it, so the multi-root sync container must stay empty.
+    Event queue1EventA(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    Event queuelessEvent(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
+    Event queue1EventB(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
+    Event queue1EventC(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    Event queue2EventA(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    Event queue2EventB(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    UserEvent completedUserEvent1(&pCmdQ1->getContext());
+    UserEvent completedUserEvent2(&pCmdQ2->getContext());
+    completedUserEvent1.setStatus(CL_COMPLETE);
+    completedUserEvent2.setStatus(CL_COMPLETE);
+
+    cl_event waitList[] =
+        {
+            &queue1EventA,
+            &queuelessEvent,
+            &queue1EventB,
+            &queue1EventC,
+            &queue2EventA,
+            &queue2EventB,
+            &completedUserEvent1,
+        };
+    cl_uint waitListSize = sizeof(waitList) / sizeof(waitList[0]);
+
+    // Dependencies are collected against the command stream receiver of pCmdQ2.
+    EventsRequest request(waitListSize, waitList, nullptr);
+    CsrDependencies collectedDeps;
+    request.fillCsrDependenciesForRootDevices(collectedDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
+
+    EXPECT_EQ(0u, collectedDeps.multiRootTimeStampSyncContainer.size());
+}
+HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyDoNotHaveMultiRootSyncNodeContainersThenCsrDepsDoesNotHaveAnyMultiRootSyncContainer) {
+    // Queue-bound events get default-constructed multi-root containers (no nodes are added here), so nothing should be collected.
+    MockEvent<Event> event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    event1.multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    MockEvent<Event> event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
+    MockEvent<Event> event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    event3.multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    MockEvent<Event> event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    event4.multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    MockEvent<Event> event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    event5.multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    MockEvent<Event> event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    event6.multiRootDeviceTimestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    UserEvent userEvent1(&pCmdQ1->getContext());
+
+    userEvent1.setStatus(CL_COMPLETE);
+
+    {
+        cl_event eventWaitList[] =
+            {
+                &event1,
+                &event2,
+                &event3,
+                &event4,
+                &event5,
+                &event6,
+                &userEvent1,
+            };
+        cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
+
+        EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
+        CsrDependencies csrDeps;
+        eventsRequest.fillCsrDependenciesForRootDevices(csrDeps, pCmdQ2->getGpgpuCommandStreamReceiver());
+
+        EXPECT_EQ(0u, csrDeps.multiRootTimeStampSyncContainer.size());
+    }
+}
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp
index 4018e2ac6e..99dcefb0be 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp
@@ -12,6 +12,7 @@
 #include "shared/source/helpers/blit_commands_helper.h"
 #include "shared/source/helpers/constants.h"
 #include "shared/source/helpers/logical_state_helper.h"
+#include "shared/source/os_interface/device_factory.h"
 #include "shared/source/os_interface/hw_info_config.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/engine_descriptor_helper.h"
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp
index b0bb0304ce..42758e1498 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp
@@ -1793,4 +1793,4 @@ HWTEST_F(BcsTests, givenHostPtrToImageWhenBlitBufferIsCalledThenBlitCmdIsFound)
     hwParser.parseCommands<FamilyType>(csr.commandStream, 0);
     auto cmdIterator = find<typename FamilyType::XY_BLOCK_COPY_BLT *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
     EXPECT_NE(hwParser.cmdList.end(), cmdIterator);
-}
+}
\ No newline at end of file
diff --git a/opencl/test/unit_test/event/event_builder_tests.cpp b/opencl/test/unit_test/event/event_builder_tests.cpp
index 234248a617..ff0cda16de 100644
--- a/opencl/test/unit_test/event/event_builder_tests.cpp
+++ b/opencl/test/unit_test/event/event_builder_tests.cpp
@@ -79,7 +79,7 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) {
       public:
         using CommandComputeKernel::eventsWaitlist;
         MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
     };
 
     auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
@@ -129,7 +129,7 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA
       public:
         using CommandComputeKernel::eventsWaitlist;
         MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
     };
 
     auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp
index 4d58140ed5..9ef77c60a3 100644
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@@ -486,7 +486,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandFalseWhenSubmittingCommandsThenRefA
 
     PreemptionMode preemptionMode = pDevice->getPreemptionMode();
     v.push_back(bufferSurf);
-    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr);
     event.setCommand(std::unique_ptr<Command>(cmd));
 
     auto taskLevelBefore = csr.peekTaskLevel();
@@ -529,7 +529,7 @@ TEST_F(InternalsEventTest, GivenSubmitCommandTrueWhenSubmittingCommandsThenRefAp
     NullSurface *surface = new NullSurface;
     v.push_back(surface);
     PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr);
     event.setCommand(std::unique_ptr<Command>(cmd));
 
     auto taskLevelBefore = csr.peekTaskLevel();
@@ -580,7 +580,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut
 
     std::vector<Surface *> v;
     PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr);
     event.setCommand(std::unique_ptr<Command>(cmd));
 
     event.submitCommand(false);
@@ -632,7 +632,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnCmdQueueWaitFunctionAndBlockedKernelWit
 
     std::vector<Surface *> v;
     PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr);
     event.setCommand(std::unique_ptr<Command>(cmd));
 
     event.submitCommand(false);
@@ -681,7 +681,7 @@ TEST_F(InternalsEventTest, givenGpuHangOnPrintingEnqueueOutputAndBlockedKernelWi
 
     std::vector<Surface *> v;
     PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(mockCmdQueue, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1, nullptr);
     event.setCommand(std::unique_ptr<Command>(cmd));
 
     event.submitCommand(false);
@@ -1170,7 +1170,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) {
       public:
         using CommandComputeKernel::eventsWaitlist;
         MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
     };
     class MockEvent : public Event {
       public:
@@ -1751,7 +1751,7 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl
     blockedCommandsData->setHeaps(dsh, ioh, ssh);
     PreemptionMode preemptionMode = pDevice->getPreemptionMode();
     std::vector<Surface *> v;
-    auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1, nullptr);
     event->setCommand(std::unique_ptr<Command>(cmd));
 
     FlushStamp expectedFlushStamp = 0;
@@ -1894,3 +1894,35 @@ TEST(EventTimestampTest, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabled
         EXPECT_TRUE(event.isWaitForTimestampsEnabled());
     }
 }
+TEST(MultiRootEvent, givenContextWithMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsNotCreated) { // The allocator installed on the context up front must still be the one it reports after the event asks for a sync node.
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext context{};
+    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
+    auto allocator = cmdQ.getGpgpuCommandStreamReceiver().createMultiRootDeviceTimestampPacketAllocator(context.getRootDeviceIndices());
+    auto allocatorPtr = allocator.get(); // remember the raw address before `allocator` is handed to the context
+    context.setMultiRootDeviceTimestampPacketAllocator(allocator); // presumably transfers ownership to the context - TODO confirm against Context
+    MockEvent<Event> event{&cmdQ, CL_COMMAND_MARKER, 0, 0};
+    event.getMultiRootTimestampSyncNode(); // returned node is intentionally unused; only the effect on the context is checked
+    EXPECT_EQ(allocatorPtr, context.getMultiRootDeviceTimestampPacketAllocator()); // same address => no replacement allocator was created
+}
+TEST(MultiRootEvent, givenContextWithoutMultiRootTagAllocatorWhenEventGetsTagThenNewAllocatorIsCreated) { // A context that starts without a multi-root allocator must report a non-null one after the event asks for a sync node.
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext context{};
+    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
+    EXPECT_EQ(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr); // precondition: nothing installed yet
+    MockEvent<Event> event{&cmdQ, CL_COMMAND_MARKER, 0, 0};
+    event.getMultiRootTimestampSyncNode(); // returned node is unused; the call is made for its effect on the context
+    EXPECT_NE(context.getMultiRootDeviceTimestampPacketAllocator(), nullptr); // an allocator now exists on the context
+}
+TEST(MultiRootEvent, givenEventWithTagWhenEventGetsNewTagThenNewTagContainerIsNotCreated) { // A second sync-node request must keep the event's existing multi-root node container.
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext context{};
+    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
+    MockEvent<Event> event{&cmdQ, CL_COMMAND_MARKER, 0, 0};
+    EXPECT_EQ(event.getMultiRootDeviceTimestampPacketNodes(), nullptr); // precondition: a fresh event has no container
+    event.getMultiRootTimestampSyncNode(); // first request
+    auto containerPtr = event.getMultiRootDeviceTimestampPacketNodes(); // capture the container address after the first request
+    EXPECT_NE(containerPtr, nullptr);
+    event.getMultiRootTimestampSyncNode(); // second request on the same event
+    EXPECT_EQ(containerPtr, event.getMultiRootDeviceTimestampPacketNodes()); // address unchanged => container was not recreated
+}
\ No newline at end of file
diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
index 98e8f404d4..df7c3da23a 100644
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -2435,7 +2435,7 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed
 
     gtpinNotifyKernelSubmit(kernel.mockMultiDeviceKernel, mockCmdQ.get());
 
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr));
     CompletionStamp stamp = command->submit(20, false);
 
     ASSERT_EQ(1u, kernelExecQueue.size());
diff --git a/opencl/test/unit_test/helpers/task_information_tests.cpp b/opencl/test/unit_test/helpers/task_information_tests.cpp
index b4489548af..302f090012 100644
--- a/opencl/test/unit_test/helpers/task_information_tests.cpp
+++ b/opencl/test/unit_test/helpers/task_information_tests.cpp
@@ -154,7 +154,7 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL
       public:
         using CommandComputeKernel::eventsWaitlist;
         MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0, nullptr) {}
     };
 
     auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
@@ -291,7 +291,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD
     for (auto &surface : surfaces) {
         requiresCoherency |= surface->IsCoherent;
     }
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr));
     command->submit(20, false);
 
     EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode);
@@ -339,7 +339,7 @@ HWTEST_F(DispatchFlagsTests, givenClCommandCopyImageWhenSubmitThenFlushTextureCa
     for (auto &surface : surfaces) {
         requiresCoherency |= surface->IsCoherent;
     }
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, commandType, nullptr, preemptionMode, kernel, 1, nullptr));
     command->submit(20, false);
 
     EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode);
@@ -425,7 +425,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectD
     bool flushDC = false;
     bool slmUsed = false;
     bool ndRangeKernel = false;
-    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1));
+    std::unique_ptr<Command> command(new CommandComputeKernel(*mockCmdQ, kernelOperation, surfaces, flushDC, slmUsed, ndRangeKernel, nullptr, preemptionMode, kernel, 1, nullptr));
     command->submit(20, false);
 
     EXPECT_TRUE(mockCsr->passedDispatchFlags.epilogueRequired);
diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp
index 1abdebb924..5d1fdd51b3 100644
--- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp
+++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp
@@ -34,11 +34,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl
     auto mockCmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), nullptr);
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize;
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;
 
     auto extendedSize = sizeWithDisabled + sizeof(typename FamilyType::PIPE_CONTROL);
@@ -52,7 +52,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
     getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false,
-                                                            false, multiDispatchInfo, nullptr, 0, false, false);
+                                                            false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize;
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
@@ -82,7 +82,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat
     eventsRequest.fillCsrDependenciesForTimestampPacketContainer(
         csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
 
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;
 
     size_t sizeForNodeDependency = 0;
@@ -143,7 +143,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr
     auto mockCmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), nullptr);
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize;
 
     device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
@@ -172,7 +172,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr
     CsrDependencies csrDeps;
     eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
 
-    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false);
+    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;
 
     size_t sizeForNodeDependency = 0;
diff --git a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp
index c983c1876b..230f9c65d9 100644
--- a/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_cache_flush_requirements_tests.cpp
@@ -213,7 +213,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd
     {
         EXPECT_FALSE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));
 
-        initialSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false);
+        initialSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr);
     }
 
     {
@@ -227,7 +227,7 @@ HWTEST2_F(KernelWithCacheFlushTests, givenCacheFlushRequiredWhenEstimatingThenAd
         ultCsr.multiOsContextCapable = false;
         EXPECT_TRUE(mockKernel->mockKernel->Kernel::requiresCacheFlushCommand(*cmdQ));
 
-        sizeWithCacheFlush = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false);
+        sizeWithCacheFlush = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, csrDeps, false, false, false, *cmdQ, multiDispatchInfo, false, false, nullptr);
     }
 
     EXPECT_EQ(initialSize + expectedDiff, sizeWithCacheFlush);
diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp
index 4bf2b03166..3cddb38da9 100644
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -505,6 +505,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
     using BaseClass::CommandStreamReceiver;
 
     TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; }
+    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return std::unique_ptr<TagAllocatorBase>(nullptr); } // test stub: ignores rootDeviceIndices and always returns an empty (null) unique_ptr
 
     SubmissionStatus flushTagUpdate() override { return SubmissionStatus::SUCCESS; };
     void updateTagFromWait() override{};
diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
index b01b706520..c4b1a4c128 100644
--- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
@@ -767,11 +767,11 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati
     auto &hwInfo = cmdQ->getDevice().getHardwareInfo();
 
     auto readBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, csrDependencies, false, false,
-                                                                                   true, *cmdQ, multiDispatchInfo, false, false);
+                                                                                   true, *cmdQ, multiDispatchInfo, false, false, nullptr);
     auto writeBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, csrDependencies, false, false,
-                                                                                    true, *cmdQ, multiDispatchInfo, false, false);
+                                                                                    true, *cmdQ, multiDispatchInfo, false, false, nullptr);
     auto copyBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false,
-                                                                                   true, *cmdQ, multiDispatchInfo, false, false);
+                                                                                   true, *cmdQ, multiDispatchInfo, false, false, nullptr);
     auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>();
 
     if (cmdQ->isCacheFlushForBcsRequired()) {
diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h
index 553b0f9397..6a501e2546 100644
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@@ -249,6 +249,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     using BaseClass::latestSentEnqueueType;
     using BaseClass::obtainCommandStream;
     using BaseClass::obtainNewTimestampPacketNodes;
+    using BaseClass::processDispatchForKernels;
     using BaseClass::requiresCacheFlushAfterWalker;
     using BaseClass::throttle;
     using BaseClass::timestampPacketContainer;
diff --git a/opencl/test/unit_test/mocks/mock_event.h b/opencl/test/unit_test/mocks/mock_event.h
index bde34c8904..6636dd9362 100644
--- a/opencl/test/unit_test/mocks/mock_event.h
+++ b/opencl/test/unit_test/mocks/mock_event.h
@@ -39,6 +39,7 @@ struct MockEvent : public BaseEventType {
     using Event::calculateSubmitTimestampData;
     using Event::isWaitForTimestampsEnabled;
     using Event::magic;
+    using Event::multiRootDeviceTimestampPacketContainer;
     using Event::queueTimeStamp;
     using Event::submitTimeStamp;
     using Event::timestampPacketContainer;
diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp
index 92beea833f..7dec14a546 100644
--- a/opencl/test/unit_test/profiling/profiling_tests.cpp
+++ b/opencl/test/unit_test/profiling/profiling_tests.cpp
@@ -71,13 +71,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor
 
     MultiDispatchInfo multiDispatchInfo(&kernel);
     auto &commandStreamNDRangeKernel = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(), true, false, false,
-                                                                                               multiDispatchInfo, nullptr, 0, false, false);
+                                                                                               multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, true, false, *pCmdQ, &kernel, {});
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamNDRangeKernel.getAvailableSpace(), requiredSize);
 
     auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, CsrDependencies(), true, false, false,
-                                                                            multiDispatchInfo, nullptr, 0, false, false);
+                                                                            multiDispatchInfo, nullptr, 0, false, false, nullptr);
     expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_TASK, true, false, *pCmdQ, &kernel, {});
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
@@ -93,13 +93,13 @@ HWTEST_F(ProfilingTests, GivenCommandQueueWithProfilingAndForWorkloadWithNoKerne
     MultiDispatchInfo multiDispatchInfo(nullptr);
     auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, CsrDependencies(),
                                                                                                         true, false, false,
-                                                                                                        multiDispatchInfo, nullptr, 0, false, false);
+                                                                                                        multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, false, *pCmdQ, nullptr, {});
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);
 
     auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, CsrDependencies(), true,
-                                                                                false, false, multiDispatchInfo, nullptr, 0, false, false);
+                                                                                false, false, multiDispatchInfo, nullptr, 0, false, false, nullptr);
     expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, false, *pCmdQ, nullptr, {});
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);
@@ -121,9 +121,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingAndFor
     multiDispatchInfo.push(dispatchInfo);
     multiDispatchInfo.push(dispatchInfo);
     auto &commandStreamTask = getCommandStream<FamilyType, CL_COMMAND_TASK>(*pCmdQ, CsrDependencies(), true, false, false,
-                                                                            multiDispatchInfo, nullptr, 0, false, false);
+                                                                            multiDispatchInfo, nullptr, 0, false, false, nullptr);
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_TASK, CsrDependencies(), true, false,
-                                                                               false, *pCmdQ, multiDispatchInfo, false, false);
+                                                                               false, *pCmdQ, multiDispatchInfo, false, false, nullptr);
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamTask.getAvailableSpace(), requiredSize);
 }
@@ -741,13 +741,13 @@ HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCount
     MultiDispatchInfo multiDispatchInfo(nullptr);
     auto &commandStreamMigrateMemObjects = getCommandStream<FamilyType, CL_COMMAND_MIGRATE_MEM_OBJECTS>(*pCmdQ, CsrDependencies(),
                                                                                                         true, true, false, multiDispatchInfo,
-                                                                                                        nullptr, 0, false, false);
+                                                                                                        nullptr, 0, false, false, nullptr);
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MIGRATE_MEM_OBJECTS, true, true, *pCmdQ, nullptr, {});
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamMigrateMemObjects.getAvailableSpace(), requiredSize);
 
     auto &commandStreamMarker = getCommandStream<FamilyType, CL_COMMAND_MARKER>(*pCmdQ, CsrDependencies(), true, true, false,
-                                                                                multiDispatchInfo, nullptr, 0, false, false);
+                                                                                multiDispatchInfo, nullptr, 0, false, false, nullptr);
     expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_MARKER, true, true, *pCmdQ, nullptr, {});
     EXPECT_GE(expectedSizeCS, requiredSize);
     EXPECT_GE(commandStreamMarker.getAvailableSpace(), requiredSize);
diff --git a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp
index 4d23062f48..ddef4dc98d 100644
--- a/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp
+++ b/opencl/test/unit_test/xe_hpc_core/pvc/command_stream_receiver_hw_tests_pvc.cpp
@@ -13,6 +13,7 @@
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/preamble.h"
 #include "shared/source/os_interface/device_factory.h"
+#include "shared/source/utilities/tag_allocator.h"
 #include "shared/source/xe_hpc_core/hw_cmds_pvc.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/variable_backup.h"
@@ -243,10 +244,14 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent
     MockGraphicsAllocation svmAlloc(svmPtr, svmSize);
 
     Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
+    auto node1 = event1.getMultiRootTimestampSyncNode();
     Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
     Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
+    auto node3 = event3.getMultiRootTimestampSyncNode();
     Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
+    auto node4 = event4.getMultiRootTimestampSyncNode();
     Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
+    auto node5 = event5.getMultiRootTimestampSyncNode();
     UserEvent userEvent1(&pCmdQ1->getContext());
     UserEvent userEvent2(&pCmdQ2->getContext());
 
@@ -285,12 +290,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent
         EXPECT_EQ(3u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node4->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node5->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
     }
 
     {
@@ -313,12 +318,12 @@ PVCTEST_F(PvcMultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEvent
         EXPECT_EQ(3u, semaphores.size());
 
         auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
-        EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd0->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node1->getContextEndAddress(0u)), semaphoreCmd0->getSemaphoreGraphicsAddress());
 
         auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
-        EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
-        EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(1u, semaphoreCmd1->getSemaphoreDataDword());
+        EXPECT_EQ(reinterpret_cast<uint64_t>(node3->getContextEndAddress(0u)), semaphoreCmd1->getSemaphoreGraphicsAddress());
     }
     alignedFree(svmPtr);
 }
diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h
index f60549b24d..1753fa6323 100644
--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@@ -220,6 +220,7 @@ class CommandStreamReceiver {
     TagAllocatorBase *getEventTsAllocator();
     TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize);
     virtual TagAllocatorBase *getTimestampPacketAllocator() = 0;
+    virtual std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) = 0; // pure virtual; result is returned as unique_ptr, so the caller owns it. NOTE(review): rootDeviceIndices is taken by const value (copied per call) - confirm a const reference was not intended
 
     virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation);
 
diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h
index 7aab4233e8..ff16a0835d 100644
--- a/shared/source/command_stream/command_stream_receiver_hw.h
+++ b/shared/source/command_stream/command_stream_receiver_hw.h
@@ -130,6 +130,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
     GraphicsAllocation *getClearColorAllocation() override;
 
     TagAllocatorBase *getTimestampPacketAllocator() override;
+    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override; // overrides the CommandStreamReceiver pure virtual; declaration only, the definition is not part of this header hunk
 
     void postInitFlagsSetup() override;
     void programActivePartitionConfig(LinearStream &csr);
diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl
index 6b0a19e926..a1fea16353 100644
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@@ -409,7 +409,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
     auto commandStreamStartCSR = commandStreamCSR.getUsed();
 
     TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
-    TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
 
     programActivePartitionConfigFlushTask(commandStreamCSR);
     programEngineModeCommands(commandStreamCSR, dispatchFlags);
@@ -982,7 +982,7 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
     }
 
     size += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(dispatchFlags.csrDependencies);
-    size += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(dispatchFlags.csrDependencies);
+    size += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(dispatchFlags.csrDependencies);
 
     size += EncodeKernelArgsBuffer<GfxFamily>::getKernelArgsBufferCmdsSize(kernelArgsBufferAllocation, logicalStateHelper.get());
 
@@ -1198,7 +1198,7 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert
 
     for (auto &blitProperties : blitPropertiesContainer) {
         TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
-        TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
+        TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
 
         BlitCommandsHelper<GfxFamily>::encodeWa(commandStream, blitProperties, latestSentBcsWaValue);
 
@@ -1231,6 +1231,12 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert
         if (blitProperties.clearColorAllocation) {
             makeResident(*blitProperties.clearColorAllocation);
         }
+        if (blitProperties.multiRootDeviceEventSync != nullptr) {
+            MiFlushArgs args;
+            args.commandWithPostSync = true;
+            args.notifyEnable = isUsedNotifyEnableForPostSync();
+            EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, blitProperties.multiRootDeviceEventSync->getGpuAddress() + blitProperties.multiRootDeviceEventSync->getContextEndOffset(), std::numeric_limits<uint64_t>::max(), args, hwInfo);
+        }
     }
 
     BlitCommandsHelper<GfxFamily>::programGlobalSequencerFlush(commandStream);
@@ -1247,7 +1253,6 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert
 
         MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), false, peekHwInfo());
     }
-
     if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) {
         BlitCommandsHelper<GfxFamily>::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(),
                                                                   DebugPauseState::waitingForUserEndConfirmation,
@@ -1524,6 +1529,11 @@ TagAllocatorBase *CommandStreamReceiverHw<GfxFamily>::getTimestampPacketAllocato
     return timestampPacketAllocator.get();
 }
 
+template <typename GfxFamily>
+std::unique_ptr<TagAllocatorBase> CommandStreamReceiverHw<GfxFamily>::createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) {
+    // Delegates to the core helper; the returned allocator is handed to the caller and not stored on this CSR.
+    return getGfxCoreHelper().createTimestampPacketAllocator(rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), getType(), osContext->getDeviceBitfield());
+}
 template <typename GfxFamily>
 void CommandStreamReceiverHw<GfxFamily>::postInitFlagsSetup() {
     useNewResourceImplicitFlush = checkPlatformSupportsNewResourceImplicitFlush();
diff --git a/shared/source/command_stream/csr_deps.cpp b/shared/source/command_stream/csr_deps.cpp
index ba0429cec6..0ae2ab90fb 100644
--- a/shared/source/command_stream/csr_deps.cpp
+++ b/shared/source/command_stream/csr_deps.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 Intel Corporation
+ * Copyright (C) 2020-2023 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,4 +22,10 @@ void CsrDependencies::copyNodesToNewContainer(TimestampPacketContainer &newTimes
         newTimestampPacketContainer.assignAndIncrementNodesRefCounts(*timestampPacketContainer);
     }
 }
+void CsrDependencies::copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer) {
+    for (auto *syncContainer : multiRootTimeStampSyncContainer) { // visit each registered multi-root sync container
+        newTimestampPacketContainer.assignAndIncrementNodesRefCounts(*syncContainer);
+    }
+}
+
 } // namespace NEO
diff --git a/shared/source/command_stream/csr_deps.h b/shared/source/command_stream/csr_deps.h
index eab7f8d115..a0cf5bcc68 100644
--- a/shared/source/command_stream/csr_deps.h
+++ b/shared/source/command_stream/csr_deps.h
@@ -22,10 +22,11 @@ class CsrDependencies {
         All
     };
 
-    StackVec<std::pair<TaskCountType, uint64_t>, 32> taskCountContainer;
+    StackVec<TimestampPacketContainer *, 32> multiRootTimeStampSyncContainer;
     StackVec<TimestampPacketContainer *, 32> timestampPacketContainer;
 
     void makeResident(CommandStreamReceiver &commandStreamReceiver) const;
     void copyNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
+    void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
 };
 } // namespace NEO
diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl
index 9fc8a7b68a..6e5177e738 100644
--- a/shared/source/helpers/blit_commands_helper_base.inl
+++ b/shared/source/helpers/blit_commands_helper_base.inl
@@ -127,7 +127,7 @@ size_t BlitCommandsHelper<GfxFamily>::estimateBlitCommandSize(const Vec3<size_t>
 
     sizePerBlit += estimatePostBlitCommandSize();
     return TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDependencies) +
-           TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(csrDependencies) +
+           TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(csrDependencies) +
            (sizePerBlit * nBlits) +
            timestampCmdSize +
            estimatePreBlitCommandSize();
@@ -143,6 +143,9 @@ size_t BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(const BlitPropert
         auto isImage = blitProperties.isImageOperation();
         size += BlitCommandsHelper<GfxFamily>::estimateBlitCommandSize(blitProperties.copySize, blitProperties.csrDependencies, updateTimestampPacket,
                                                                        profilingEnabled, isImage, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed);
+        if (blitProperties.multiRootDeviceEventSync != nullptr) {
+            size += EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite();
+        }
     }
     size += BlitCommandsHelper<GfxFamily>::getWaCmdsSize(blitPropertiesContainer);
     size += 2 * MemorySynchronizationCommands<GfxFamily>::getSizeForAdditonalSynchronization(*rootDeviceEnvironment.getHardwareInfo());
diff --git a/shared/source/helpers/blit_properties.cpp b/shared/source/helpers/blit_properties.cpp
index c5ca44600c..5120d8e6da 100644
--- a/shared/source/helpers/blit_properties.cpp
+++ b/shared/source/helpers/blit_properties.cpp
@@ -43,6 +43,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants:
         BlitterConstants::BlitDirection::HostPtrToImage == blitDirection) {
         return {
             nullptr,                       // outputTimestampPacket
+            nullptr,                       // multiRootDeviceEventSync
             blitDirection,                 // blitDirection
             {},                            // csrDependencies
             AuxTranslationDirection::None, // auxTranslationDirection
@@ -66,6 +67,7 @@ BlitProperties BlitProperties::constructPropertiesForReadWrite(BlitterConstants:
     } else {
         return {
             nullptr,                       // outputTimestampPacket
+            nullptr,                       // multiRootDeviceEventSync
             blitDirection,                 // blitDirection
             {},                            // csrDependencies
             AuxTranslationDirection::None, // auxTranslationDirection
@@ -97,6 +99,7 @@ BlitProperties BlitProperties::constructPropertiesForCopy(GraphicsAllocation *ds
 
     return {
         nullptr,                                         // outputTimestampPacket
+        nullptr,                                         // multiRootDeviceEventSync
         BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection
         {},                                              // csrDependencies
         AuxTranslationDirection::None,                   // auxTranslationDirection
@@ -121,6 +124,7 @@ BlitProperties BlitProperties::constructPropertiesForAuxTranslation(AuxTranslati
     auto allocationSize = allocation->getUnderlyingBufferSize();
     return {
         nullptr,                                         // outputTimestampPacket
+        nullptr,                                         // multiRootDeviceEventSync
         BlitterConstants::BlitDirection::BufferToBuffer, // blitDirection
         {},                                              // csrDependencies
         auxTranslationDirection,                         // auxTranslationDirection
diff --git a/shared/source/helpers/blit_properties.h b/shared/source/helpers/blit_properties.h
index 20215fbc5d..8a24756f66 100644
--- a/shared/source/helpers/blit_properties.h
+++ b/shared/source/helpers/blit_properties.h
@@ -49,6 +49,7 @@ struct BlitProperties {
                                                    CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr);
 
     TagNodeBase *outputTimestampPacket = nullptr;
+    TagNodeBase *multiRootDeviceEventSync = nullptr;
     BlitterConstants::BlitDirection blitDirection;
     CsrDependencies csrDependencies;
     AuxTranslationDirection auxTranslationDirection;
diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h
index 5a63df976e..ab33afa42d 100644
--- a/shared/source/helpers/timestamp_packet.h
+++ b/shared/source/helpers/timestamp_packet.h
@@ -125,17 +125,11 @@ struct TimestampPacketHelper {
     }
 
     template <typename GfxFamily>
-    static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
-        auto &taskCountContainer = csrDependencies.taskCountContainer;
-
-        for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) {
-            using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
-            using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
-
-            EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream,
-                                                                  static_cast<uint64_t>(tagAddressPreviousRootDevice),
-                                                                  static_cast<uint32_t>(taskCountPreviousRootDevice),
-                                                                  COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+    static void programCsrDependenciesForForMultiRootDeviceSyncContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
+        for (auto *syncContainer : csrDependencies.multiRootTimeStampSyncContainer) {
+            for (auto &syncNode : syncContainer->peekNodes()) { // each node is handed to programSemaphore separately
+                programSemaphore<GfxFamily>(cmdStream, *syncNode);
+            }
         }
     }
 
@@ -199,8 +193,15 @@ struct TimestampPacketHelper {
     }
 
     template <typename GfxFamily>
-    static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) {
-        return csrDependencies.taskCountContainer.size() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
+    static size_t getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer(const CsrDependencies &csrDependencies) {
+        // Mirrors programCsrDependenciesForForMultiRootDeviceSyncContainer, which visits every node of
+        // every container: sizing by container count alone under-reserves once a container holds
+        // more than one node. NOTE(review): assumes one MI_SEMAPHORE_WAIT per node - confirm.
+        size_t syncNodesCount = 0u;
+        for (auto *syncContainer : csrDependencies.multiRootTimeStampSyncContainer) {
+            syncNodesCount += syncContainer->peekNodes().size();
+        }
+        return syncNodesCount * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
     }
 };
 
diff --git a/shared/test/common/helpers/memory_management.h b/shared/test/common/helpers/memory_management.h
index cc23e87095..55651ad186 100644
--- a/shared/test/common/helpers/memory_management.h
+++ b/shared/test/common/helpers/memory_management.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2023 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h
index b83ede247f..4d7b93e4f2 100644
--- a/shared/test/common/mocks/mock_command_stream_receiver.h
+++ b/shared/test/common/mocks/mock_command_stream_receiver.h
@@ -19,6 +19,7 @@
 #include "shared/source/memory_manager/graphics_allocation.h"
 #include "shared/source/memory_manager/surface.h"
 #include "shared/source/os_interface/os_context.h"
+#include "shared/source/utilities/tag_allocator.h"
 #include "shared/test/common/helpers/dispatch_flags_helper.h"
 
 #include <optional>
@@ -99,6 +100,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
     };
 
     TagAllocatorBase *getTimestampPacketAllocator() override { return nullptr; }
+    std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override { return nullptr; } // mock builds no allocator
 
     CompletionStamp flushTask(
         LinearStream &commandStream,
diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
index db2db27535..46b8ae553c 100644
--- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
+++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
@@ -15,10 +15,13 @@
 #include "shared/source/helpers/api_specific_config.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/source/memory_manager/surface.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/device_factory.h"
 #include "shared/source/os_interface/hw_info_config.h"
 #include "shared/source/os_interface/os_interface.h"
 #include "shared/source/utilities/tag_allocator.h"
+#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
+#include "shared/test/common/cmd_parse/hw_parse.h"
 #include "shared/test/common/fixtures/command_stream_receiver_fixture.inl"
 #include "shared/test/common/fixtures/device_fixture.h"
 #include "shared/test/common/helpers/batch_buffer_helper.h"
@@ -33,6 +36,7 @@
 #include "shared/test/common/mocks/mock_execution_environment.h"
 #include "shared/test/common/mocks/mock_internal_allocation_storage.h"
 #include "shared/test/common/mocks/mock_memory_manager.h"
+#include "shared/test/common/mocks/mock_timestamp_container.h"
 #include "shared/test/common/mocks/ult_device_factory.h"
 #include "shared/test/common/test_macros/hw_test.h"
 #include "shared/test/common/test_macros/test_checks_shared.h"
@@ -2477,3 +2481,89 @@ HWTEST_F(CommandStreamReceiverHwTest, givenVariousCsrModeWhenGettingTbxModeThenE
     ultCsr.commandStreamReceiverType = CommandStreamReceiverType::CSR_TBX_WITH_AUB;
     EXPECT_TRUE(ultCsr.isTbxMode());
 }
+
+HWTEST_F(CommandStreamReceiverHwTest, GivenTwoRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForTwoDevicesCreated) {
+    // Local subclass whose only job is to make maxRootDeviceIndex readable for the check at the end.
+    struct TagAllocatorPeek : public TagAllocatorBase {
+        using TagAllocatorBase::maxRootDeviceIndex;
+    };
+    const RootDeviceIndicesContainer rootDeviceIndices = {0u, 1u};
+    auto mockEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), true, 2u);
+    auto rootDevices = DeviceFactory::createDevices(*mockEnvironment.release());
+    auto defaultCsr = rootDevices[0]->getDefaultEngine().commandStreamReceiver;
+    auto multiRootAllocator = defaultCsr->createMultiRootDeviceTimestampPacketAllocator(rootDeviceIndices);
+    EXPECT_EQ(reinterpret_cast<TagAllocatorPeek *>(multiRootAllocator.get())->maxRootDeviceIndex, 1u);
+}
+HWTEST_F(CommandStreamReceiverHwTest, GivenFourRootDevicesWhengetMultiRootDeviceTimestampPacketAllocatorCalledThenAllocatorForFourDevicesCreated) {
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), true, 4u);
+    auto devices = DeviceFactory::createDevices(*executionEnvironment.release());
+    const RootDeviceIndicesContainer indices = {0u, 1u, 2u, 3u};
+    auto csr = devices[0]->getDefaultEngine().commandStreamReceiver;
+    auto allocator = csr->createMultiRootDeviceTimestampPacketAllocator(indices);
+    class MockTagAllocatorBase : public TagAllocatorBase { // re-exports maxRootDeviceIndex so the test can read it
+      public:
+        using TagAllocatorBase::maxRootDeviceIndex;
+    };
+    EXPECT_EQ(reinterpret_cast<MockTagAllocatorBase *>(allocator.get())->maxRootDeviceIndex, 3u); // highest of the four indices passed in
+}
+HWTEST_F(CommandStreamReceiverHwTest, givenMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushAdded) {
+    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
+    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    auto tagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
+    auto syncNode = tagAllocator->getTag();
+    auto tagAllocation = csr.getTagAllocation();
+
+    // Blit built from the CSR's own tag allocation; the assertion below only looks at the sync node.
+    auto properties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr,
+                                                                      csr, tagAllocation, nullptr,
+                                                                      tagAllocation->getUnderlyingBuffer(),
+                                                                      tagAllocation->getGpuAddress(), 0,
+                                                                      0, 0, 0, 0, 0, 0, 0);
+    properties.multiRootDeviceEventSync = syncNode;
+
+    BlitPropertiesContainer blits;
+    blits.push_back(properties);
+    csr.flushBcsTask(blits, true, false, *pDevice);
+    HardwareParse parsed;
+    parsed.parseCommands<FamilyType>(csr.commandStream, 0);
+
+    // Walk the MI_FLUSH_DW commands until one targets the node's GPU address plus its context-end offset.
+    const auto expectedAddress = syncNode->getGpuAddress() + syncNode->getContextEndOffset();
+    bool nodeAddressFound = false;
+    auto flushIt = find<MI_FLUSH_DW *>(parsed.cmdList.begin(), parsed.cmdList.end());
+    while (!nodeAddressFound && flushIt != parsed.cmdList.end()) {
+        nodeAddressFound = genCmdCast<MI_FLUSH_DW *>(*flushIt)->getDestinationAddress() == expectedAddress;
+        flushIt = find<MI_FLUSH_DW *>(++flushIt, parsed.cmdList.end());
+    }
+    EXPECT_TRUE(nodeAddressFound);
+}
+HWTEST_F(CommandStreamReceiverHwTest, givenNullPtrAsMultiRootDeviceSyncNodeWhenFlushBcsTAskThenMiFlushNotAdded) {
+    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
+    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    auto tagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
+    auto unattachedNode = tagAllocator->getTag();
+    auto tagAllocation = csr.getTagAllocation();
+
+    // multiRootDeviceEventSync is never assigned here, so the blit carries no sync node.
+    auto properties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr,
+                                                                      csr, tagAllocation, nullptr,
+                                                                      tagAllocation->getUnderlyingBuffer(),
+                                                                      tagAllocation->getGpuAddress(), 0,
+                                                                      0, 0, 0, 0, 0, 0, 0);
+
+    BlitPropertiesContainer blits;
+    blits.push_back(properties);
+    csr.flushBcsTask(blits, true, false, *pDevice);
+    HardwareParse parsed;
+    parsed.parseCommands<FamilyType>(csr.commandStream, 0);
+
+    // No MI_FLUSH_DW in the stream may target the unattached node's context-end address.
+    const auto unexpectedAddress = unattachedNode->getGpuAddress() + unattachedNode->getContextEndOffset();
+    bool nodeAddressFound = false;
+    auto flushIt = find<MI_FLUSH_DW *>(parsed.cmdList.begin(), parsed.cmdList.end());
+    while (!nodeAddressFound && flushIt != parsed.cmdList.end()) {
+        nodeAddressFound = genCmdCast<MI_FLUSH_DW *>(*flushIt)->getDestinationAddress() == unexpectedAddress;
+        flushIt = find<MI_FLUSH_DW *>(++flushIt, parsed.cmdList.end());
+    }
+    EXPECT_FALSE(nodeAddressFound);
+}
\ No newline at end of file
diff --git a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp
index c69b591a65..8bde71a82a 100644
--- a/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp
+++ b/shared/test/unit_test/helpers/blit_commands_helper_tests.cpp
@@ -16,6 +16,7 @@
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/mocks/mock_graphics_allocation.h"
+#include "shared/test/common/mocks/mock_timestamp_container.h"
 #include "shared/test/common/mocks/ult_device_factory.h"
 #include "shared/test/common/test_macros/test_checks_shared.h"
 
@@ -664,3 +665,25 @@ HWTEST2_F(BlitTests, givenPlatformWhenCallingDispatchPreBlitCommandThenNoneMiFlu
     auto cmdIterator = find<typename FamilyType::MI_FLUSH_DW *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
     ASSERT_EQ(hwParser.cmdList.end(), cmdIterator);
 }
+
+HWTEST_F(BlitTests, givenMultiRootDeviceEventSyncNodeWhenEstimatingBlitCommandsSizeThenEstimateIsLarger) {
+    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
+    auto tag = mockTagAllocator->getTag();
+    BlitProperties blitProperties{};
+    blitProperties.copySize = {1, 1, 1};
+    BlitPropertiesContainer blitPropertiesContainer1;
+    blitPropertiesContainer1.push_back(blitProperties);
+    blitPropertiesContainer1.push_back(blitProperties);
+    blitPropertiesContainer1.push_back(blitProperties);
+    // Baseline: three blits, none of them carrying a multi-root sync node.
+    auto estimatedSizeWithoutNode = BlitCommandsHelper<FamilyType>::estimateBlitCommandsSize(
+        blitPropertiesContainer1, false, true, false, pDevice->getRootDeviceEnvironment());
+    blitProperties.multiRootDeviceEventSync = tag;
+    BlitPropertiesContainer blitPropertiesContainer2;
+    blitPropertiesContainer2.push_back(blitProperties);
+    blitPropertiesContainer2.push_back(blitProperties);
+    blitPropertiesContainer2.push_back(blitProperties);
+    auto estimatedSizeWithNode = BlitCommandsHelper<FamilyType>::estimateBlitCommandsSize(
+        blitPropertiesContainer2, false, true, false, pDevice->getRootDeviceEnvironment());
+    EXPECT_GT(estimatedSizeWithNode, estimatedSizeWithoutNode); // each sync node adds an MI_FLUSH_DW to the estimate
+}
\ No newline at end of file
diff --git a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp
index 8ea1f939fa..f3f4e14904 100644
--- a/shared/test/unit_test/helpers/timestamp_packet_tests.cpp
+++ b/shared/test/unit_test/helpers/timestamp_packet_tests.cpp
@@ -303,3 +303,35 @@ HWTEST_F(DeviceTimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacke
 
     EXPECT_FALSE(tag->canBeReleased());
 }
+
+using TimestampPacketHelperTests = Test<DeviceFixture>;
+
+HWTEST_F(TimestampPacketHelperTests, givenTagNodesInMultiRootSyncContainerWhenProgramingDependensiecThenSemaforesAreProgrammed) {
+    auto tagAllocator = std::make_unique<MockTagAllocator<>>(0, pDevice->getMemoryManager());
+    TimestampPacketContainer syncNodes{}; // declared after the allocator, so it is destroyed before it
+    syncNodes.add(tagAllocator->getTag());
+    CsrDependencies dependencies;
+    dependencies.multiRootTimeStampSyncContainer.push_back(&syncNodes);
+    StackVec<char, 4096> storage(4096);
+    LinearStream stream(storage.begin(), storage.size());
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<FamilyType>(stream, dependencies);
+    EXPECT_EQ(stream.getUsed(), sizeof(typename FamilyType::MI_SEMAPHORE_WAIT));
+}
+
+HWTEST_F(TimestampPacketHelperTests, givenEmptyContainerMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) {
+    TimestampPacketContainer emptySyncNodes{}; // registered below, but no node is ever added to it
+    CsrDependencies dependencies;
+    dependencies.multiRootTimeStampSyncContainer.push_back(&emptySyncNodes);
+    StackVec<char, 4096> storage(4096);
+    LinearStream stream(storage.begin(), storage.size());
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<FamilyType>(stream, dependencies);
+    EXPECT_EQ(stream.getUsed(), 0u);
+}
+
+HWTEST_F(TimestampPacketHelperTests, givenEmptyMultiRootSyncContainerWhenProgramingDependensiecThenZeroSemaforesAreProgrammed) {
+    CsrDependencies noDependencies; // nothing is pushed into multiRootTimeStampSyncContainer
+    StackVec<char, 4096> storage(4096);
+    LinearStream stream(storage.begin(), storage.size());
+    TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer<FamilyType>(stream, noDependencies);
+    EXPECT_EQ(stream.getUsed(), 0u);
+}
\ No newline at end of file