diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h
index 2be3ecfbf9..36c7013b3b 100644
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -9,6 +9,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/command_stream/preemption.h"
 #include "shared/source/device/device.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/engine_control.h"
 #include "shared/source/helpers/hw_info.h"
 #include "shared/source/os_interface/os_context.h"
@@ -401,7 +402,8 @@ class CommandQueueHw : public CommandQueue {
                         EventBuilder &externalEventBuilder,
                         std::unique_ptr<PrintfHandler> &&printfHandler,
                         CommandStreamReceiver *bcsCsr,
-                        TagNodeBase *multiRootDeviceSyncNode);
+                        TagNodeBase *multiRootDeviceSyncNode,
+                        CsrDependencyContainer *csrDependencies);
 
     CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces,
                                                 size_t surfaceCount,
@@ -449,6 +451,7 @@ class CommandQueueHw : public CommandQueue {
 
   protected:
     MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
+    MOCKABLE_VIRTUAL bool prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue);
     size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image);
 
     cl_int enqueueReadWriteBufferOnCpuWithMemoryTransfer(cl_command_type commandType, Buffer *buffer,
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index afe7559980..fce4fbdb7d 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -9,6 +9,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/command_stream/wait_status.h"
 #include "shared/source/direct_submission/relaxed_ordering_helper.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/flat_batch_buffer_helper.h"
 #include "shared/source/helpers/flush_stamp.h"
@@ -457,7 +458,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                        eventBuilder,
                        std::move(printfHandler),
                        nullptr,
-                       multiRootEventSyncStamp);
+                       multiRootEventSyncStamp,
+                       nullptr);
     }
 
     if (deferredTimestampPackets.get()) {
@@ -994,7 +996,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
     EventBuilder &externalEventBuilder,
     std::unique_ptr<PrintfHandler> &&printfHandler,
     CommandStreamReceiver *bcsCsr,
-    TagNodeBase *multiRootDeviceSyncNode) {
+    TagNodeBase *multiRootDeviceSyncNode,
+    CsrDependencyContainer *dependencyTags) {
 
     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
 
@@ -1033,9 +1036,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
 
         storeTimestampPackets = (timestampPacketContainer != nullptr);
     }
-
     if (enqueueProperties.operation != EnqueueProperties::Operation::gpuKernel) {
-        command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
+        command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData, dependencyTags);
     } else {
         // store task data in event
         std::vector<Surface *> allSurfaces;
@@ -1244,6 +1246,23 @@ size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *reg
     return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
 }
 
+template <typename GfxFamily>
+bool CommandQueueHw<GfxFamily>::prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue) {
+    // One fresh tag per dependent CSR: signalled right away when not blocked, otherwise recorded in dependencyTags.
+    for (const auto &csr : csrDeps.csrWithMultiEngineDependencies) {
+        auto tag = allocator->getTag();
+        timestampPacketDependencies.multiCsrDependencies.add(tag);
+        if (blockQueue) {
+            dependencyTags.emplace_back(csr, tag);
+            continue;
+        }
+        if (!csr->submitDependencyUpdate(tag)) {
+            return false; // stop at the first CSR that rejects the update
+        }
+    }
+    return true;
+}
+
 template <typename GfxFamily>
 bool CommandQueueHw<GfxFamily>::isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr) {
     auto bcsSplit = getDevice().isBcsSplitSupported() &&
@@ -1438,14 +1457,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
     if (isCacheFlushForBcsRequired() && gpgpuSubmission) {
         timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
     }
-    for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) {
-        auto tag = allocator->getTag();
-        timestampPacketDependencies.multiCsrDependencies.add(tag);
-        bool submitStatus = dependentCsr->submitDependencyUpdate(tag);
-        if (!submitStatus) {
-            return CL_OUT_OF_RESOURCES;
-        }
-    }
+
     obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
     csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
 
@@ -1472,6 +1484,13 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
         gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false, false);
         gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
     }
+    CsrDependencyContainer dependencyTags;
+    if (csrDeps.csrWithMultiEngineDependencies.size() > 0) {
+        bool submitStatus = prepareCsrDependency(csrDeps, dependencyTags, timestampPacketDependencies, allocator, blockQueue);
+        if (!submitStatus) {
+            return CL_OUT_OF_RESOURCES;
+        }
+    }
 
     blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
                                                                     eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp));
@@ -1501,7 +1520,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
     updateFromCompletionStamp(completionStamp, pEventBuilder->getEvent());
 
     if (blockQueue) {
-        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
+        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp, &dependencyTags);
 
         if (gpgpuSubmission) {
             if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp
index a58cc99dfe..141328b5c4 100644
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -130,7 +130,7 @@ CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::uniq
                                            bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
                                            PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount,
                                            TagNodeBase *multiRootDeviceSyncNode)
-    : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
+    : Command(commandQueue, kernelOperation, nullptr), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
       commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel),
       kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) {
     UNRECOVERABLE_IF(nullptr == this->kernel);
@@ -326,6 +326,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() {
     blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->cacheFlushNodes);
     blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->previousEnqueueNodes);
     blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes);
+    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->multiCsrDependencies);
     blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
@@ -348,7 +349,13 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term
         this->terminated = true;
         return completionStamp;
     }
-
+    for (auto &tagCsrPair : csrDependencies) {
+        bool submitStatus = tagCsrPair.first->submitDependencyUpdate(tagCsrPair.second);
+        if (!submitStatus) {
+            completionStamp.taskCount = CompletionStamp::gpuHang;
+            return completionStamp;
+        }
+    }
     auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
 
     if (!kernelOperation) {
@@ -532,6 +539,10 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR
 
 Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {}
 
-Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation)
-    : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) {}
+// Takes over kernelOperation; csrDependencies may be null, and when it is not its
+// (CSR, tag) pairs are copied into the command, so the caller's container is not retained.
+Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, CsrDependencyContainer *csrDependencies)
+    : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)),
+      csrDependencies(csrDependencies ? *csrDependencies : CsrDependencyContainer{}) {
+}
 } // namespace NEO
diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h
index 1f78cade9d..5b2ed58a06 100644
--- a/opencl/source/helpers/task_information.h
+++ b/opencl/source/helpers/task_information.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -7,6 +7,7 @@
 
 #pragma once
 #include "shared/source/command_stream/linear_stream.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/blit_properties.h"
 #include "shared/source/helpers/completion_stamp.h"
 #include "shared/source/helpers/map_operation_type.h"
@@ -87,7 +88,7 @@ class Command : public IFNode<Command> {
 
     Command() = delete;
     Command(CommandQueue &commandQueue);
-    Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation);
+    Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, CsrDependencyContainer *csrDependencies);
 
     ~Command() override;
     virtual LinearStream *getCommandStream() {
@@ -108,6 +109,7 @@ class Command : public IFNode<Command> {
     std::unique_ptr<TimestampPacketDependencies> timestampPacketDependencies;
     EventsRequest eventsRequest = {0, nullptr, nullptr};
     std::vector<cl_event> eventsWaitlist;
+    CsrDependencyContainer csrDependencies;
 };
 
 class CommandMapUnmap : public Command {
diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp
index f491fc9f1b..abf493b798 100644
--- a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp
@@ -1377,7 +1377,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEventWhenWaitingForCom
     EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
 
     clWaitForEvents(1, &outEvent1);
-    EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
+    EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
     EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
 
     clReleaseEvent(outEvent1);
diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
index b89ec4601e..1ea0d5ffaf 100644
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@@ -11,6 +11,7 @@
 #include "shared/source/gmm_helper/gmm.h"
 #include "shared/source/helpers/array_count.h"
 #include "shared/source/helpers/basic_math.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
@@ -2514,6 +2515,76 @@ TEST_F(CommandQueueWithTimestampPacketTests, givenQueueWhenSettingAndQueryingLas
     }
 }
 
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenPrepareDependencyUpdateCalledThenNewTagAddedToTimestampDependencies) {
+    // A single dependent CSR must yield exactly one tag in multiCsrDependencies.
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, false)); // false: queue is not blocked
+    EXPECT_EQ(dependencies.multiCsrDependencies.peekNodes().size(), 1u);
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenNoDependencyBetweenCsrWhenPrepareDependencyUpdateCalledThenTagIsNotAddedToTimestampDependencies) {
+    // With no dependent CSR registered the call succeeds and adds no tag.
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, false)); // false: queue is not blocked
+    EXPECT_EQ(dependencies.multiCsrDependencies.peekNodes().size(), 0u);
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenPrepareDependencyUpdateCalledForNonBlockedQueueThenSubmitDependencyUpdateCalled) {
+    // Non-blocked path: the dependent CSR is signalled immediately and nothing is deferred.
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, false)); // false: queue is not blocked
+    EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 1u);
+    EXPECT_EQ(dependencyMap.size(), 0u);
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenPrepareDependencyUpdateCalledForBlockedQueueThenDependencyMapHasOneItem) {
+    // Blocked path: no update is submitted yet; the (CSR, tag) pair is recorded instead.
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, true)); // true: queue is blocked
+    EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 0u);
+    EXPECT_EQ(dependencyMap.size(), 1u);
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenSubmitDependencyUpdateReturnsFalseThenPrepareCsrDependencyReturnsFalse) {
+    // A failed submitDependencyUpdate on the non-blocked path must surface as a false result.
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    dependentCsr->submitDependencyUpdateReturnValue = false;
+    EXPECT_FALSE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, false)); // false: queue is not blocked
+}
+
 using KernelExecutionTypesTests = DispatchFlagsTests;
 HWTEST_F(KernelExecutionTypesTests, givenConcurrentKernelWhileDoingNonBlockedEnqueueThenCorrectKernelTypeIsSetInCSR) {
     using CsrType = MockCsrHw2<FamilyType>;
diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
index 52aab100e4..c7ad7cc8cd 100644
--- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2023 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -167,7 +167,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg
     Surface *surfaces[] = {nullptr};
     mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                              blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr, nullptr);
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr, nullptr, nullptr);
     EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue);
 }
 
@@ -200,7 +200,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl
     Surface *surfaces[] = {nullptr};
     mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                              blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr);
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr, nullptr);
     EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue);
     EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation);
     EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation);
diff --git a/opencl/test/unit_test/helpers/task_information_tests.cpp b/opencl/test/unit_test/helpers/task_information_tests.cpp
index 63a40d7dfb..f595a49a09 100644
--- a/opencl/test/unit_test/helpers/task_information_tests.cpp
+++ b/opencl/test/unit_test/helpers/task_information_tests.cpp
@@ -1,10 +1,11 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/test/common/mocks/mock_csr.h"
@@ -372,7 +373,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandWithoutKernelWhenSubmitThenPassCorrectD
     auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()}));
     auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
     kernelOperation->setHeaps(ih1, ih2, ih3);
-    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation));
+    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, nullptr));
     command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies));
 
     command->submit(20, false);
@@ -393,6 +394,64 @@ HWTEST_F(DispatchFlagsTests, givenCommandWithoutKernelWhenSubmitThenPassCorrectD
     EXPECT_FALSE(mockCsr->passedDispatchFlags.epilogueRequired);
 }
 
+HWTEST_F(DispatchFlagsTests, givenCsrDependencyWhenSubmitCommandWithoutKernelThenDependencyUpdateWasCalled) {
+    using CsrType = MockCsr1<FamilyType>;
+    setUpImpl<CsrType>();
+    // Queue under test, its gpgpu CSR, and a separate CSR that should receive the dependency update.
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    auto mockCsr = static_cast<CsrType *>(&mockCmdQ->getGpgpuCommandStreamReceiver());
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield());
+
+    mockCsr->timestampPacketWriteEnabled = true;
+    mockCmdQ->timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    TimestampPacketDependencies timestampPacketDependencies;
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::dynamicState, 1, dsh);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::indirectObject, 1, ioh);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::surfaceState, 1, ssh);
+
+    auto stream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()}));
+    auto kernelOperation = std::make_unique<KernelOperation>(stream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    kernelOperation->setHeaps(dsh, ioh, ssh);
+    CsrDependencyContainer csrTagPairs;
+    auto tag = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag();
+    csrTagPairs.emplace_back(dependentCsr.get(), tag);
+    std::unique_ptr<Command> command = std::make_unique<CommandWithoutKernel>(*mockCmdQ, kernelOperation, &csrTagPairs);
+    command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies));
+    // Submitting the command must forward the stored pair to the dependent CSR exactly once.
+    command->submit(20, false);
+    EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 1u);
+}
+
+HWTEST_F(DispatchFlagsTests, givenCsrDependencyWhenDependencyUpdateReturnsFalseThenSubmitReturnGpuHang) {
+    using CsrType = MockCsr1<FamilyType>;
+    setUpImpl<CsrType>();
+    // The dependent CSR is configured to reject the update; submit must then report a GPU hang.
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    auto mockCsr = static_cast<CsrType *>(&mockCmdQ->getGpgpuCommandStreamReceiver());
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield());
+
+    mockCsr->timestampPacketWriteEnabled = true;
+    mockCmdQ->timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    IndirectHeap *ih1 = nullptr, *ih2 = nullptr, *ih3 = nullptr;
+    TimestampPacketDependencies timestampPacketDependencies;
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::dynamicState, 1, ih1);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::indirectObject, 1, ih2);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::surfaceState, 1, ih3);
+    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()}));
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    kernelOperation->setHeaps(ih1, ih2, ih3);
+    CsrDependencyContainer dependencyMap;
+    auto tag = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag();
+    dependencyMap.push_back(std::make_pair(dependentCsr.get(), tag));
+    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, &dependencyMap));
+    command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies));
+    dependentCsr->submitDependencyUpdateReturnValue = false;
+    auto stamp = command->submit(20, false);
+    EXPECT_EQ(stamp.taskCount, CompletionStamp::gpuHang);
+    EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 1u); // the hang must come from the rejected update
+}
+
 HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectDispatchHints) {
     using CsrType = MockCsr1<FamilyType>;
     setUpImpl<CsrType>();
diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h
index baa71962ad..a141fd8dcb 100644
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@@ -281,6 +281,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     using BaseClass::obtainCommandStream;
     using BaseClass::obtainNewTimestampPacketNodes;
     using BaseClass::overrideEngine;
+    using BaseClass::prepareCsrDependency;
     using BaseClass::processDispatchForKernels;
     using BaseClass::relaxedOrderingForGpgpuAllowed;
     using BaseClass::requiresCacheFlushAfterWalker;
diff --git a/shared/source/helpers/CMakeLists.txt b/shared/source/helpers/CMakeLists.txt
index 41d307fdfb..e8e2cd5959 100644
--- a/shared/source/helpers/CMakeLists.txt
+++ b/shared/source/helpers/CMakeLists.txt
@@ -20,6 +20,7 @@ set(NEO_CORE_HELPERS
     ${CMAKE_CURRENT_SOURCE_DIR}/array_count.h
     ${CMAKE_CURRENT_SOURCE_DIR}/aux_translation.h
     ${CMAKE_CURRENT_SOURCE_DIR}/basic_math.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/bcs_ccs_dependency_pair_container.h
     ${CMAKE_CURRENT_SOURCE_DIR}/bindless_heaps_helper.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/bindless_heaps_helper.h
     ${CMAKE_CURRENT_SOURCE_DIR}/bit_helpers.h
diff --git a/shared/source/helpers/bcs_ccs_dependency_pair_container.h b/shared/source/helpers/bcs_ccs_dependency_pair_container.h
new file mode 100644
index 0000000000..379598dca8
--- /dev/null
+++ b/shared/source/helpers/bcs_ccs_dependency_pair_container.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+#include <utility>
+#include <vector>
+
+namespace NEO {
+class CommandStreamReceiver;
+class TagNodeBase;
+
+// Ordered list of (command stream receiver, tag node) pairs. Each entry names a
+// receiver whose submitDependencyUpdate() is to be called with the paired tag.
+using CsrDependencyContainer = std::vector<std::pair<CommandStreamReceiver *, TagNodeBase *>>;
+} // namespace NEO
diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h
index 8e65725a81..3851f1c53c 100644
--- a/shared/test/common/mocks/mock_command_stream_receiver.h
+++ b/shared/test/common/mocks/mock_command_stream_receiver.h
@@ -78,7 +78,10 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
 
     SubmissionStatus flushTagUpdate() override { return SubmissionStatus::success; };
     void updateTagFromWait() override{};
-    bool submitDependencyUpdate(TagNodeBase *tag) override { return true; };
+    bool submitDependencyUpdate(TagNodeBase *tag) override {
+        ++submitDependencyUpdateCalledTimes;      // call counter that tests can inspect
+        return submitDependencyUpdateReturnValue; // result chosen by the test
+    }
     bool isUpdateTagFromWaitEnabled() override { return false; };
 
     void writeMemoryAub(aub_stream::AllocationParams &allocationParams) override {
@@ -243,6 +246,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
     uint32_t writeMemoryAubCalled = 0;
     uint32_t makeResidentCalledTimes = 0;
     uint32_t downloadAllocationsCalledCount = 0;
+    uint32_t submitDependencyUpdateCalledTimes = 0;
     int hostPtrSurfaceCreationMutexLockCount = 0;
     bool multiOsContextCapable = false;
     bool memoryCompressionEnabled = false;
@@ -259,6 +263,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
     BatchBuffer latestFlushedBatchBuffer = {};
     QueueThrottle getLastDirectSubmissionThrottleReturnValue = QueueThrottle::MEDIUM;
     bool getAcLineConnectedReturnValue = true;
+    bool submitDependencyUpdateReturnValue = true;
 };
 
 class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {