fix: cache flush dependency for blocked queue

Related-to: NEO-9872, HSD-18038461954
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2024-05-20 10:39:06 +00:00 · 2024-05-20 10:39:06 +00:00 · 90df4b298b
parent e01d34741d
commit 90df4b298b
12 changed files with 215 additions and 28 deletions
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -9,6 +9,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/command_stream/preemption.h"
 #include "shared/source/device/device.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/engine_control.h"
 #include "shared/source/helpers/hw_info.h"
 #include "shared/source/os_interface/os_context.h"
@ -401,7 +402,8 @@ class CommandQueueHw : public CommandQueue {
                        EventBuilder &externalEventBuilder,
                        std::unique_ptr<PrintfHandler> &&printfHandler,
                        CommandStreamReceiver *bcsCsr,
-                        TagNodeBase *multiRootDeviceSyncNode);
+                        TagNodeBase *multiRootDeviceSyncNode,
+                        CsrDependencyContainer *csrDependencies);

    CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces,
                                                size_t surfaceCount,
@ -449,6 +451,7 @@ class CommandQueueHw : public CommandQueue {

  protected:
    MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
+    MOCKABLE_VIRTUAL bool prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue);
    size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image);

    cl_int enqueueReadWriteBufferOnCpuWithMemoryTransfer(cl_command_type commandType, Buffer *buffer,
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@ -9,6 +9,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/command_stream/wait_status.h"
 #include "shared/source/direct_submission/relaxed_ordering_helper.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/flat_batch_buffer_helper.h"
 #include "shared/source/helpers/flush_stamp.h"
@ -457,7 +458,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                       eventBuilder,
                       std::move(printfHandler),
                       nullptr,
-                       multiRootEventSyncStamp);
+                       multiRootEventSyncStamp,
+                       nullptr);
    }

    if (deferredTimestampPackets.get()) {
@ -994,7 +996,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
    EventBuilder &externalEventBuilder,
    std::unique_ptr<PrintfHandler> &&printfHandler,
    CommandStreamReceiver *bcsCsr,
-    TagNodeBase *multiRootDeviceSyncNode) {
+    TagNodeBase *multiRootDeviceSyncNode,
+    CsrDependencyContainer *dependencyTags) {

    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

@ -1033,9 +1036,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(

        storeTimestampPackets = (timestampPacketContainer != nullptr);
    }
-
    if (enqueueProperties.operation != EnqueueProperties::Operation::gpuKernel) {
-        command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
+        command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData, dependencyTags);
    } else {
        // store task data in event
        std::vector<Surface *> allSurfaces;
@ -1244,6 +1246,23 @@ size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *reg
    return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
 }

+template <typename GfxFamily>
+bool CommandQueueHw<GfxFamily>::prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue) {
+    // One new tag per dependent CSR: submitted right away when not blocked, otherwise stored in dependencyTags. Returns false on a failed submit.
+    for (auto &csr : csrDeps.csrWithMultiEngineDependencies) {
+        auto dependencyTag = allocator->getTag();
+        timestampPacketDependencies.multiCsrDependencies.add(dependencyTag);
+        if (blockQueue) {
+            dependencyTags.emplace_back(csr, dependencyTag);
+            continue;
+        }
+        if (!csr->submitDependencyUpdate(dependencyTag)) {
+            return false;
+        }
+    }
+    return true;
+}
+
 template <typename GfxFamily>
 bool CommandQueueHw<GfxFamily>::isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr) {
    auto bcsSplit = getDevice().isBcsSplitSupported() &&
@ -1438,14 +1457,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
    if (isCacheFlushForBcsRequired() && gpgpuSubmission) {
        timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
    }
-    for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) {
-        auto tag = allocator->getTag();
-        timestampPacketDependencies.multiCsrDependencies.add(tag);
-        bool submitStatus = dependentCsr->submitDependencyUpdate(tag);
-        if (!submitStatus) {
-            return CL_OUT_OF_RESOURCES;
-        }
-    }
+
    obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
    csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);

@ -1472,6 +1484,13 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
        gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false, false);
        gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
    }
+    CsrDependencyContainer dependencyTags;
+    if (csrDeps.csrWithMultiEngineDependencies.size() > 0) {
+        bool submitStatus = prepareCsrDependency(csrDeps, dependencyTags, timestampPacketDependencies, allocator, blockQueue);
+        if (!submitStatus) {
+            return CL_OUT_OF_RESOURCES;
+        }
+    }

    blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
                                                                    eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp));
@ -1501,7 +1520,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
    updateFromCompletionStamp(completionStamp, pEventBuilder->getEvent());

    if (blockQueue) {
-        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
+        enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp, &dependencyTags);

        if (gpgpuSubmission) {
            if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@ -130,7 +130,7 @@ CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::uniq
                                           bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount,
                                           TagNodeBase *multiRootDeviceSyncNode)
-    : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
+    : Command(commandQueue, kernelOperation, nullptr), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
      commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel),
      kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) {
    UNRECOVERABLE_IF(nullptr == this->kernel);
@ -326,6 +326,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() {
    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->cacheFlushNodes);
    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->previousEnqueueNodes);
    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes);
+    blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->multiCsrDependencies);
    blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];

    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
@ -348,7 +349,13 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term
        this->terminated = true;
        return completionStamp;
    }
-
+    for (auto &tagCsrPair : csrDependencies) {
+        bool submitStatus = tagCsrPair.first->submitDependencyUpdate(tagCsrPair.second);
+        if (!submitStatus) {
+            completionStamp.taskCount = CompletionStamp::gpuHang;
+            return completionStamp;
+        }
+    }
    auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();

    if (!kernelOperation) {
@ -532,6 +539,10 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR

 Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {}

-Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation)
-    : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) {}
+Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, CsrDependencyContainer *csrDependencies)
+    : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) {
+    if (csrDependencies != nullptr) { // null: the member container stays empty
+        this->csrDependencies = *csrDependencies; // copied, so the caller's container need not outlive this command
+    }
+}
 } // namespace NEO
--- a/opencl/source/helpers/task_information.h
+++ b/opencl/source/helpers/task_information.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -7,6 +7,7 @@

 #pragma once
 #include "shared/source/command_stream/linear_stream.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/blit_properties.h"
 #include "shared/source/helpers/completion_stamp.h"
 #include "shared/source/helpers/map_operation_type.h"
@ -87,7 +88,7 @@ class Command : public IFNode<Command> {

    Command() = delete;
    Command(CommandQueue &commandQueue);
-    Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation);
+    Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, CsrDependencyContainer *csrDependencies);

    ~Command() override;
    virtual LinearStream *getCommandStream() {
@ -108,6 +109,7 @@ class Command : public IFNode<Command> {
    std::unique_ptr<TimestampPacketDependencies> timestampPacketDependencies;
    EventsRequest eventsRequest = {0, nullptr, nullptr};
    std::vector<cl_event> eventsWaitlist;
+    CsrDependencyContainer csrDependencies;
 };

 class CommandMapUnmap : public Command {
--- a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp
@ -1377,7 +1377,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEventWhenWaitingForCom
    EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());

    clWaitForEvents(1, &outEvent1);
-    EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
+    EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
    EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());

    clReleaseEvent(outEvent1);
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@ -11,6 +11,7 @@
 #include "shared/source/gmm_helper/gmm.h"
 #include "shared/source/helpers/array_count.h"
 #include "shared/source/helpers/basic_math.h"
+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
@ -2514,6 +2515,76 @@ TEST_F(CommandQueueWithTimestampPacketTests, givenQueueWhenSettingAndQueryingLas
    }
 }

+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenPrepareDependencyUpdateCalledThenNewTagAddedToTimestampDependencies) {
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get()); // a single dependent CSR is registered
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    bool blockQueue = false;
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue)); // the mock's submit succeeds by default
+    EXPECT_EQ(1u, dependencies.multiCsrDependencies.peekNodes().size()); // one tag for the one dependent CSR
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenNoDependencyBetweenCsrWhenPrepareDependencyUpdateCalledThenTagIsNotAddedToTimestampDependencies) {
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps; // intentionally left without any dependent CSR
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    bool blockQueue = false;
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue)); // nothing to submit, so nothing can fail
+    EXPECT_EQ(0u, dependencies.multiCsrDependencies.peekNodes().size());
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenPrepareDependencyUpdateCalledForNonBlockedQueueThenSubmitDependencyUpdateCalled) {
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    bool blockQueue = false; // non-blocked: the update is submitted immediately instead of being stored
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue));
+    EXPECT_EQ(1u, dependentCsr->submitDependencyUpdateCalledTimes);
+    EXPECT_EQ(0u, dependencyMap.size());
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenPrepareDependencyUpdateCalledForBlockedQueueThenDependencyMapHasOneItem) {
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    bool blockQueue = true; // blocked: the CSR/tag pair is stored and no submit happens here
+    EXPECT_TRUE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue));
+    EXPECT_EQ(0u, dependentCsr->submitDependencyUpdateCalledTimes);
+    EXPECT_EQ(1u, dependencyMap.size());
+}
+
+HWTEST_F(CommandQueueWithTimestampPacketTests, givenDependencyBetweenCsrWhenSubmitDependencyUpdateReturnsFalseThenPrepareCsrDependencyReturnsFalse) {
+    MockContext context{};
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1);
+    TimestampPacketDependencies dependencies{};
+    CsrDependencies csrDeps;
+    csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get());
+    CsrDependencyContainer dependencyMap;
+    TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+    bool blockQueue = false; // the submit path is only taken for a non-blocked queue
+    dependentCsr->submitDependencyUpdateReturnValue = false; // force the dependent CSR's submit to fail
+    EXPECT_FALSE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue));
+}
+
 using KernelExecutionTypesTests = DispatchFlagsTests;
 HWTEST_F(KernelExecutionTypesTests, givenConcurrentKernelWhileDoingNonBlockedEnqueueThenCorrectKernelTypeIsSetInCSR) {
    using CsrType = MockCsrHw2<FamilyType>;
--- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2023 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -167,7 +167,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg
    Surface *surfaces[] = {nullptr};
    mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                             blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr, nullptr);
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), nullptr, nullptr, nullptr);
    EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue);
 }

@ -200,7 +200,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl
    Surface *surfaces[] = {nullptr};
    mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies,
                             blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest,
-                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr);
+                             eventBuilder, std::unique_ptr<PrintfHandler>(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr, nullptr);
    EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue);
    EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation);
    EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation);
--- a/opencl/test/unit_test/helpers/task_information_tests.cpp
+++ b/opencl/test/unit_test/helpers/task_information_tests.cpp
@ -1,10 +1,11 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

+#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h"
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/test/common/mocks/mock_csr.h"
@ -372,7 +373,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandWithoutKernelWhenSubmitThenPassCorrectD
    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()}));
    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    kernelOperation->setHeaps(ih1, ih2, ih3);
-    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation));
+    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, nullptr));
    command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies));

    command->submit(20, false);
@ -393,6 +394,64 @@ HWTEST_F(DispatchFlagsTests, givenCommandWithoutKernelWhenSubmitThenPassCorrectD
    EXPECT_FALSE(mockCsr->passedDispatchFlags.epilogueRequired);
 }

+HWTEST_F(DispatchFlagsTests, givenCsrDependencyWhenSubmitCommandWithoutKernelThenDependencyUpdateWasCalled) {
+    using CsrType = MockCsr1<FamilyType>;
+    setUpImpl<CsrType>();
+
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    auto mockCsr = static_cast<CsrType *>(&mockCmdQ->getGpgpuCommandStreamReceiver());
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield());
+
+    mockCsr->timestampPacketWriteEnabled = true;
+    mockCmdQ->timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    IndirectHeap *ih1 = nullptr, *ih2 = nullptr, *ih3 = nullptr;
+    TimestampPacketDependencies timestampPacketDependencies;
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::dynamicState, 1, ih1);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::indirectObject, 1, ih2);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::surfaceState, 1, ih3);
+
+    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()}));
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    kernelOperation->setHeaps(ih1, ih2, ih3);
+    CsrDependencyContainer dependencyMap;
+    auto tag = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag();
+    dependencyMap.push_back(std::make_pair(dependentCsr.get(), tag)); // the single stored CSR/tag pair the command must submit
+    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, &dependencyMap));
+    command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies));
+
+    command->submit(20, false);
+    EXPECT_EQ(1u, dependentCsr->submitDependencyUpdateCalledTimes); // submit() forwarded the stored pair exactly once
+}
+
+HWTEST_F(DispatchFlagsTests, givenCsrDependencyWhenDependencyUpdateReturnsFalseThenSubmitReturnGpuHang) {
+    using CsrType = MockCsr1<FamilyType>;
+    setUpImpl<CsrType>();
+
+    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    auto mockCsr = static_cast<CsrType *>(&mockCmdQ->getGpgpuCommandStreamReceiver());
+    auto dependentCsr = std::make_unique<MockCommandStreamReceiver>(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield());
+
+    mockCsr->timestampPacketWriteEnabled = true;
+    mockCmdQ->timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    IndirectHeap *ih1 = nullptr, *ih2 = nullptr, *ih3 = nullptr;
+    TimestampPacketDependencies timestampPacketDependencies;
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::dynamicState, 1, ih1);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::indirectObject, 1, ih2);
+    mockCmdQ->allocateHeapMemory(IndirectHeap::Type::surfaceState, 1, ih3);
+
+    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()}));
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    kernelOperation->setHeaps(ih1, ih2, ih3);
+    CsrDependencyContainer dependencyMap;
+    auto tag = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag();
+    dependencyMap.push_back(std::make_pair(dependentCsr.get(), tag));
+    std::unique_ptr<Command> command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, &dependencyMap));
+    command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies));
+    dependentCsr->submitDependencyUpdateReturnValue = false; // make the dependent CSR's submit fail
+    auto stamp = command->submit(20, false);
+    EXPECT_EQ(CompletionStamp::gpuHang, stamp.taskCount); // a failed dependency update is reported as a GPU hang
+}
+
 HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectDispatchHints) {
    using CsrType = MockCsr1<FamilyType>;
    setUpImpl<CsrType>();
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@ -281,6 +281,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
    using BaseClass::obtainCommandStream;
    using BaseClass::obtainNewTimestampPacketNodes;
    using BaseClass::overrideEngine;
+    using BaseClass::prepareCsrDependency;
    using BaseClass::processDispatchForKernels;
    using BaseClass::relaxedOrderingForGpgpuAllowed;
    using BaseClass::requiresCacheFlushAfterWalker;
--- a/shared/source/helpers/CMakeLists.txt
+++ b/shared/source/helpers/CMakeLists.txt
@ -20,6 +20,7 @@ set(NEO_CORE_HELPERS
    ${CMAKE_CURRENT_SOURCE_DIR}/array_count.h
    ${CMAKE_CURRENT_SOURCE_DIR}/aux_translation.h
    ${CMAKE_CURRENT_SOURCE_DIR}/basic_math.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/bcs_ccs_dependency_pair_container.h
    ${CMAKE_CURRENT_SOURCE_DIR}/bindless_heaps_helper.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/bindless_heaps_helper.h
    ${CMAKE_CURRENT_SOURCE_DIR}/bit_helpers.h
--- a/shared/source/helpers/bcs_ccs_dependency_pair_container.h
+++ b/shared/source/helpers/bcs_ccs_dependency_pair_container.h
@ -0,0 +1,17 @
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+#include <utility>
+#include <vector>
+
+namespace NEO {
+class CommandStreamReceiver;
+class TagNodeBase;
+// List of (command stream receiver, tag node) pairs; each tag is later passed to that receiver's submitDependencyUpdate().
+using CsrDependencyContainer = std::vector<std::pair<CommandStreamReceiver *, TagNodeBase *>>;
+} // namespace NEO
--- a/shared/test/common/mocks/mock_command_stream_receiver.h
+++ b/shared/test/common/mocks/mock_command_stream_receiver.h
@ -78,7 +78,10 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {

    SubmissionStatus flushTagUpdate() override { return SubmissionStatus::success; };
    void updateTagFromWait() override{};
-    bool submitDependencyUpdate(TagNodeBase *tag) override { return true; };
+    bool submitDependencyUpdate(TagNodeBase *tag) override {
+        ++submitDependencyUpdateCalledTimes; // call counter that tests can inspect
+        return submitDependencyUpdateReturnValue; // test-controlled result, true unless a test overrides it
+    }
    bool isUpdateTagFromWaitEnabled() override { return false; };

    void writeMemoryAub(aub_stream::AllocationParams &allocationParams) override {
@ -243,6 +246,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
    uint32_t writeMemoryAubCalled = 0;
    uint32_t makeResidentCalledTimes = 0;
    uint32_t downloadAllocationsCalledCount = 0;
+    uint32_t submitDependencyUpdateCalledTimes = 0;
    int hostPtrSurfaceCreationMutexLockCount = 0;
    bool multiOsContextCapable = false;
    bool memoryCompressionEnabled = false;
@ -259,6 +263,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
    BatchBuffer latestFlushedBatchBuffer = {};
    QueueThrottle getLastDirectSubmissionThrottleReturnValue = QueueThrottle::MEDIUM;
    bool getAcLineConnectedReturnValue = true;
+    bool submitDependencyUpdateReturnValue = true;
 };

 class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {