Allow blocked command stream programming for commands without Kernel

Change-Id: I691a029bd5511c8f710ef1bff8cc5a9feca644f3
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
Related-To: NEO-3433
2019-07-22 20:55:09 +02:00 · 2019-07-22 20:55:09 +02:00 · 38556cec29
parent 55a1ddab39
commit 38556cec29
17 changed files with 248 additions and 96 deletions
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 #!groovy
 dependenciesRevision='06357fd1499ba888288e517541564865ad9c136a-1292'
 strategy='EQUAL'
-allowedCD=259
+allowedCD=260
 allowedF=5
--- a/runtime/command_queue/command_queue.cpp
+++ b/runtime/command_queue/command_queue.cpp
@ -588,4 +588,27 @@ bool CommandQueue::blitEnqueueAllowed(bool queueBlocked, cl_command_type cmdType

    return commandAllowed && !queueBlocked && blitAllowed;
 }
+// Tells whether an enqueue of commandType needs a blocked command stream; obtainCommandStream creates a new LinearStream when this is true.
+bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const {
+    if (!blockedQueue) { // queue is not blocked: never request a blocked command stream
+        return false;
+    }
+    // Cache-flush commands, and every command that is not a "command without kernel", always need the stream.
+    if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) {
+        return true;
+    }
+    // Barrier/marker need it only when timestamp packet writes are enabled and some wait-list event has timestamp packet nodes.
+    if ((CL_COMMAND_BARRIER == commandType || CL_COMMAND_MARKER == commandType) &&
+        getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+
+        for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
+            auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
+            if (waitlistEvent->getTimestampPacketNodes()) {
+                return true;
+            }
+        }
+    }
+    // Every other kernel-less command on a blocked queue falls through without a stream.
+    return false;
+}
 } // namespace NEO
--- a/runtime/command_queue/command_queue.h
+++ b/runtime/command_queue/command_queue.h
@ -314,7 +314,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {

    MOCKABLE_VIRTUAL void updateFromCompletionStamp(const CompletionStamp &completionStamp);

-    virtual bool isCacheFlushCommand(uint32_t commandType) { return false; }
+    virtual bool isCacheFlushCommand(uint32_t commandType) const { return false; }

    cl_int getCommandQueueInfo(cl_command_queue_info paramName,
                               size_t paramValueSize, void *paramValue,
@ -429,6 +429,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest);

    virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){};
+    bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const;

    MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies);
    void processProperties(const cl_queue_properties *properties);
--- a/runtime/command_queue/command_queue_hw.h
+++ b/runtime/command_queue/command_queue_hw.h
@ -367,7 +367,7 @@ class CommandQueueHw : public CommandQueue {
                          LinearStream *commandStream,
                          uint64_t postSyncAddress);

-    bool isCacheFlushCommand(uint32_t commandType) override;
+    bool isCacheFlushCommand(uint32_t commandType) const override;

  protected:
    MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
@ -389,10 +389,11 @@ class CommandQueueHw : public CommandQueue {
    LinearStream *obtainCommandStream(const CsrDependencies &csrDependencies, bool profilingRequired,
                                      bool perfCountersRequired, bool blitEnqueue, bool blockedQueue,
                                      const MultiDispatchInfo &multiDispatchInfo,
+                                      const EventsRequest &eventsRequest,
                                      std::unique_ptr<KernelOperation> &blockedCommandsData,
                                      Surface **surfaces, size_t numSurfaces) {
        LinearStream *commandStream = nullptr;
-        if (blockedQueue && !multiDispatchInfo.empty()) {
+        if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) {
            constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
            constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
            commandStream = new LinearStream();
--- a/runtime/command_queue/command_queue_hw_bdw_plus.inl
+++ b/runtime/command_queue/command_queue_hw_bdw_plus.inl
@ -17,7 +17,7 @@ void CommandQueueHw<GfxFamily>::submitCacheFlush(Surface **surfaces,
 }

 template <typename GfxFamily>
-bool CommandQueueHw<GfxFamily>::isCacheFlushCommand(uint32_t commandType) {
+bool CommandQueueHw<GfxFamily>::isCacheFlushCommand(uint32_t commandType) const {
    return false;
 }

--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@ -228,7 +228,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    }

    auto &commandStream = *obtainCommandStream<commandType>(csrDeps, profilingRequired, perfCountersRequired, blitEnqueue, blockQueue,
-                                                            multiDispatchInfo, blockedCommandsData, surfacesForResidency, numSurfaceForResidency);
+                                                            multiDispatchInfo, eventsRequest, blockedCommandsData, surfacesForResidency,
+                                                            numSurfaceForResidency);
    auto commandStreamStart = commandStream.getUsed();

    if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
@ -763,15 +764,16 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
        eventBuilder = &internalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
    }
+    auto outEvent = eventBuilder->getEvent();

    //update queue taskCount
-    taskCount = eventBuilder->getEvent()->getCompletionStamp();
+    taskCount = outEvent->getCompletionStamp();
+
+    std::unique_ptr<Command> command;
+    bool storeTimestampPackets = blockedCommandsData && timestampPacketContainer;

    if (multiDispatchInfo.empty()) {
-        DEBUG_BREAK_IF(!isCommandWithoutKernel(commandType));
-        auto cmd = std::make_unique<CommandMarker>(*this);
-
-        eventBuilder->getEvent()->setCommand(std::move(cmd));
+        command = std::make_unique<CommandMarker>(*this, blockedCommandsData);
    } else {
        //store task data in event
        std::vector<Surface *> allSurfaces;
@ -788,9 +790,8 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
            allSurfaces.push_back(surface->duplicate());
        }
        PreemptionMode preemptionMode = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
-        auto cmd = std::make_unique<CommandComputeKernel>(
-            *this,
-            std::move(blockedCommandsData),
+        command = std::make_unique<CommandComputeKernel>(*this,
+                                                         blockedCommandsData,
                                                         allSurfaces,
                                                         shouldFlushDC(commandType, printfHandler.get()),
                                                         slmUsed,
@ -799,17 +800,16 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
                                                         preemptionMode,
                                                         multiDispatchInfo.peekMainKernel(),
                                                         (uint32_t)multiDispatchInfo.size());
-
-        if (timestampPacketContainer.get()) {
+    }
+    if (storeTimestampPackets) {
        for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
            auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
            event->incRefInternal();
        }
-            cmd->setTimestampPacketNode(*timestampPacketContainer, *previousTimestampPacketNodes);
-        }
-        cmd->setEventsRequest(eventsRequest);
-        eventBuilder->getEvent()->setCommand(std::move(cmd));
+        command->setTimestampPacketNode(*timestampPacketContainer, *previousTimestampPacketNodes);
+        command->setEventsRequest(eventsRequest);
    }
+    outEvent->setCommand(std::move(command));

    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
    eventBuilder->addParentEvent(this->virtualEvent);
@ -819,7 +819,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
        this->virtualEvent->decRefInternal();
    }

-    this->virtualEvent = eventBuilder->getEvent();
+    this->virtualEvent = outEvent;
 }

 template <typename GfxFamily>
--- a/runtime/helpers/task_information.cpp
+++ b/runtime/helpers/task_information.cpp
@ -82,7 +82,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
    return completionStamp;
 }

-CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> kernelOperation, std::vector<Surface *> &surfaces,
+CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces,
                                           bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> printfHandler,
                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
    : Command(commandQueue, kernelOperation), flushDC(flushDC), slmUsed(usesSLM),
@ -135,12 +135,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    if (printfHandler) {
        printfHandler.get()->makeResident(commandStreamReceiver);
    }
-    if (currentTimestampPacketNodes) {
-        currentTimestampPacketNodes->makeResident(commandStreamReceiver);
-    }
-    if (previousTimestampPacketNodes) {
-        previousTimestampPacketNodes->makeResident(commandStreamReceiver);
-    }
+    makeTimestampPacketsResident();

    if (executionModelKernel) {
        uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
@ -224,10 +219,43 @@ CompletionStamp &CommandMarker::submit(uint32_t taskLevel, bool terminated) {
    }

    auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
+
+    if (!kernelOperation) {
        completionStamp.taskCount = commandStreamReceiver.peekTaskCount();
        completionStamp.taskLevel = commandStreamReceiver.peekTaskLevel();
        completionStamp.flushStamp = commandStreamReceiver.obtainCurrentFlushStamp();

+        return completionStamp;
+    }
+
+    auto lockCSR = commandStreamReceiver.obtainUniqueOwnership();
+
+    DispatchFlags dispatchFlags;
+    dispatchFlags.blocking = true;
+    dispatchFlags.lowPriority = commandQueue.getPriority() == QueuePriority::LOW;
+    dispatchFlags.throttle = commandQueue.getThrottle();
+    dispatchFlags.preemptionMode = commandQueue.getDevice().getPreemptionMode();
+    dispatchFlags.multiEngineQueue = commandQueue.isMultiEngineQueue();
+    dispatchFlags.guardCommandBufferWithPipeControl = true;
+    dispatchFlags.outOfOrderExecutionAllowed = commandStreamReceiver.isNTo1SubmissionModelEnabled();
+
+    UNRECOVERABLE_IF(!commandStreamReceiver.peekTimestampPacketWriteEnabled());
+
+    dispatchFlags.csrDependencies.fillFromEventsRequestAndMakeResident(eventsRequest, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
+
+    makeTimestampPacketsResident();
+
+    gtpinNotifyPreFlushTask(&commandQueue);
+
+    completionStamp = commandStreamReceiver.flushTask(*kernelOperation->commandStream,
+                                                      0,
+                                                      commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
+                                                      commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
+                                                      commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
+                                                      taskLevel,
+                                                      dispatchFlags,
+                                                      commandQueue.getDevice());
+
    return completionStamp;
 }

@ -259,6 +287,17 @@ Command::~Command() {
    }
 }

+void Command::makeTimestampPacketsResident() { // makes this command's timestamp packet containers resident on the queue's GPGPU command stream receiver
+    auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
+    // Each container is null-checked; only the ones that are set are made resident.
+    if (currentTimestampPacketNodes) {
+        currentTimestampPacketNodes->makeResident(commandStreamReceiver);
+    }
+    if (previousTimestampPacketNodes) {
+        previousTimestampPacketNodes->makeResident(commandStreamReceiver);
+    }
+}
+
 Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {}

 Command::Command(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation)
--- a/runtime/helpers/task_information.h
+++ b/runtime/helpers/task_information.h
@ -10,6 +10,7 @@
 #include "runtime/helpers/completion_stamp.h"
 #include "runtime/helpers/hw_info.h"
 #include "runtime/helpers/properties_helper.h"
+#include "runtime/helpers/timestamp_packet.h"
 #include "runtime/indirect_heap/indirect_heap.h"
 #include "runtime/utilities/iflist.h"

@ -92,6 +93,7 @@ class Command : public IFNode<Command> {
    }
    void setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &previous);
    void setEventsRequest(EventsRequest &eventsRequest);
+    void makeTimestampPacketsResident();

    TagNode<HwTimeStamps> *timestamp = nullptr;
    CompletionStamp completionStamp = {};
@ -122,7 +124,7 @@ class CommandMapUnmap : public Command {

 class CommandComputeKernel : public Command {
  public:
-    CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> kernelResources, std::vector<Surface *> &surfaces,
+    CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces,
                         bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> printfHandler,
                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount);

@ -146,7 +148,6 @@ class CommandComputeKernel : public Command {
 class CommandMarker : public Command {
  public:
    using Command::Command;
-
    CompletionStamp &submit(uint32_t taskLevel, bool terminated) override;
 };
 } // namespace NEO
--- a/unit_tests/command_queue/dispatch_walker_tests.cpp
+++ b/unit_tests/command_queue/dispatch_walker_tests.cpp
@ -745,8 +745,10 @@ HWTEST_F(DispatchWalkerTest, givenBlockedEnqueueWhenObtainingCommandStreamThenAl
    auto expectedSizeCS = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;

    CsrDependencies csrDependencies;
+    EventsRequest eventsRequest(0, nullptr, nullptr);
    auto cmdStream = mockCmdQ.template obtainCommandStream<CL_COMMAND_NDRANGE_KERNEL>(csrDependencies, false, false, false, true,
-                                                                                      multiDispatchInfo, blockedKernelData, nullptr, 0u);
+                                                                                      multiDispatchInfo, eventsRequest, blockedKernelData,
+                                                                                      nullptr, 0u);

    EXPECT_EQ(expectedSizeCS, cmdStream->getMaxAvailableSpace());
    EXPECT_EQ(expectedSizeCSAllocation, cmdStream->getGraphicsAllocation()->getUnderlyingBufferSize());
--- a/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp
+++ b/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp
@ -24,7 +24,7 @@ class MockCommandQueueWithCacheFlush : public MockCommandQueueHw<GfxFamily> {
    using MockCommandQueueHw<GfxFamily>::MockCommandQueueHw;

  public:
-    bool isCacheFlushCommand(uint32_t commandType) override {
+    bool isCacheFlushCommand(uint32_t commandType) const override {
        return commandRequireCacheFlush;
    }
    bool commandRequireCacheFlush = false;
--- a/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp
+++ b/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp
@ -1450,11 +1450,11 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke
    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);

-    auto blockedCommandsData = new KernelOperation(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandsData->setHeaps(dsh, ioh, ssh);

    std::vector<Surface *> surfaces;
-    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel.get(), 1));
+    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel.get(), 1));
    event->submitCommand(false);

    EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired);
--- a/unit_tests/event/event_builder_tests.cpp
+++ b/unit_tests/event/event_builder_tests.cpp
@ -72,8 +72,8 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) {
    class MockCommandComputeKernel : public CommandComputeKernel {
      public:
        using CommandComputeKernel::eventsWaitlist;
-        MockCommandComputeKernel(CommandQueue &commandQueue, KernelOperation *kernelResources, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, std::unique_ptr<KernelOperation>(kernelResources), surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
    };

    auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
@ -87,7 +87,7 @@ TEST(EventBuilder, givenVirtualEventWithCommandThenFinalizeAddChild) {
    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({1, GraphicsAllocation::AllocationType::COMMAND_BUFFER}));

    std::vector<Surface *> surfaces;
-    auto kernelOperation = new KernelOperation(cmdStream, *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
    kernelOperation->setHeaps(ih1, ih2, ih3);

    std::unique_ptr<MockCommandComputeKernel> command = std::make_unique<MockCommandComputeKernel>(cmdQ, kernelOperation, surfaces, kernel);
@ -121,8 +121,8 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA
    class MockCommandComputeKernel : public CommandComputeKernel {
      public:
        using CommandComputeKernel::eventsWaitlist;
-        MockCommandComputeKernel(CommandQueue &commandQueue, KernelOperation *kernelResources, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, std::unique_ptr<KernelOperation>(kernelResources), surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
    };

    auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
@ -136,7 +136,7 @@ TEST(EventBuilder, givenVirtualEventWithSubmittedCommandAsParentThenFinalizeNotA
    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER}));

    std::vector<Surface *> surfaces;
-    auto kernelOperation = new KernelOperation(cmdStream, *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
    kernelOperation->setHeaps(ih1, ih2, ih3);

    std::unique_ptr<MockCommandComputeKernel> command = std::make_unique<MockCommandComputeKernel>(cmdQ, kernelOperation, surfaces, kernel);
--- a/unit_tests/event/event_tests.cpp
+++ b/unit_tests/event/event_tests.cpp
@ -466,7 +466,7 @@ TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
    cmdQ.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    cmdQ.allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);

-    auto blockedCommandsData = new KernelOperation(cmdStream, *cmdQ.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *cmdQ.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandsData->setHeaps(dsh, ioh, ssh);

    MockKernelWithInternals mockKernelWithInternals(*pDevice);
@ -486,7 +486,7 @@ TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {

    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    v.push_back(bufferSurf);
-    auto cmd = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
    event.setCommand(std::unique_ptr<Command>(cmd));

    auto taskLevelBefore = csr.peekTaskLevel();
@ -518,7 +518,7 @@ TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
    cmdQ.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    cmdQ.allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);

-    auto blockedCommandsData = new KernelOperation(cmdStream, *cmdQ.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *cmdQ.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandsData->setHeaps(dsh, ioh, ssh);

    MockKernelWithInternals mockKernelWithInternals(*pDevice);
@ -529,7 +529,7 @@ TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
    NullSurface *surface = new NullSurface;
    v.push_back(surface);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
    event.setCommand(std::unique_ptr<Command>(cmd));

    auto taskLevelBefore = csr.peekTaskLevel();
@ -552,7 +552,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut
    cmdQ.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    cmdQ.allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);

-    auto blockedCommandsData = new KernelOperation(cmdStream, *cmdQ.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *cmdQ.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandsData->setHeaps(dsh, ioh, ssh);

    SPatchAllocateStatelessPrintfSurface *pPrintfSurface = new SPatchAllocateStatelessPrintfSurface();
@ -580,7 +580,7 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut

    std::vector<Surface *> v;
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, blockedCommandsData, v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
    event.setCommand(std::unique_ptr<Command>(cmd));

    event.submitCommand(false);
@ -902,8 +902,8 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCSROccurs) {
    class MockCommandComputeKernel : public CommandComputeKernel {
      public:
        using CommandComputeKernel::eventsWaitlist;
-        MockCommandComputeKernel(CommandQueue &commandQueue, KernelOperation *kernelResources, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, std::unique_ptr<KernelOperation>(kernelResources), surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
    };
    class MockEvent : public Event {
      public:
@ -922,7 +922,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCSROccurs) {
    auto cmdStream = new LinearStream(pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER}));

    std::vector<Surface *> surfaces;
-    auto kernelOperation = new KernelOperation(cmdStream, *pDevice->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *pDevice->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
    kernelOperation->setHeaps(ih1, ih2, ih3);

    std::unique_ptr<MockCommandComputeKernel> command = std::make_unique<MockCommandComputeKernel>(*pCmdQ, kernelOperation, surfaces, kernel);
@ -1473,11 +1473,11 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl
    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
-    auto blockedCommandsData = new KernelOperation(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandsData->setHeaps(dsh, ioh, ssh);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    std::vector<Surface *> v;
-    auto cmd = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(*pCmdQ, blockedCommandsData, v, false, false, false, nullptr, preemptionMode, pKernel, 1);
    event->setCommand(std::unique_ptr<Command>(cmd));

    FlushStamp expectedFlushStamp = 0;
--- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
@ -97,14 +97,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenLockedEMcritca
        size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSizeRequiredForExecutionModel(IndirectHeap::SURFACE_STATE, *parentKernel);

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@ -155,7 +155,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
        EXPECT_EQ(colorCalcSizeDevQueue, usedDSHBeforeSubmit);

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

@ -164,7 +164,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@ -196,7 +196,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
        dsh->getSpace(mockDevQueue.getDshOffset());

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

@ -205,7 +205,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@ -234,7 +234,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedParentK
        dsh->getSpace(mockDevQueue.getDshOffset());

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

@ -243,7 +243,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedParentK
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        auto timestamp = pCmdQ->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag();
        cmdComputeKernel->timestamp = timestamp;
@ -275,7 +275,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
        dsh->getSpace(mockDevQueue.getDshOffset());

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

@ -284,7 +284,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@ -328,14 +328,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenUsedCommandQue
        queueIoh.getSpace(usedSize);

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(cmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@ -376,14 +376,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenNotUsedSSHWhen
        void *sshBuffer = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getCpuBase();

        auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER});
-        KernelOperation *blockedCommandData = new KernelOperation(new LinearStream(cmdStreamAllocation),
+        auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
                                                                    *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
        blockedCommandData->setHeaps(dsh, ioh, ssh);

        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

--- a/unit_tests/helpers/task_information_tests.cpp
+++ b/unit_tests/helpers/task_information_tests.cpp
@ -29,7 +29,7 @@ TEST(CommandTest, mapUnmapSubmitWithoutTerminateFlagFlushesCsr) {

    MemObjSizeArray size = {{1, 1, 1}};
    MemObjOffsetArray offset = {{0, 0, 0}};
-    std::unique_ptr<Command> command(new CommandMapUnmap(MapOperationType::MAP, buffer, size, offset, false, *cmdQ.get()));
+    std::unique_ptr<Command> command(new CommandMapUnmap(MapOperationType::MAP, buffer, size, offset, false, *cmdQ));
    CompletionStamp completionStamp = command->submit(20, false);

    auto expectedTaskCount = initialTaskCount + 1;
@ -46,7 +46,7 @@ TEST(CommandTest, mapUnmapSubmitWithTerminateFlagAbortsFlush) {

    MemObjSizeArray size = {{1, 1, 1}};
    MemObjOffsetArray offset = {{0, 0, 0}};
-    std::unique_ptr<Command> command(new CommandMapUnmap(MapOperationType::MAP, buffer, size, offset, false, *cmdQ.get()));
+    std::unique_ptr<Command> command(new CommandMapUnmap(MapOperationType::MAP, buffer, size, offset, false, *cmdQ));
    CompletionStamp completionStamp = command->submit(20, true);

    auto submitTaskCount = csr.peekTaskCount();
@ -91,8 +91,8 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL
    class MockCommandComputeKernel : public CommandComputeKernel {
      public:
        using CommandComputeKernel::eventsWaitlist;
-        MockCommandComputeKernel(CommandQueue &commandQueue, KernelOperation *kernelResources, std::vector<Surface *> &surfaces, Kernel *kernel)
-            : CommandComputeKernel(commandQueue, std::unique_ptr<KernelOperation>(kernelResources), surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
+        MockCommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> &surfaces, Kernel *kernel)
+            : CommandComputeKernel(commandQueue, kernelOperation, surfaces, false, false, false, nullptr, PreemptionMode::Disabled, kernel, 0) {}
    };

    auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
@ -106,7 +106,7 @@ TEST(CommandTest, givenWaitlistRequestWhenCommandComputeKernelIsCreatedThenMakeL
    auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({1, GraphicsAllocation::AllocationType::COMMAND_BUFFER}));

    std::vector<Surface *> surfaces;
-    auto kernelOperation = new KernelOperation(cmdStream, *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
+    auto kernelOperation = std::make_unique<KernelOperation>(cmdStream, *device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage());
    kernelOperation->setHeaps(ih1, ih2, ih3);

    UserEvent event1, event2, event3;
--- a/unit_tests/helpers/timestamp_packet_tests.cpp
+++ b/unit_tests/helpers/timestamp_packet_tests.cpp
@ -17,6 +17,7 @@
 #include "unit_tests/helpers/hw_parse.h"
 #include "unit_tests/mocks/mock_command_queue.h"
 #include "unit_tests/mocks/mock_context.h"
+#include "unit_tests/mocks/mock_csr.h"
 #include "unit_tests/mocks/mock_device.h"
 #include "unit_tests/mocks/mock_execution_environment.h"
 #include "unit_tests/mocks/mock_kernel.h"
@ -1373,6 +1374,72 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingWithoutK
    cmdQ->isQueueBlocked();
 }

+HWTEST_F(TimestampPacketTests, givenBlockedEnqueueWithoutKernelWhenSubmittingThenDispatchBlockedCommands) {
+    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
+    // A marker/barrier enqueued behind an incomplete user event must still program semaphores for its waitlist.
+    auto mockCsr = new MockCsrHw2<FamilyType>(*device->getExecutionEnvironment());
+    device->resetCommandStreamReceiver(mockCsr);
+    mockCsr->timestampPacketWriteEnabled = true;
+    mockCsr->storeFlushedTaskStream = true; // keep a copy of the flushed task stream so it can be parsed below
+
+    auto cmdQ0 = clUniquePtr(new MockCommandQueueHw<FamilyType>(context, device.get(), nullptr));
+
+    auto &secondEngine = device->getEngine(aub_stream::ENGINE_RCS, true);
+    static_cast<UltCommandStreamReceiver<FamilyType> *>(secondEngine.commandStreamReceiver)->timestampPacketWriteEnabled = true;
+
+    auto cmdQ1 = clUniquePtr(new MockCommandQueueHw<FamilyType>(context, device.get(), nullptr));
+    cmdQ1->gpgpuEngine = &secondEngine;
+    cmdQ1->timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
+    EXPECT_NE(&cmdQ0->getGpgpuCommandStreamReceiver(), &cmdQ1->getGpgpuCommandStreamReceiver());
+
+    MockTimestampPacketContainer node0(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
+    MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
+
+    Event event0(cmdQ0.get(), 0, 0, 0); // on the same CSR
+    event0.addTimestampPacketNodes(node0);
+    Event event1(cmdQ1.get(), 0, 0, 0); // on different CSR
+    event1.addTimestampPacketNodes(node1);
+
+    uint32_t numEventsOnWaitlist = 3; // event0, event1 and the per-iteration user event
+
+    uint32_t commands[] = {CL_COMMAND_MARKER, CL_COMMAND_BARRIER};
+    for (int i = 0; i < 2; i++) {
+        UserEvent userEvent;
+        cl_event waitlist[] = {&event0, &event1, &userEvent};
+        if (commands[i] == CL_COMMAND_MARKER) {
+            cmdQ0->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, nullptr);
+        } else if (commands[i] == CL_COMMAND_BARRIER) {
+            cmdQ0->enqueueBarrierWithWaitList(numEventsOnWaitlist, waitlist, nullptr);
+        } else {
+            EXPECT_TRUE(false);
+        }
+
+        auto initialCsrStreamOffset = mockCsr->commandStream.getUsed(); // CSR stream is parsed from this offset onwards
+        userEvent.setStatus(CL_COMPLETE);
+
+        HardwareParse hwParserCsr;
+        HardwareParse hwParserCmdQ;
+        LinearStream taskStream(mockCsr->storedTaskStream.get(), mockCsr->storedTaskStreamSize);
+        taskStream.getSpace(mockCsr->storedTaskStreamSize); // NOTE(review): presumably marks the copied bytes as used for the parser - confirm
+        hwParserCsr.parseCommands<FamilyType>(mockCsr->commandStream, initialCsrStreamOffset);
+        hwParserCmdQ.parseCommands<FamilyType>(taskStream, 0);
+
+        auto queueSemaphores = findAll<MI_SEMAPHORE_WAIT *>(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end());
+        ASSERT_EQ(1u, queueSemaphores.size()); // fatal on mismatch: element 0 is dereferenced on the next line
+        verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(queueSemaphores[0])), node0.getNode(0));
+
+        auto csrSemaphores = findAll<MI_SEMAPHORE_WAIT *>(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end());
+        ASSERT_EQ(1u, csrSemaphores.size()); // fatal on mismatch: element 0 is dereferenced on the next line
+        verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(csrSemaphores[0])), node1.getNode(0));
+
+        EXPECT_TRUE(mockCsr->passedDispatchFlags.blocking);
+        EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl);
+        EXPECT_EQ(device->getPreemptionMode(), mockCsr->passedDispatchFlags.preemptionMode);
+
+        cmdQ0->isQueueBlocked(); // NOTE(review): return value unused; presumably called for its side effects - confirm
+    }
+}
+
 HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingMarkerWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
    auto device2 = std::unique_ptr<MockDevice>(Device::create<MockDevice>(executionEnvironment, 1u));
@ -1504,10 +1571,11 @@ HWTEST_F(TimestampPacketTests, givenBlockedQueueWhenEnqueueingBarrierThenRequest

    MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);

-    UserEvent userEvent;
-    cl_event waitlist[] = {&userEvent};
+    auto userEvent = make_releaseable<UserEvent>();
+    cl_event waitlist[] = {userEvent.get()};
    cmdQ.enqueueBarrierWithWaitList(1, waitlist, nullptr);
    EXPECT_TRUE(csr.stallingPipeControlOnNextFlushRequired);
+    userEvent->setStatus(CL_COMPLETE);
 }

 HWTEST_F(TimestampPacketTests, givenPipeControlRequestWhenEstimatingCsrStreamSizeThenAddSizeForPipeControl) {
--- a/unit_tests/mocks/mock_csr.h
+++ b/unit_tests/mocks/mock_csr.h
@ -193,11 +193,28 @@ class MockCsrHw2 : public CommandStreamReceiverHw<GfxFamily> {
                              const IndirectHeap &dsh, const IndirectHeap &ioh,
                              const IndirectHeap &ssh, uint32_t taskLevel, DispatchFlags &dispatchFlags, Device &device) override {
        passedDispatchFlags = dispatchFlags;
+
        recordedCommandBuffer = std::unique_ptr<CommandBuffer>(new CommandBuffer(device));
-        return CommandStreamReceiverHw<GfxFamily>::flushTask(commandStream, commandStreamStart,
+        auto completionStamp = CommandStreamReceiverHw<GfxFamily>::flushTask(commandStream, commandStreamStart,
                                                                             dsh, ioh, ssh, taskLevel, dispatchFlags, device);
+
+        if (storeFlushedTaskStream && commandStream.getUsed() > commandStreamStart) {
+            storedTaskStreamSize = commandStream.getUsed() - commandStreamStart;
+            // Overfetch so the command parser can verify whether a "big" command is programmed at the end of the allocation
+            auto overfetchedSize = storedTaskStreamSize + MemoryConstants::cacheLineSize;
+            storedTaskStream.reset(new uint8_t[overfetchedSize]);
+            memset(storedTaskStream.get(), 0, overfetchedSize);
+            memcpy_s(storedTaskStream.get(), storedTaskStreamSize,
+                     ptrOffset(commandStream.getCpuBase(), commandStreamStart), storedTaskStreamSize);
        }

+        return completionStamp;
+    }
+
+    bool storeFlushedTaskStream = false;          // when true, flushTask copies the flushed part of the command stream
+    std::unique_ptr<uint8_t[]> storedTaskStream;  // array form: the buffer comes from new[] and must be released with delete[]
+    size_t storedTaskStreamSize = 0;              // bytes copied from the command stream, excluding the zeroed overfetch padding
+
    int flushCalledCount = 0;
    std::unique_ptr<CommandBuffer> recordedCommandBuffer = nullptr;
    ResidencyContainer copyOfAllocations;