Pass eventWaitList to blocked command for semaphore programming

Change-Id: I8b56be03a7b89283f5368cf42d6788d70ebecdc7
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2018-09-19 10:34:33 -07:00 · 2018-09-19 10:34:33 -07:00 · e06b370697
parent 26006a8482
commit e06b370697
7 changed files with 91 additions and 82 deletions
--- a/runtime/command_queue/command_queue_hw.h
+++ b/runtime/command_queue/command_queue_hw.h
@@ -322,11 +322,11 @@ class CommandQueueHw : public CommandQueue {
                        bool &blocking,
                        const MultiDispatchInfo &multiDispatchInfo,
                        KernelOperation *blockedCommandsData,
-                        cl_uint numEventsInWaitList,
-                        const cl_event *eventWaitList,
+                        EventsRequest &eventsRequest,
                        bool slmUsed,
                        EventBuilder &externalEventBuilder,
                        std::unique_ptr<PrintfHandler> printfHandler);
+
  protected:
    MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
    MOCKABLE_VIRTUAL bool createAllocationForHostSurface(HostPtrSurface &surface);
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -292,6 +292,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        slmUsed = multiDispatchInfo.usesSlm();
    }

+    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
    CompletionStamp completionStamp;
    if (!blockQueue) {
        if (parentKernel) {
@@ -340,8 +341,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        auto submissionRequired = isCommandWithoutKernel(commandType) ? false : true;

        if (submissionRequired) {
-            EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
-
            completionStamp = enqueueNonBlocked<commandType>(
                surfacesForResidency,
                numSurfaceForResidency,
@@ -426,8 +425,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
            blocking,
            multiDispatchInfo,
            blockedCommandsData,
-            numEventsInWaitList,
-            eventWaitList,
+            eventsRequest,
            slmUsed,
            eventBuilder,
            std::move(printfHandler));
@@ -616,8 +614,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
    bool &blocking,
    const MultiDispatchInfo &multiDispatchInfo,
    KernelOperation *blockedCommandsData,
-    cl_uint numEventsInWaitList,
-    const cl_event *eventWaitList,
+    EventsRequest &eventsRequest,
    bool slmUsed,
    EventBuilder &externalEventBuilder,
    std::unique_ptr<PrintfHandler> printfHandler) {
@@ -678,7 +675,6 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
        auto kernelOperation = std::unique_ptr<KernelOperation>(blockedCommandsData); // marking ownership
        auto cmd = std::make_unique<CommandComputeKernel>(
            *this,
-            commandStreamReceiver,
            std::move(kernelOperation),
            allSurfaces,
            shouldFlushDC(commandType, printfHandler.get()),
@@ -692,10 +688,11 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
        if (timestampPacketNode) {
            cmd->setTimestampPacketNode(timestampPacketNode);
        }
+        cmd->setEventsRequest(eventsRequest);
        eventBuilder->getEvent()->setCommand(std::move(cmd));
    }

-    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitList));
+    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
    eventBuilder->addParentEvent(this->virtualEvent);
    eventBuilder->finalize();

--- a/runtime/helpers/task_information.cpp
+++ b/runtime/helpers/task_information.cpp
@@ -89,30 +89,21 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
    return completionStamp;
 }

-CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, CommandStreamReceiver &commandStreamReceiver,
-                                           std::unique_ptr<KernelOperation> kernelOperation, std::vector<Surface *> &surfaces,
+CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> kernelOperation, std::vector<Surface *> &surfaces,
                                           bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> printfHandler,
                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
-    : commandQueue(commandQueue),
-      commandStreamReceiver(commandStreamReceiver),
-      kernelOperation(std::move(kernelOperation)),
-      flushDC(flushDC),
-      slmUsed(usesSLM),
-      NDRangeKernel(ndRangeKernel),
-      printfHandler(std::move(printfHandler)),
-      kernel(nullptr),
-      kernelCount(0) {
+    : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)), flushDC(flushDC), slmUsed(usesSLM),
+      NDRangeKernel(ndRangeKernel), printfHandler(std::move(printfHandler)), kernel(kernel),
+      kernelCount(kernelCount), preemptionMode(preemptionMode) {
    for (auto surface : surfaces) {
        this->surfaces.push_back(surface);
    }
-    this->kernel = kernel;
    UNRECOVERABLE_IF(nullptr == this->kernel);
    kernel->incRefInternal();
-    this->kernelCount = kernelCount;
-    this->preemptionMode = preemptionMode;
 }

 CommandComputeKernel::~CommandComputeKernel() {
+    auto &commandStreamReceiver = commandQueue.getDevice().getCommandStreamReceiver();
    if (timestampPacketNode) {
        auto allocator = commandStreamReceiver.getMemoryManager()->getTimestampPacketAllocator();
        allocator->returnTag(timestampPacketNode);
@@ -131,6 +122,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    if (terminated) {
        return completionStamp;
    }
+    auto &commandStreamReceiver = commandQueue.getDevice().getCommandStreamReceiver();
    bool executionModelKernel = kernel->isParentKernel;
    auto devQueue = commandQueue.getContext().getDefaultDeviceQueue();

@@ -213,6 +205,9 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    dispatchFlags.throttle = commandQueue.getThrottle();
    dispatchFlags.preemptionMode = preemptionMode;
    dispatchFlags.mediaSamplerRequired = kernel->isVmeKernel();
+    if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
+        dispatchFlags.outOfDeviceDependencies = &eventsRequest;
+    }

    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);

--- a/runtime/helpers/task_information.h
+++ b/runtime/helpers/task_information.h
@@ -87,8 +87,7 @@ struct KernelOperation {

 class CommandComputeKernel : public Command {
  public:
-    CommandComputeKernel(CommandQueue &commandQueue, CommandStreamReceiver &commandStreamReceiver,
-                         std::unique_ptr<KernelOperation> kernelResources, std::vector<Surface *> &surfaces,
+    CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> kernelResources, std::vector<Surface *> &surfaces,
                         bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr<PrintfHandler> printfHandler,
                         PreemptionMode preemptionMode, Kernel *kernel = nullptr, uint32_t kernelCount = 0);

@@ -99,10 +98,10 @@ class CommandComputeKernel : public Command {
    LinearStream *getCommandStream() override { return kernelOperation->commandStream.get(); }

    void setTimestampPacketNode(TagNode<TimestampPacket> *node);
+    void setEventsRequest(EventsRequest &eventsRequest) { this->eventsRequest = eventsRequest; }

  private:
    CommandQueue &commandQueue;
-    CommandStreamReceiver &commandStreamReceiver;
    std::unique_ptr<KernelOperation> kernelOperation;
    std::vector<Surface *> surfaces;
    bool flushDC;
@@ -113,6 +112,7 @@ class CommandComputeKernel : public Command {
    uint32_t kernelCount;
    PreemptionMode preemptionMode;
    TagNode<TimestampPacket> *timestampPacketNode = nullptr;
+    EventsRequest eventsRequest = {0, nullptr, nullptr};
 };

 class CommandMarker : public Command {
--- a/unit_tests/event/event_tests.cpp
+++ b/unit_tests/event/event_tests.cpp
@@ -427,17 +427,17 @@ class SurfaceMock : public Surface {
 };

 TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
-    MockEvent<Event> event(nullptr, CL_COMMAND_NDRANGE_KERNEL, 0, 0);
-    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
+    CommandQueue cmdQ(mockContext, pDevice, nullptr);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_NDRANGE_KERNEL, 0, 0);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
-    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
+    cmdQ.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    cmdQ.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    cmdQ.allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
+                                                   UniqueIH(ioh), UniqueIH(ssh), *cmdQ.getDevice().getMemoryManager());

    MockKernelWithInternals mockKernelWithInternals(*pDevice);
    auto pKernel = mockKernelWithInternals.mockKernel;
@@ -448,7 +448,7 @@ TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
    surface->graphicsAllocation = new GraphicsAllocation((void *)0x1234, 100u);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    v.push_back(surface);
-    auto cmd = new CommandComputeKernel(*pCmdQ, csr, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
    event.setCommand(std::unique_ptr<Command>(cmd));

    auto taskLevelBefore = csr.peekTaskLevel();
@@ -458,7 +458,6 @@ TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
    auto taskLevelAfter = csr.peekTaskLevel();

    EXPECT_EQ(taskLevelBefore + 1, taskLevelAfter);
-    delete pCmdQ;

    EXPECT_EQ(surface->resident, 1u);
    EXPECT_FALSE(surface->graphicsAllocation->isResident(0u));
@@ -466,17 +465,17 @@ TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
 }

 TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
-    MockEvent<Event> event(nullptr, CL_COMMAND_NDRANGE_KERNEL, 0, 0);
-    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
+    CommandQueue cmdQ(mockContext, pDevice, nullptr);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_NDRANGE_KERNEL, 0, 0);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
-    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
+    cmdQ.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    cmdQ.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    cmdQ.allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
+                                                   UniqueIH(ioh), UniqueIH(ssh), *cmdQ.getDevice().getMemoryManager());

    MockKernelWithInternals mockKernelWithInternals(*pDevice);
    auto pKernel = mockKernelWithInternals.mockKernel;
@@ -486,7 +485,7 @@ TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
    NullSurface *surface = new NullSurface;
    v.push_back(surface);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(*pCmdQ, csr, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
    event.setCommand(std::unique_ptr<Command>(cmd));

    auto taskLevelBefore = csr.peekTaskLevel();
@@ -496,22 +495,21 @@ TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
    auto taskLevelAfter = csr.peekTaskLevel();

    EXPECT_EQ(taskLevelBefore, taskLevelAfter);
-    delete pCmdQ;
 }

 TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOutput) {
    testing::internal::CaptureStdout();
-    MockEvent<Event> event(nullptr, CL_COMMAND_NDRANGE_KERNEL, 0, 0);
-    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
+    CommandQueue cmdQ(mockContext, pDevice, nullptr);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_NDRANGE_KERNEL, 0, 0);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
-    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
+    cmdQ.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    cmdQ.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    cmdQ.allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
+                                                   UniqueIH(ioh), UniqueIH(ssh), *cmdQ.getDevice().getMemoryManager());

    SPatchAllocateStatelessPrintfSurface *pPrintfSurface = new SPatchAllocateStatelessPrintfSurface();
    pPrintfSurface->DataParamOffset = 0;
@@ -542,10 +540,9 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut
    printfSurface[0] = 8;
    printfSurface[1] = 0;

-    auto &csr = pDevice->getCommandStreamReceiver();
    std::vector<Surface *> v;
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmd = new CommandComputeKernel(*pCmdQ, csr, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, std::move(printfHandler), preemptionMode, pKernel, 1);
    event.setCommand(std::unique_ptr<Command>(cmd));

    event.submitCommand(false);
@@ -555,7 +552,6 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut
    EXPECT_FALSE(surface->isResident(0u));

    delete pPrintfSurface;
-    delete pCmdQ;
 }

 TEST_F(InternalsEventTest, processBlockedCommandsMapOperation) {
@@ -1431,7 +1427,7 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl
                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    std::vector<Surface *> v;
-    auto cmd = new CommandComputeKernel(*pCmdQ, csr, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
+    auto cmd = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode, pKernel, 1);
    event->setCommand(std::unique_ptr<Command>(cmd));

    FlushStamp expectedFlushStamp = 0;
--- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
@@ -104,8 +104,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenLockedEMcritcalSectionWhenParentK
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@@ -166,8 +165,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@@ -209,8 +207,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@@ -249,8 +246,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedParentKernelWithProfilingW
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        HwTimeStamps timestamp;

@@ -293,8 +289,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@@ -346,8 +341,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenUsedCommandQue
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(cmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(cmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

@@ -396,8 +390,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenNotUsedSSHWhen
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
        std::vector<Surface *> surfaces;
-        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, device->getCommandStreamReceiver(),
-                                                          std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
+        auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);

        cmdComputeKernel->submit(0, false);

--- a/unit_tests/helpers/timestamp_packet_tests.cpp
+++ b/unit_tests/helpers/timestamp_packet_tests.cpp
@@ -68,6 +68,15 @@ struct TimestampPacketTests : public TimestampPacketSimpleTests {
        mockCmdQ = std::make_unique<MockCommandQueue>(context.get(), device.get(), nullptr);
    }

+    template <typename MI_SEMAPHORE_WAIT>
+    void verifySemaphore(MI_SEMAPHORE_WAIT *semaphoreCmd, Event *compareEvent) {
+        EXPECT_NE(nullptr, semaphoreCmd);
+        EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
+        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
+        EXPECT_EQ(compareEvent->getTimestampPacketNode()->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd),
+                  semaphoreCmd->getSemaphoreGraphicsAddress());
+    };
+
    ExecutionEnvironment executionEnvironment;
    std::unique_ptr<MockDevice> device;
    std::unique_ptr<MockContext> context;
@@ -363,14 +372,6 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingThe
    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(cmdStream, 0);

-    auto verifySemaphore = [](MI_SEMAPHORE_WAIT *semaphoreCmd, Event *compareEvent) {
-        EXPECT_NE(nullptr, semaphoreCmd);
-        EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
-        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(compareEvent->getTimestampPacketNode()->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd),
-                  semaphoreCmd->getSemaphoreGraphicsAddress());
-    };
-
    auto it = hwParser.cmdList.begin();
    verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*it++), &event4);
    verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*it++), &event6);
@@ -381,6 +382,40 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingThe
    }
 }

+HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingBlockedThenProgramSemaphoresOnCsrStreamOnFlush) {
+    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
+    auto device2 = std::unique_ptr<MockDevice>(Device::create<MockDevice>(nullptr, &executionEnvironment, 1u));
+
+    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
+    MockContext context2(device2.get());
+
+    auto cmdQ1 = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    auto cmdQ2 = std::make_unique<MockCommandQueueHw<FamilyType>>(&context2, device2.get(), nullptr);
+
+    UserEvent userEvent;
+    Event event0(cmdQ1.get(), 0, 0, 0);
+    event0.setTimestampPacketNode(executionEnvironment.memoryManager->getTimestampPacketAllocator()->getTag());
+    Event event1(cmdQ2.get(), 0, 0, 0);
+    event1.setTimestampPacketNode(executionEnvironment.memoryManager->getTimestampPacketAllocator()->getTag());
+
+    cl_event waitlist[] = {&userEvent, &event0, &event1};
+    cmdQ1->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 3, waitlist, nullptr);
+    auto &cmdStream = device->getUltCommandStreamReceiver<FamilyType>().commandStream;
+    EXPECT_EQ(0u, cmdStream.getUsed());
+    userEvent.setStatus(CL_COMPLETE);
+
+    HardwareParse hwParser;
+    hwParser.parseCommands<FamilyType>(cmdStream, 0);
+
+    auto it = hwParser.cmdList.begin();
+    verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*it++), &event1);
+
+    while (it != hwParser.cmdList.end()) {
+        EXPECT_EQ(nullptr, genCmdCast<MI_SEMAPHORE_WAIT *>(*it));
+        it++;
+    }
+}
+
 HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingThenProgramSemaphoresForWaitlist) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
    using WALKER = WALKER_TYPE<FamilyType>;
@@ -429,13 +464,6 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingTh
    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(cmdStream, 0);

-    auto verifySemaphore = [](MI_SEMAPHORE_WAIT *semaphoreCmd, Event *compareEvent) {
-        EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
-        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
-        EXPECT_EQ(compareEvent->getTimestampPacketNode()->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd),
-                  semaphoreCmd->getSemaphoreGraphicsAddress());
-    };
-
    uint32_t semaphoresFound = 0;
    uint32_t walkersFound = 0;