// Review commentary for this patch. It sits ahead of the first "diff --git" header so that
// no hunk content below is altered; every original line is reproduced unchanged.
//
// What the hunks below show:
//  * runtime/command_queue/enqueue_common.h - CommandQueueHw::enqueueCommandWithoutKernel now sets
//    dispatchFlags.guardCommandBufferWithPipeControl = true and assigns
//    dispatchFlags.outOfOrderExecutionAllowed from
//    getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled().
//  * unit_tests/command_queue/enqueue_barrier_tests.cpp and enqueue_kernel_event_tests.cpp - in the
//    peekTimestampPacketWriteEnabled() branch the CSR task level is now compared against
//    pCmdQ->taskLevel + 1 (previously 8u and pEvent2->taskLevel respectively).
//  * unit_tests/command_queue/enqueue_marker_tests.cpp - introduces initialTaskLevel = 7; in the
//    timestamp-packet branch pCmdQ->taskLevel is expected to equal initialTaskLevel and the CSR task
//    level initialTaskLevel + 1; the other branch keeps the previous CSR == CmdQ expectation.
//  * unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp - adds
//    EXPECT_TRUE(...guardCommandBufferWithPipeControl) to two tests, renames "...Implicitflush" to
//    "...ImplicitFlush", adds a test that toggles mockCsr->nTo1SubmissionModelEnabled and checks
//    passedDispatchFlags.outOfOrderExecutionAllowed after each enqueueCommandWithoutKernel call, and
//    changes the zero-surfaces test (renamed "...ThenProgramPipeControl") to expect getCS(0).getUsed()
//    to equal alignUp(PipeControlHelper::getSizeForPipeControlWithPostSyncOperation(),
//    MemoryConstants::cacheLineSize) instead of 0u.
//  * unit_tests/command_queue/get_size_required_tests.cpp - the marker and barrier tests expect that
//    same aligned size when peekTimestampPacketWriteEnabled() is true and 0 otherwise; the returned
//    event is released with clReleaseEvent(eventReturned) instead of delete (Event *)eventReturned.
//  * unit_tests/mocks/mock_csr.h - MockCsrHw2 gains
//    using CommandStreamReceiver::nTo1SubmissionModelEnabled, the member the new test assigns.
//
// NOTE(review): the added test name "...WithoutKernelTheAllowOutOfOrderExecution" looks like a typo
//   for "...ThenAllowOutOfOrderExecution" - confirm with the author before renaming.
// NOTE(review): "+MemoryConstants::cacheLineSize" in both get_size_required_tests.cpp hunks carries a
//   unary plus that the equivalent call in enqueue_command_without_kernel_tests.cpp does not have;
//   presumably unintentional - verify against the committed file.
// NOTE(review): template argument lists appear to be missing (e.g. "std::make_unique>(",
//   "std::unique_ptr> mockCmdQ") and hunk line breaks are collapsed; presumably an artifact of how
//   this patch text was captured - confirm against the original commit before applying it.
diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 4078f874e7..00781ee0c9 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -859,6 +859,9 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( dispatchFlags.multiEngineQueue = multiEngineQueue; dispatchFlags.preemptionMode = device->getPreemptionMode(); dispatchFlags.implicitFlush = blitEnqueue; + dispatchFlags.guardCommandBufferWithPipeControl = true; + dispatchFlags.outOfOrderExecutionAllowed = getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(); + if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { dispatchFlags.csrDependencies.fillFromEventsRequestAndMakeResident(eventsRequest, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); } diff --git a/unit_tests/command_queue/enqueue_barrier_tests.cpp b/unit_tests/command_queue/enqueue_barrier_tests.cpp index 69f96b3afc..31dcba3ebe 100644 --- a/unit_tests/command_queue/enqueue_barrier_tests.cpp +++ b/unit_tests/command_queue/enqueue_barrier_tests.cpp @@ -192,7 +192,7 @@ HWTEST_F(BarrierTest, eventWithWaitDependenciesShouldSync) { // in this case only cmdQ raises the taskLevel why csr stay intact EXPECT_EQ(8u, pCmdQ->taskLevel); if (csr.peekTimestampPacketWriteEnabled()) { - EXPECT_EQ(8u, commandStreamReceiver.peekTaskLevel()); + EXPECT_EQ(pCmdQ->taskLevel + 1, commandStreamReceiver.peekTaskLevel()); } else { EXPECT_EQ(7u, commandStreamReceiver.peekTaskLevel()); } diff --git a/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp b/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp index c16bf89f30..5ce400e311 100644 --- a/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp @@ -64,12 +64,13 @@ HWTEST_F(EnqueueHandlerTest, whenEnqueueCommandWithoutKernelThenPassCorrectDispa EXPECT_EQ(blocking, 
mockCsr->passedDispatchFlags.blocking); EXPECT_FALSE(mockCsr->passedDispatchFlags.implicitFlush); + EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl); EXPECT_EQ(mockCmdQ->isMultiEngineQueue(), mockCsr->passedDispatchFlags.multiEngineQueue); EXPECT_EQ(pDevice->getPreemptionMode(), mockCsr->passedDispatchFlags.preemptionMode); mockCmdQ->gpgpuEngine->commandStreamReceiver = oldCsr; } -HWTEST_F(EnqueueHandlerTest, givenBlitEnqueueWhenDispatchingCommandsWithoutKernelThenDoImplicitflush) { +HWTEST_F(EnqueueHandlerTest, givenBlitEnqueueWhenDispatchingCommandsWithoutKernelThenDoImplicitFlush) { auto executionEnvironment = pDevice->getExecutionEnvironment(); auto mockCsr = std::make_unique>(*executionEnvironment); auto mockCmdQ = std::make_unique>(context, pDevice, nullptr); @@ -86,16 +87,47 @@ HWTEST_F(EnqueueHandlerTest, givenBlitEnqueueWhenDispatchingCommandsWithoutKerne mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocking, true, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); EXPECT_TRUE(mockCsr->passedDispatchFlags.implicitFlush); + EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl); mockCmdQ->gpgpuEngine->commandStreamReceiver = oldCsr; } -HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelAndZeroSurfacesWhenEnqueuedHandlerThenUsedSizeEqualZero) { +HWTEST_F(EnqueueHandlerTest, givenN1EnabledWhenDispatchingWithoutKernelTheAllowOutOfOrderExecution) { + auto executionEnvironment = pDevice->getExecutionEnvironment(); + auto mockCsr = std::make_unique>(*executionEnvironment); + auto mockCmdQ = std::make_unique>(context, pDevice, nullptr); + mockCsr->setupContext(*mockCmdQ->gpgpuEngine->osContext); + mockCsr->initializeTagAllocation(); + auto oldCsr = mockCmdQ->gpgpuEngine->commandStreamReceiver; + mockCmdQ->gpgpuEngine->commandStreamReceiver = mockCsr.get(); + mockCsr->createPreemptionAllocation(); + TimestampPacketContainer previousTimestampPacketNodes; + EventsRequest 
eventsRequest(0, nullptr, nullptr); + EventBuilder eventBuilder; + + bool blocked = false; + + mockCsr->nTo1SubmissionModelEnabled = false; + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocked, true, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); + EXPECT_FALSE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed); + + mockCsr->nTo1SubmissionModelEnabled = true; + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocked, true, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); + EXPECT_TRUE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed); + + mockCmdQ->gpgpuEngine->commandStreamReceiver = oldCsr; +} + +HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelAndZeroSurfacesWhenEnqueuedHandlerThenProgramPipeControl) { std::unique_ptr> mockCmdQ(new MockCommandQueueWithCacheFlush(context, pDevice, 0)); mockCmdQ->commandRequireCacheFlush = true; mockCmdQ->template enqueueHandler(nullptr, 0, false, nullptr, 0, nullptr, nullptr); - EXPECT_EQ(mockCmdQ->getCS(0).getUsed(), 0u); + + auto requiredCmdStreamSize = alignUp(PipeControlHelper::getSizeForPipeControlWithPostSyncOperation(), + MemoryConstants::cacheLineSize); + + EXPECT_EQ(mockCmdQ->getCS(0).getUsed(), requiredCmdStreamSize); } HWTEST_F(EnqueueHandlerTest, givenTimestampPacketWriteEnabledAndCommandWithCacheFlushWhenEnqueueingHandlerThenObtainNewStamp) { auto &csr = pDevice->getUltCommandStreamReceiver(); diff --git a/unit_tests/command_queue/enqueue_kernel_event_tests.cpp b/unit_tests/command_queue/enqueue_kernel_event_tests.cpp index 91f60989f0..d722f7a4dd 100644 --- a/unit_tests/command_queue/enqueue_kernel_event_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_event_tests.cpp @@ -222,7 +222,7 @@ TEST_F(EventTests, eventPassedToEnqueueMarkerHasTheSameLevelAsPreviousCommand) { ASSERT_EQ(CL_SUCCESS, retVal); if (csr.peekTimestampPacketWriteEnabled()) { - EXPECT_EQ(csr.peekTaskLevel(), pEvent2->taskLevel); + 
EXPECT_EQ(csr.peekTaskLevel(), pCmdQ->taskLevel + 1); } else { EXPECT_EQ(csr.peekTaskLevel(), pEvent->taskLevel + 1); } diff --git a/unit_tests/command_queue/enqueue_marker_tests.cpp b/unit_tests/command_queue/enqueue_marker_tests.cpp index 597976a86b..8b7fa0e543 100644 --- a/unit_tests/command_queue/enqueue_marker_tests.cpp +++ b/unit_tests/command_queue/enqueue_marker_tests.cpp @@ -138,12 +138,13 @@ HWTEST_F(MarkerTest, returnedEventShouldHaveEqualDepthToLastCommandPacketInComma HWTEST_F(MarkerTest, eventWithWaitDependenciesShouldSync) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + uint32_t initialTaskLevel = 7; // In N:1, CSR is always highest task level. - commandStreamReceiver.taskLevel = 7; + commandStreamReceiver.taskLevel = initialTaskLevel; // In N:1, pCmdQ.level <= CSR.level - pCmdQ->taskLevel = 7; + pCmdQ->taskLevel = initialTaskLevel; // In N:1, event.level <= pCmdQ.level Event event1(pCmdQ, CL_COMMAND_NDRANGE_KERNEL, 5, 15); @@ -165,7 +166,12 @@ HWTEST_F(MarkerTest, eventWithWaitDependenciesShouldSync) { std::unique_ptr pEvent((Event *)(event)); // Should sync CSR & CmdQ levels. 
- EXPECT_EQ(commandStreamReceiver.peekTaskLevel(), pCmdQ->taskLevel); + if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + EXPECT_EQ(initialTaskLevel, pCmdQ->taskLevel); + EXPECT_EQ(initialTaskLevel + 1, commandStreamReceiver.peekTaskLevel()); + } else { + EXPECT_EQ(commandStreamReceiver.peekTaskLevel(), pCmdQ->taskLevel); + } EXPECT_EQ(pCmdQ->taskLevel, pEvent->taskLevel); EXPECT_EQ(7u, pEvent->taskLevel); } diff --git a/unit_tests/command_queue/get_size_required_tests.cpp b/unit_tests/command_queue/get_size_required_tests.cpp index 63b6f559c2..727ddf6677 100644 --- a/unit_tests/command_queue/get_size_required_tests.cpp +++ b/unit_tests/command_queue/get_size_required_tests.cpp @@ -71,12 +71,17 @@ HWTEST_F(GetSizeRequiredTest, enqueueMarker) { &eventReturned); EXPECT_EQ(CL_SUCCESS, retVal); - EXPECT_EQ(0u, commandStream.getUsed() - usedBeforeCS); + size_t expectedStreamSize = 0; + if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + expectedStreamSize = alignUp(PipeControlHelper::getSizeForPipeControlWithPostSyncOperation(), + +MemoryConstants::cacheLineSize); + } + EXPECT_EQ(expectedStreamSize, commandStream.getUsed() - usedBeforeCS); EXPECT_EQ(0u, dsh->getUsed() - usedBeforeDSH); EXPECT_EQ(0u, ioh->getUsed() - usedBeforeIOH); EXPECT_EQ(0u, ssh->getUsed() - usedBeforeSSH); - delete (Event *)eventReturned; + clReleaseEvent(eventReturned); } HWTEST_F(GetSizeRequiredTest, enqueueBarrierDoesntConsumeAnySpace) { @@ -92,9 +97,13 @@ HWTEST_F(GetSizeRequiredTest, enqueueBarrierDoesntConsumeAnySpace) { &eventReturned); EXPECT_EQ(CL_SUCCESS, retVal); - size_t expectedSize = 0; + size_t expectedStreamSize = 0; + if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + expectedStreamSize = alignUp(PipeControlHelper::getSizeForPipeControlWithPostSyncOperation(), + +MemoryConstants::cacheLineSize); + } - EXPECT_EQ(expectedSize, commandStream.getUsed() - usedBeforeCS); + 
EXPECT_EQ(expectedStreamSize, commandStream.getUsed() - usedBeforeCS); - delete (Event *)eventReturned; + clReleaseEvent(eventReturned); } diff --git a/unit_tests/mocks/mock_csr.h b/unit_tests/mocks/mock_csr.h index adbac88fa9..3ef504cbd0 100644 --- a/unit_tests/mocks/mock_csr.h +++ b/unit_tests/mocks/mock_csr.h @@ -162,6 +162,7 @@ class MockCsrHw2 : public CommandStreamReceiverHw { using CommandStreamReceiver::isPreambleSent; using CommandStreamReceiver::lastSentCoherencyRequest; using CommandStreamReceiver::mediaVfeStateDirty; + using CommandStreamReceiver::nTo1SubmissionModelEnabled; using CommandStreamReceiver::taskCount; using CommandStreamReceiver::taskLevel; using CommandStreamReceiver::timestampPacketWriteEnabled;