diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index ad3abf2a31..041ef23726 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -196,7 +196,9 @@ bool CommandQueueHw::isGpgpuSubmissionForBcsRequired(bool queueBlocked, return true; } - bool required = (latestSentEnqueueType != EnqueueProperties::Operation::Blit) && (latestSentEnqueueType != EnqueueProperties::Operation::None); + /* A GPGPU submission is required only when the latest sent enqueue was neither a blit nor none, and additionally either a cache flush for BCS is required or the GPGPU CSR's latest task count is not yet flushed. */ bool required = (latestSentEnqueueType != EnqueueProperties::Operation::Blit) && + (latestSentEnqueueType != EnqueueProperties::Operation::None) && + (isCacheFlushForBcsRequired() || !getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed()); if (DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.get() == 1) { required = true; diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp index 1c497717d0..1fdb1e4006 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp @@ -1750,6 +1750,44 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushN EXPECT_EQ(EnqueueProperties::Operation::GpuKernel, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); + /* Two blits issued after the kernel enqueue: the GPGPU task count is expected to stay at 1 for both. */ commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); + EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); + + commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); + EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); +} + +HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, 
givenCacheFlushNotRequiredAndEnqueueNotFlushedWhenDoingBcsCopyThenSubmitOnlyOnceAfterEnqueue) { + /* Expected GPGPU task counts in this test: 0 after each of the first two blits, 1 after the kernel enqueue, and 2 after the blit that follows the kernel. */ auto mockCommandQueue = static_cast *>(commandQueue.get()); + EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); + + DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); + DebugManager.flags.PerformImplicitFlushForNewResource.set(0); + DebugManager.flags.PerformImplicitFlushForIdleGpu.set(0); + + /* NOTE(review): presumably this mock override makes isCacheFlushForBcsRequired() report false - confirm against the mock. The GPGPU CSR is then switched to batched dispatch. */ mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; + mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = false; + mockCommandQueue->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch); + mockCommandQueue->getGpgpuCommandStreamReceiver().postInitFlagsSetup(); + + auto buffer = createBuffer(1, false); + buffer->forceDisallowCPUCopy = true; + int hostPtr = 0; + + commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); + EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); + + commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); + EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); + + commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(EnqueueProperties::Operation::GpuKernel, mockCommandQueue->latestSentEnqueueType); + EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); + commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp index 102312e73c..137e2f267b 100644 
--- a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp @@ -1550,10 +1550,11 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsTh EXPECT_EQ(0, gpgpuCsr.ensureCommandBufferAllocationCalled); } -HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterKernelThenGpgpuCommandStreamIsObtained) { +HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterNotFlushedKernelThenGpgpuCommandStreamIsObtained) { auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver(); auto srcBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; auto dstBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; + /* NOTE(review): presumably batched dispatch leaves the kernel enqueue unflushed, so the following blit still obtains the GPGPU command stream - confirm. */ pCmdQ->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch); MockKernelWithInternals mockKernelWithInternals(*pClDevice); size_t offset = 0; @@ -1576,6 +1577,35 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterKernelTh EXPECT_NE(ensureCommandBufferAllocationCalledAfterKernel, gpgpuCsr.ensureCommandBufferAllocationCalled); } +HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterFlushedKernelThenGpgpuCommandStreamIsNotObtained) { + /* Enqueues a kernel and then a copy-buffer, and expects ensureCommandBufferAllocationCalled to be unchanged by the copy. */ auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver(); + auto srcBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; + auto dstBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; + + DebugManagerStateRestore restorer; + DebugManager.flags.ForceCacheFlushForBcs.set(0); + + MockKernelWithInternals mockKernelWithInternals(*pClDevice); + size_t offset = 0; + size_t size = 1; + cl_int retVal = pCmdQ->enqueueKernel(mockKernelWithInternals.mockKernel, 1, &offset, &size, &size, 0, nullptr, nullptr); + ASSERT_EQ(CL_SUCCESS, retVal); + EXPECT_NE(0, gpgpuCsr.ensureCommandBufferAllocationCalled); + /* Snapshot the counter after the kernel so the copy-buffer's effect can be compared against it. */ const auto ensureCommandBufferAllocationCalledAfterKernel = gpgpuCsr.ensureCommandBufferAllocationCalled; + + retVal = 
pCmdQ->enqueueCopyBuffer( + srcBuffer.get(), + dstBuffer.get(), + 0, + 0, + 1, + 0, + nullptr, + nullptr); + ASSERT_EQ(CL_SUCCESS, retVal); + /* The copy-buffer must leave the counter at its post-kernel value. */ EXPECT_EQ(ensureCommandBufferAllocationCalledAfterKernel, gpgpuCsr.ensureCommandBufferAllocationCalled); +} + HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitAfterBarrierWhenEnqueueingCommandThenWaitForBarrierOnBlit) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 2acc4d0897..8632c26c6a 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -192,6 +192,10 @@ class CommandStreamReceiver { bool peekTimestampPacketWriteEnabled() const { return timestampPacketWriteEnabled; } + /* Returns true when peekLatestFlushedTaskCount() equals peekTaskCount(). */ bool isLatestTaskCountFlushed() { + return this->peekLatestFlushedTaskCount() == this->peekTaskCount(); + } + size_t defaultSshSize = 0u; bool canUse4GbHeaps = true;