diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 34c9268e6f..ef43dee3e1 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -452,6 +452,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { }; std::array bcsTimestampPacketContainers; bool stallingCommandsOnNextFlushRequired = false; + bool splitBarrierRequired = false; }; template diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 470b5559e6..68a2fe0ea4 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -258,6 +258,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) { if (CL_COMMAND_BARRIER == commandType) { setStallingCommandsOnNextFlush(true); + this->splitBarrierRequired = true; } for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) { @@ -1245,7 +1246,7 @@ cl_int CommandQueueHw::enqueueBlitSplit(MultiDispatchInfo &dispatchIn di, numEventsInWaitList, eventWaitList, - event); + nullptr); DEBUG_BREAK_IF(ret != CL_SUCCESS); } @@ -1259,6 +1260,9 @@ cl_int CommandQueueHw::enqueueBlitSplit(MultiDispatchInfo &dispatchIn auto remainingSize = size; for (size_t i = 0; i < copyEngines.size(); i++) { + if (isOOQEnabled() && this->splitBarrierRequired) { + this->setStallingCommandsOnNextFlush(true); + } auto localSize = remainingSize / (copyEngines.size() - i); auto localParams = dispatchInfo.peekBuiltinOpParams(); localParams.size.x = localSize; @@ -1282,6 +1286,7 @@ cl_int CommandQueueHw::enqueueBlitSplit(MultiDispatchInfo &dispatchIn } this->timestampPacketContainer->swapNodes(splitNodes); + this->splitBarrierRequired = false; queueOwnership.unlock(); for (auto &lock : locks) { diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index bb24cec171..a6b533e901 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -420,6 +420,7 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadWithRequeste memoryManager->returnFakeAllocation = true; auto cmdQHw = static_cast *>(this->pCmdQ); cmdQHw->setStallingCommandsOnNextFlush(true); + cmdQHw->splitBarrierRequired = true; std::unique_ptr osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), pDevice->getRootDeviceIndex(), 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular}, @@ -456,6 +457,68 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadWithRequeste EXPECT_EQ(csr2->peekTaskCount(), 2u); EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 4u); EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u); + EXPECT_FALSE(cmdQHw->splitBarrierRequired); + + pCmdQ->release(); + pCmdQ = nullptr; +} + +HWTEST_F(OoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueBarrierNonSplitCopyAndSplitCopyThenSplitWaitCorrectly) { + DebugManagerStateRestore restorer; + DebugManager.flags.DoCpuCopyOnReadBuffer.set(0); + DebugManager.flags.UpdateTaskCountFromWait.set(3); + auto memoryManager = static_cast(pDevice->getMemoryManager()); + memoryManager->returnFakeAllocation = true; + auto cmdQHw = static_cast *>(this->pCmdQ); + cmdQHw->setStallingCommandsOnNextFlush(true); + cmdQHw->splitBarrierRequired = true; + + std::unique_ptr osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), pDevice->getRootDeviceIndex(), 0, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield()))); + auto csr1 = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + csr1->setupContext(*osContext1); + csr1->initializeTagAllocation(); + EngineControl control1(csr1.get(), osContext1.get()); + + std::unique_ptr osContext2(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), pDevice->getRootDeviceIndex(), 0, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS3, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, pDevice->getDeviceBitfield()))); + auto csr2 = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + csr2->setupContext(*osContext2); + csr2->initializeTagAllocation(); + EngineControl control2(csr2.get(), osContext2.get()); + + cmdQHw->bcsEngines[1] = &control1; + cmdQHw->bcsEngines[3] = &control2; + + BcsSplitBufferTraits::context = context; + auto buffer = clUniquePtr(BufferHelper::create()); + static_cast(buffer->getGraphicsAllocation(0u))->memoryPool = MemoryPool::LocalMemory; + char ptr[1] = {}; + + EXPECT_EQ(csr1->peekTaskCount(), 0u); + EXPECT_EQ(csr2->peekTaskCount(), 0u); + EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u); + EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u); + + EXPECT_EQ(CL_SUCCESS, cmdQHw->enqueueReadBuffer(buffer.get(), CL_TRUE, 0, 16 * MemoryConstants::megaByte, ptr, nullptr, 0, nullptr, nullptr)); + + EXPECT_EQ(csr1->peekTaskCount(), 0u); + EXPECT_EQ(csr2->peekTaskCount(), 0u); + EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u); + EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 2u); + EXPECT_TRUE(cmdQHw->splitBarrierRequired); + + DebugManager.flags.SplitBcsCopy.set(1); + + EXPECT_EQ(CL_SUCCESS, cmdQHw->enqueueReadBuffer(buffer.get(), CL_TRUE, 0, 16 * MemoryConstants::megaByte, ptr, nullptr, 0, nullptr, nullptr)); + + EXPECT_EQ(csr1->peekTaskCount(), 2u); + EXPECT_EQ(csr2->peekTaskCount(), 2u); + EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 4u); + EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 2u); + EXPECT_FALSE(cmdQHw->splitBarrierRequired); pCmdQ->release(); pCmdQ = nullptr; @@ -519,6 +582,7 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadWithRequeste memoryManager->returnFakeAllocation = true; auto cmdQHw = static_cast *>(this->pCmdQ); cmdQHw->setStallingCommandsOnNextFlush(true); + cmdQHw->splitBarrierRequired = true; std::unique_ptr osContext1(OsContext::create(pDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), pDevice->getRootDeviceIndex(), 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular}, @@ -555,6 +619,7 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenEnqueueReadWithRequeste EXPECT_EQ(csr2->peekTaskCount(), 2u); EXPECT_EQ(cmdQHw->getGpgpuCommandStreamReceiver().peekTaskCount(), 2u); EXPECT_EQ(cmdQHw->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->peekTaskCount(), 0u); + EXPECT_FALSE(cmdQHw->splitBarrierRequired); pCmdQ->release(); pCmdQ = nullptr; diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 09a7de634c..b810686021 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -257,6 +257,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::processDispatchForKernels; using BaseClass::relaxedOrderingForGpgpuAllowed; using BaseClass::requiresCacheFlushAfterWalker; + using BaseClass::splitBarrierRequired; using BaseClass::throttle; using BaseClass::timestampPacketContainer;