From bdeccab7aaa09f120229ae9353e908eb50bbe07c Mon Sep 17 00:00:00 2001 From: Dominik Dabek Date: Thu, 11 Jul 2024 08:48:10 +0000 Subject: [PATCH] fix: bcs enqueue after marker properly waits For an example sequence of: IOQ_1 -> enqueue copy, enqueue marker with waitlist (out event) IOQ_2 -> enqueue marker with waitlist (event), enqueue copy Add missing synchronization between the enqueue copies Related-To: NEO-11694 Signed-off-by: Dominik Dabek --- .../command_queue/command_queue_hw_base.inl | 20 ++++- opencl/source/command_queue/enqueue_common.h | 2 +- .../command_queue_hw_2_tests.cpp | 85 +++++++++++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index 50b40e5d44..e4dfa3afbe 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -209,10 +209,22 @@ bool CommandQueueHw::isGpgpuSubmissionForBcsRequired(bool queueBlocked, if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) { return true; } - - bool required = (latestSentEnqueueType != EnqueueProperties::Operation::blit) && - (latestSentEnqueueType != EnqueueProperties::Operation::none) && - (isCacheFlushForBcsRequired() || !(getGpgpuCommandStreamReceiver().getDispatchMode() == DispatchMode::immediateDispatch || getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed())); + bool required = false; + switch (latestSentEnqueueType) { + case NEO::EnqueueProperties::Operation::explicitCacheFlush: + case NEO::EnqueueProperties::Operation::enqueueWithoutSubmission: + case NEO::EnqueueProperties::Operation::gpuKernel: + case NEO::EnqueueProperties::Operation::profilingOnly: + required = isCacheFlushForBcsRequired() || !(getGpgpuCommandStreamReceiver().getDispatchMode() == DispatchMode::immediateDispatch || getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed()); + break; + case NEO::EnqueueProperties::Operation::dependencyResolveOnGpu: + return true; + break; + case NEO::EnqueueProperties::Operation::none: + case NEO::EnqueueProperties::Operation::blit: + default: + break; + } if (debugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.get() == 1) { required = true; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 3b6b90a5aa..42f2f7c9ce 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -1461,7 +1461,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies); - if (isCacheFlushForBcsRequired() && gpgpuSubmission) { + if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) { timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); } diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index ea5aac2eb3..a3df03e3dc 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -8,6 +8,7 @@ #include "shared/source/helpers/timestamp_packet.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" +#include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/mocks/mock_builtins.h" #include "shared/test/common/mocks/mock_csr.h" #include "shared/test/common/mocks/mock_memory_manager.h" @@ -72,6 +73,90 @@ struct MockBuilder : BuiltinDispatchInfoBuilder { Params paramsToUse; }; +using MultiIoqCmdQSynchronizationTest = CommandQueueHwBlitTest; + +HWTEST_F(MultiIoqCmdQSynchronizationTest, givenTwoIoqCmdQsWhenEnqueuesSynchronizedWithMarkersThenCorrectSynchronizationIsApplied) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + if (pCmdQ->getTimestampPacketContainer() == nullptr) { + GTEST_SKIP(); + } + + auto buffer = std::unique_ptr{BufferHelper<>::create(pContext)}; + char ptr[1] = {}; + + auto pCmdQ2 = createCommandQueue(pClDevice); + cl_event outEvent; + size_t offset = 0; + cl_int status; + status = pCmdQ->enqueueWriteBuffer(buffer.get(), CL_FALSE, offset, 1u, ptr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, status); + + status = pCmdQ->enqueueMarkerWithWaitList(0, nullptr, &outEvent); + EXPECT_EQ(CL_SUCCESS, status); + + auto node = castToObject(outEvent)->getTimestampPacketNodes()->peekNodes().at(0); + const auto nodeGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*node); + + auto cmdQ2Start = pCmdQ2->getCS(0).getUsed(); + status = pCmdQ2->enqueueMarkerWithWaitList(1, &outEvent, nullptr); + EXPECT_EQ(CL_SUCCESS, status); + + auto bcsStart = pCmdQ2->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->getCS(0).getUsed(); + status = pCmdQ2->enqueueReadBuffer(buffer.get(), CL_FALSE, offset, 1u, ptr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, status); + + uint64_t bcsSemaphoreAddress = 0x0; + + { + LinearStream &bcsStream = pCmdQ2->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->getCS(0); + HardwareParse bcsHwParser; + bcsHwParser.parseCommands(bcsStream, bcsStart); + auto semaphoreCmdBcs = genCmdCast(*bcsHwParser.cmdList.begin()); + EXPECT_NE(nullptr, semaphoreCmdBcs); + EXPECT_EQ(1u, semaphoreCmdBcs->getSemaphoreDataDword()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmdBcs->getCompareOperation()); + bcsSemaphoreAddress = semaphoreCmdBcs->getSemaphoreGraphicsAddress(); + } + + { + LinearStream &cmdQ2Stream = pCmdQ2->getCS(0); + HardwareParse ccsHwParser; + ccsHwParser.parseCommands(cmdQ2Stream, cmdQ2Start); + const auto semaphoreCcsItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + auto semaphoreCmd = genCmdCast(*semaphoreCcsItor); + ASSERT_NE(nullptr, semaphoreCmd); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(nodeGpuAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation()); + + bool pipeControlForBcsSemaphoreFound = false; + auto pipeControlsAfterSemaphore = findAll(semaphoreCcsItor, ccsHwParser.cmdList.end()); + for (auto pipeControlIter : pipeControlsAfterSemaphore) { + auto pipeControlCmd = genCmdCast(*pipeControlIter); + if (0u == pipeControlCmd->getImmediateData() && + PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA == pipeControlCmd->getPostSyncOperation() && + NEO::UnitTestHelper::getPipeControlPostSyncAddress(*pipeControlCmd) == bcsSemaphoreAddress) { + pipeControlForBcsSemaphoreFound = true; + break; + } + } + EXPECT_TRUE(pipeControlForBcsSemaphoreFound); + } + + EXPECT_EQ(CL_SUCCESS, pCmdQ->finish()); + EXPECT_EQ(CL_SUCCESS, pCmdQ2->finish()); + + clReleaseEvent(outEvent); + // tearDown + if (pCmdQ2) { + auto blocked = pCmdQ2->isQueueBlocked(); + UNRECOVERABLE_IF(blocked); + pCmdQ2->release(); + } +} + struct BuiltinParamsCommandQueueHwTests : public CommandQueueHwTest { void setUpImpl(EBuiltInOps::Type operation) {