From ace883ca55912613a6167df197b7c006153f48dc Mon Sep 17 00:00:00 2001 From: Szymon Morek Date: Thu, 25 Jul 2024 15:45:32 +0000 Subject: [PATCH] performance: don't flush gpgpu if not required Related-To: NEO-12124 If queue is OOQ and there are no cross-engine dependencies, don't flush CCS before submitting copy on BCS. Signed-off-by: Szymon Morek --- opencl/source/command_queue/command_queue_hw.h | 2 +- .../command_queue/command_queue_hw_base.inl | 6 +++++- opencl/source/command_queue/enqueue_common.h | 2 +- opencl/source/helpers/properties_helper.cpp | 1 + .../command_queue/command_queue_tests.cpp | 18 ++++++++++++++++++ .../helpers/timestamp_packet_1_tests.cpp | 2 +- .../test/unit_test/mocks/mock_command_queue.h | 5 +++-- shared/source/command_stream/csr_deps.h | 1 + 8 files changed, 31 insertions(+), 6 deletions(-) diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 0d29303d59..9c9861ffaa 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -521,7 +521,7 @@ class CommandQueueHw : public CommandQueue { TimestampPacketDependencies &timestampPacketDependencies, bool relaxedOrderingEnabled); - MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const; + MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const; void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType); bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo); diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index e4dfa3afbe..4ee4cd0fe0 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -205,10 +205,14 @@ 
void CommandQueueHw::setupBlitAuxTranslation(MultiDispatchInfo &multiDis } template -bool CommandQueueHw::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const { +bool CommandQueueHw::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const { if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) { return true; } + if (isOOQEnabled()) { + return containsCrossEngineDependency; + } + bool required = false; switch (latestSentEnqueueType) { case NEO::EnqueueProperties::Operation::explicitCacheFlush: diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 42f2f7c9ce..11f7a21d06 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -1460,7 +1460,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr); } - auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies); + auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies, csrDeps.containsCrossEngineDependency); if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) { timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); } diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp index 494dfb866a..4f2327f8e9 100644 --- a/opencl/source/helpers/properties_helper.cpp +++ b/opencl/source/helpers/properties_helper.cpp @@ -67,6 +67,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci } csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr); } + 
csrDeps.containsCrossEngineDependency = true; } } } diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 502a007369..ed4c18dd14 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -3321,4 +3321,22 @@ HWTEST_F(CsrSelectionCommandQueueWithBlitterTests, givenImageFromBufferThenBcsAl } else { EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args)); } +} + +HWTEST_F(CommandQueueTests, GivenOOQCommandQueueWhenIsGpgpuSubmissionForBcsRequiredCalledThenReturnCorrectValue) { + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); + MockContext context(device.get()); + auto mockCmdQ = std::make_unique>(&context, context.getDevice(0), nullptr); + mockCmdQ->latestSentEnqueueType = EnqueueProperties::Operation::gpuKernel; + mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; + mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true; + TimestampPacketDependencies dependencies{}; + auto containsCrossEngineDependency = false; + EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency)); + + mockCmdQ->setOoqEnabled(); + EXPECT_FALSE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency)); + + containsCrossEngineDependency = true; + EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency)); } \ No newline at end of file diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index ba3ff2802d..0e77efbe6f 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -170,7 +170,7 @@ HWTEST_F(TimestampPacketTests, 
givenCrossCsrDependenciesWhenFillCsrDepsThendepen } else { EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u); } - + EXPECT_TRUE(csrDeps.containsCrossEngineDependency); mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1; *mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1; mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1; diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index bec4594816..0b379b6fc3 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -282,6 +282,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::heaplessStateInitEnabled; using BaseClass::isBlitAuxTranslationRequired; using BaseClass::isCompleted; + using BaseClass::isGpgpuSubmissionForBcsRequired; using BaseClass::latestSentEnqueueType; using BaseClass::minimalSizeForBcsSplit; using BaseClass::obtainCommandStream; @@ -440,11 +441,11 @@ class MockCommandQueueHw : public CommandQueueHw { } return BaseClass::isQueueBlocked(); } - bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const override { + bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const override { if (forceGpgpuSubmissionForBcsRequired != -1) { return forceGpgpuSubmissionForBcsRequired; } - return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies); + return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies, containsCrossEngineDependency); } bool waitForTimestamps(Range copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override { diff --git a/shared/source/command_stream/csr_deps.h b/shared/source/command_stream/csr_deps.h index 
4288685b2c..fa028b59f4 100644 --- a/shared/source/command_stream/csr_deps.h +++ b/shared/source/command_stream/csr_deps.h @@ -31,5 +31,6 @@ class CsrDependencies { void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer); std::set csrWithMultiEngineDependencies; + bool containsCrossEngineDependency = false; }; } // namespace NEO