performance: don't flush gpgpu if not required

Related-To: NEO-12124

If the queue is out-of-order (OOQ) and there are no cross-engine dependencies,
don't flush the CCS before submitting a copy on the BCS.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-07-25 15:45:32 +00:00
committed by Compute-Runtime-Automation
parent 05b8c2ed97
commit ace883ca55
8 changed files with 31 additions and 6 deletions

View File

@@ -521,7 +521,7 @@ class CommandQueueHw : public CommandQueue {
TimestampPacketDependencies &timestampPacketDependencies, TimestampPacketDependencies &timestampPacketDependencies,
bool relaxedOrderingEnabled); bool relaxedOrderingEnabled);
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const; MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const;
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType); void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo); bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);

View File

@@ -205,10 +205,14 @@ void CommandQueueHw<Family>::setupBlitAuxTranslation(MultiDispatchInfo &multiDis
} }
template <typename Family> template <typename Family>
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const { bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const {
if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) { if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) {
return true; return true;
} }
if (isOOQEnabled()) {
return containsCrossEngineDependency;
}
bool required = false; bool required = false;
switch (latestSentEnqueueType) { switch (latestSentEnqueueType) {
case NEO::EnqueueProperties::Operation::explicitCacheFlush: case NEO::EnqueueProperties::Operation::explicitCacheFlush:

View File

@@ -1460,7 +1460,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr); migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr);
} }
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies); auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies, csrDeps.containsCrossEngineDependency);
if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) { if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
} }

View File

@@ -67,6 +67,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
} }
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr); csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
} }
csrDeps.containsCrossEngineDependency = true;
} }
} }
} }

View File

@@ -3321,4 +3321,22 @@ HWTEST_F(CsrSelectionCommandQueueWithBlitterTests, givenImageFromBufferThenBcsAl
} else { } else {
EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args)); EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args));
} }
}
// Checks isGpgpuSubmissionForBcsRequired() with queueBlocked == false and value-initialized
// dependencies: before setOoqEnabled() the call is expected to return true; after it, the
// result is expected to match the containsCrossEngineDependency argument.
HWTEST_F(CommandQueueTests, GivenOOQCommandQueueWhenIsGpgpuSubmissionForBcsRequiredCalledThenReturnCorrectValue) {
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
MockContext context(device.get());
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
// Queue state set up before the first call.
// NOTE(review): presumably this is what makes the pre-OOQ call report true; the code that
// consumes these values is not fully visible here - confirm.
mockCmdQ->latestSentEnqueueType = EnqueueProperties::Operation::gpuKernel;
mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true;
mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true;
TimestampPacketDependencies dependencies{};
auto containsCrossEngineDependency = false;
// Before OOQ is enabled: submission is expected even without a cross-engine dependency.
EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
mockCmdQ->setOoqEnabled();
// OOQ enabled, no cross-engine dependency: no gpgpu submission is expected.
EXPECT_FALSE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
// OOQ enabled, cross-engine dependency flagged: gpgpu submission is expected again.
containsCrossEngineDependency = true;
EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
} }

View File

@@ -170,7 +170,7 @@ HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThendepen
} else { } else {
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u); EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
} }
EXPECT_TRUE(csrDeps.containsCrossEngineDependency);
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1; mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1; *mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1; mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;

View File

@@ -282,6 +282,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
using BaseClass::heaplessStateInitEnabled; using BaseClass::heaplessStateInitEnabled;
using BaseClass::isBlitAuxTranslationRequired; using BaseClass::isBlitAuxTranslationRequired;
using BaseClass::isCompleted; using BaseClass::isCompleted;
using BaseClass::isGpgpuSubmissionForBcsRequired;
using BaseClass::latestSentEnqueueType; using BaseClass::latestSentEnqueueType;
using BaseClass::minimalSizeForBcsSplit; using BaseClass::minimalSizeForBcsSplit;
using BaseClass::obtainCommandStream; using BaseClass::obtainCommandStream;
@@ -440,11 +441,11 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
} }
return BaseClass::isQueueBlocked(); return BaseClass::isQueueBlocked();
} }
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const override { bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const override {
if (forceGpgpuSubmissionForBcsRequired != -1) { if (forceGpgpuSubmissionForBcsRequired != -1) {
return forceGpgpuSubmissionForBcsRequired; return forceGpgpuSubmissionForBcsRequired;
} }
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies); return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies, containsCrossEngineDependency);
} }
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override { bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override {

View File

@@ -31,5 +31,6 @@ class CsrDependencies {
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer); void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies; std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
bool containsCrossEngineDependency = false;
}; };
} // namespace NEO } // namespace NEO