performance: don't flush gpgpu if not required

Related-To: NEO-12124

If the queue is an out-of-order queue (OOQ) and there are no cross-engine
dependencies, don't flush the CCS before submitting a copy on the BCS.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-07-25 15:45:32 +00:00
committed by Compute-Runtime-Automation
parent 05b8c2ed97
commit ace883ca55
8 changed files with 31 additions and 6 deletions

View File

@@ -521,7 +521,7 @@ class CommandQueueHw : public CommandQueue {
TimestampPacketDependencies &timestampPacketDependencies,
bool relaxedOrderingEnabled);
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const;
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const;
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);

View File

@@ -205,10 +205,14 @@ void CommandQueueHw<Family>::setupBlitAuxTranslation(MultiDispatchInfo &multiDis
}
template <typename Family>
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const {
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const {
if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) {
return true;
}
if (isOOQEnabled()) {
return containsCrossEngineDependency;
}
bool required = false;
switch (latestSentEnqueueType) {
case NEO::EnqueueProperties::Operation::explicitCacheFlush:

View File

@@ -1460,7 +1460,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr);
}
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies);
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies, csrDeps.containsCrossEngineDependency);
if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
}

View File

@@ -67,6 +67,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
}
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
}
csrDeps.containsCrossEngineDependency = true;
}
}
}

View File

@@ -3321,4 +3321,22 @@ HWTEST_F(CsrSelectionCommandQueueWithBlitterTests, givenImageFromBufferThenBcsAl
} else {
EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args));
}
}
HWTEST_F(CommandQueueTests, GivenOOQCommandQueueWhenIsGpgpuSubmissionForBcsRequiredCalledThenReturnCorrectValue) {
    // Build a stand-alone device, context and mock queue for this test.
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
    MockContext mockContext(clDevice.get());
    auto queue = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, mockContext.getDevice(0), nullptr);

    // Force the cache-flush override on and record a GPU kernel as the latest enqueue type.
    queue->overrideIsCacheFlushForBcsRequired.enabled = true;
    queue->overrideIsCacheFlushForBcsRequired.returnValue = true;
    queue->latestSentEnqueueType = EnqueueProperties::Operation::gpuKernel;

    TimestampPacketDependencies timestampDeps{};
    const bool queueBlocked = false;

    // Before setOoqEnabled() is called, a gpgpu submission is expected even though
    // no cross-engine dependency is reported.
    EXPECT_TRUE(queue->isGpgpuSubmissionForBcsRequired(queueBlocked, timestampDeps, false));

    // Once OOQ is enabled, the expected result follows the cross-engine dependency flag.
    queue->setOoqEnabled();
    EXPECT_FALSE(queue->isGpgpuSubmissionForBcsRequired(queueBlocked, timestampDeps, false));
    EXPECT_TRUE(queue->isGpgpuSubmissionForBcsRequired(queueBlocked, timestampDeps, true));
}

View File

@@ -170,7 +170,7 @@ HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThendepen
} else {
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
}
EXPECT_TRUE(csrDeps.containsCrossEngineDependency);
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;

View File

@@ -282,6 +282,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
using BaseClass::heaplessStateInitEnabled;
using BaseClass::isBlitAuxTranslationRequired;
using BaseClass::isCompleted;
using BaseClass::isGpgpuSubmissionForBcsRequired;
using BaseClass::latestSentEnqueueType;
using BaseClass::minimalSizeForBcsSplit;
using BaseClass::obtainCommandStream;
@@ -440,11 +441,11 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
}
return BaseClass::isQueueBlocked();
}
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const override {
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const override {
if (forceGpgpuSubmissionForBcsRequired != -1) {
return forceGpgpuSubmissionForBcsRequired;
}
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies);
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies, containsCrossEngineDependency);
}
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override {

View File

@@ -31,5 +31,6 @@ class CsrDependencies {
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
bool containsCrossEngineDependency = false;
};
} // namespace NEO