mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-21 09:14:47 +08:00
performance: don't flush gpgpu if not required
Related-To: NEO-12124. If the queue is out-of-order (OOQ) and there are no cross-engine dependencies, don't flush CCS before submitting a copy on BCS. Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
05b8c2ed97
commit
ace883ca55
@@ -521,7 +521,7 @@ class CommandQueueHw : public CommandQueue {
|
||||
TimestampPacketDependencies &timestampPacketDependencies,
|
||||
bool relaxedOrderingEnabled);
|
||||
|
||||
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const;
|
||||
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const;
|
||||
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
|
||||
|
||||
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);
|
||||
|
||||
@@ -205,10 +205,14 @@ void CommandQueueHw<Family>::setupBlitAuxTranslation(MultiDispatchInfo &multiDis
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const {
|
||||
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const {
|
||||
if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) {
|
||||
return true;
|
||||
}
|
||||
if (isOOQEnabled()) {
|
||||
return containsCrossEngineDependency;
|
||||
}
|
||||
|
||||
bool required = false;
|
||||
switch (latestSentEnqueueType) {
|
||||
case NEO::EnqueueProperties::Operation::explicitCacheFlush:
|
||||
|
||||
@@ -1460,7 +1460,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
||||
migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr);
|
||||
}
|
||||
|
||||
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies);
|
||||
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies, csrDeps.containsCrossEngineDependency);
|
||||
if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) {
|
||||
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
||||
}
|
||||
|
||||
@@ -67,6 +67,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
|
||||
}
|
||||
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
|
||||
}
|
||||
csrDeps.containsCrossEngineDependency = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3322,3 +3322,21 @@ HWTEST_F(CsrSelectionCommandQueueWithBlitterTests, givenImageFromBufferThenBcsAl
|
||||
EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args));
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(CommandQueueTests, GivenOOQCommandQueueWhenIsGpgpuSubmissionForBcsRequiredCalledThenReturnCorrectValue) {
|
||||
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
|
||||
MockContext context(device.get());
|
||||
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
|
||||
mockCmdQ->latestSentEnqueueType = EnqueueProperties::Operation::gpuKernel;
|
||||
mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true;
|
||||
mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true;
|
||||
TimestampPacketDependencies dependencies{};
|
||||
auto containsCrossEngineDependency = false;
|
||||
EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
|
||||
|
||||
mockCmdQ->setOoqEnabled();
|
||||
EXPECT_FALSE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
|
||||
|
||||
containsCrossEngineDependency = true;
|
||||
EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
|
||||
}
|
||||
@@ -170,7 +170,7 @@ HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThendepen
|
||||
} else {
|
||||
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
|
||||
}
|
||||
|
||||
EXPECT_TRUE(csrDeps.containsCrossEngineDependency);
|
||||
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
|
||||
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||
|
||||
@@ -282,6 +282,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
||||
using BaseClass::heaplessStateInitEnabled;
|
||||
using BaseClass::isBlitAuxTranslationRequired;
|
||||
using BaseClass::isCompleted;
|
||||
using BaseClass::isGpgpuSubmissionForBcsRequired;
|
||||
using BaseClass::latestSentEnqueueType;
|
||||
using BaseClass::minimalSizeForBcsSplit;
|
||||
using BaseClass::obtainCommandStream;
|
||||
@@ -440,11 +441,11 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
||||
}
|
||||
return BaseClass::isQueueBlocked();
|
||||
}
|
||||
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const override {
|
||||
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const override {
|
||||
if (forceGpgpuSubmissionForBcsRequired != -1) {
|
||||
return forceGpgpuSubmissionForBcsRequired;
|
||||
}
|
||||
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies);
|
||||
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies, containsCrossEngineDependency);
|
||||
}
|
||||
|
||||
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override {
|
||||
|
||||
@@ -31,5 +31,6 @@ class CsrDependencies {
|
||||
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
|
||||
|
||||
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
|
||||
bool containsCrossEngineDependency = false;
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
Reference in New Issue
Block a user