mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-22 10:17:01 +08:00
performance: don't flush gpgpu if not required
Related-To: NEO-12124 If queue is OOQ and there are no cross-engine dependencies, don't flush CCS before submitting copy on BCS. Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
05b8c2ed97
commit
ace883ca55
@@ -521,7 +521,7 @@ class CommandQueueHw : public CommandQueue {
|
|||||||
TimestampPacketDependencies &timestampPacketDependencies,
|
TimestampPacketDependencies &timestampPacketDependencies,
|
||||||
bool relaxedOrderingEnabled);
|
bool relaxedOrderingEnabled);
|
||||||
|
|
||||||
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const;
|
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const;
|
||||||
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
|
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
|
||||||
|
|
||||||
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);
|
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);
|
||||||
|
|||||||
@@ -205,10 +205,14 @@ void CommandQueueHw<Family>::setupBlitAuxTranslation(MultiDispatchInfo &multiDis
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename Family>
|
template <typename Family>
|
||||||
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const {
|
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const {
|
||||||
if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) {
|
if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (isOOQEnabled()) {
|
||||||
|
return containsCrossEngineDependency;
|
||||||
|
}
|
||||||
|
|
||||||
bool required = false;
|
bool required = false;
|
||||||
switch (latestSentEnqueueType) {
|
switch (latestSentEnqueueType) {
|
||||||
case NEO::EnqueueProperties::Operation::explicitCacheFlush:
|
case NEO::EnqueueProperties::Operation::explicitCacheFlush:
|
||||||
|
|||||||
@@ -1460,7 +1460,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
|||||||
migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr);
|
migratedMemory = migrateMultiGraphicsAllocationsIfRequired(multiDispatchInfo.peekBuiltinOpParams(), bcsCsr);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies);
|
auto gpgpuSubmission = isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies, csrDeps.containsCrossEngineDependency);
|
||||||
if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) {
|
if ((isCacheFlushForBcsRequired() || NEO::EnqueueProperties::Operation::dependencyResolveOnGpu == latestSentEnqueueType) && gpgpuSubmission) {
|
||||||
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
|
|||||||
}
|
}
|
||||||
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
|
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
|
||||||
}
|
}
|
||||||
|
csrDeps.containsCrossEngineDependency = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3321,4 +3321,22 @@ HWTEST_F(CsrSelectionCommandQueueWithBlitterTests, givenImageFromBufferThenBcsAl
|
|||||||
} else {
|
} else {
|
||||||
EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args));
|
EXPECT_EQ(ccsCsr, &queue->selectCsrForBuiltinOperation(args));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
HWTEST_F(CommandQueueTests, GivenOOQCommandQueueWhenIsGpgpuSubmissionForBcsRequiredCalledThenReturnCorrectValue) {
|
||||||
|
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
|
||||||
|
MockContext context(device.get());
|
||||||
|
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, context.getDevice(0), nullptr);
|
||||||
|
mockCmdQ->latestSentEnqueueType = EnqueueProperties::Operation::gpuKernel;
|
||||||
|
mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true;
|
||||||
|
mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true;
|
||||||
|
TimestampPacketDependencies dependencies{};
|
||||||
|
auto containsCrossEngineDependency = false;
|
||||||
|
EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
|
||||||
|
|
||||||
|
mockCmdQ->setOoqEnabled();
|
||||||
|
EXPECT_FALSE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
|
||||||
|
|
||||||
|
containsCrossEngineDependency = true;
|
||||||
|
EXPECT_TRUE(mockCmdQ->isGpgpuSubmissionForBcsRequired(false, dependencies, containsCrossEngineDependency));
|
||||||
}
|
}
|
||||||
@@ -170,7 +170,7 @@ HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThendepen
|
|||||||
} else {
|
} else {
|
||||||
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
|
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
|
||||||
}
|
}
|
||||||
|
EXPECT_TRUE(csrDeps.containsCrossEngineDependency);
|
||||||
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||||
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
|
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
|
||||||
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||||
|
|||||||
@@ -282,6 +282,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
|||||||
using BaseClass::heaplessStateInitEnabled;
|
using BaseClass::heaplessStateInitEnabled;
|
||||||
using BaseClass::isBlitAuxTranslationRequired;
|
using BaseClass::isBlitAuxTranslationRequired;
|
||||||
using BaseClass::isCompleted;
|
using BaseClass::isCompleted;
|
||||||
|
using BaseClass::isGpgpuSubmissionForBcsRequired;
|
||||||
using BaseClass::latestSentEnqueueType;
|
using BaseClass::latestSentEnqueueType;
|
||||||
using BaseClass::minimalSizeForBcsSplit;
|
using BaseClass::minimalSizeForBcsSplit;
|
||||||
using BaseClass::obtainCommandStream;
|
using BaseClass::obtainCommandStream;
|
||||||
@@ -440,11 +441,11 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
|||||||
}
|
}
|
||||||
return BaseClass::isQueueBlocked();
|
return BaseClass::isQueueBlocked();
|
||||||
}
|
}
|
||||||
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const override {
|
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency) const override {
|
||||||
if (forceGpgpuSubmissionForBcsRequired != -1) {
|
if (forceGpgpuSubmissionForBcsRequired != -1) {
|
||||||
return forceGpgpuSubmissionForBcsRequired;
|
return forceGpgpuSubmissionForBcsRequired;
|
||||||
}
|
}
|
||||||
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies);
|
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies, containsCrossEngineDependency);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override {
|
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) override {
|
||||||
|
|||||||
@@ -31,5 +31,6 @@ class CsrDependencies {
|
|||||||
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
|
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
|
||||||
|
|
||||||
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
|
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
|
||||||
|
bool containsCrossEngineDependency = false;
|
||||||
};
|
};
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|||||||
Reference in New Issue
Block a user