[perf] group once-per-context calls under a single condition
Plenty of calls require HW command programming only once per context, so there is no need to visit each of these methods on every execute call. Set the global init flag if any of the per-context conditions is true, and only then visit all of them. For regular command list execution this saves time, since only a single global check is performed.

Related-To: NEO-7828
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
commit 09b58f4a22 (parent 9ce5351d3f)
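To illustrate the idea from the commit message, here is a minimal standalone sketch (hypothetical, simplified names such as ExecutionContext and foldGlobalInit; not the actual CommandQueueHw code): every once-per-context condition is OR-ed into one globalInit flag while the execution context is being set up, so the submission path performs a single branch instead of re-checking each condition.

    // Minimal illustration of the approach (hypothetical names, not NEO code):
    // each once-per-context condition is folded into a single flag when the
    // context is built, and the execute path checks only that one flag.
    #include <cstdio>

    struct ExecutionContext {
        bool partitionConfigRequired = false;
        bool preemptionModeInitial = false;
        bool stateSipRequired = false;
        bool debugEnabled = false;
        bool globalInit = false; // aggregated once per context

        void foldGlobalInit() {
            globalInit |= partitionConfigRequired || preemptionModeInitial ||
                          stateSipRequired || debugEnabled;
        }
    };

    void executeCommandLists(ExecutionContext &ctx) {
        if (ctx.globalInit) {
            // once-per-context hardware programming happens only when some
            // condition was set while building the context
            std::puts("programming once-per-context state");
        }
        std::puts("submitting command lists");
    }

    int main() {
        ExecutionContext ctx;        // in the driver this is rebuilt per submission
        ctx.stateSipRequired = true; // one of several once-per-context conditions
        ctx.foldGlobalInit();
        executeCommandLists(ctx);    // hot path performs a single globalInit check
    }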
@@ -96,6 +96,7 @@ struct CommandQueueHw : public CommandQueueImp {
         bool isDispatchTaskCountPostSyncRequired{};
         bool hasIndirectAccess{};
         bool rtDispatchRequired = false;
+        bool globalInit = false;
     };
 
     ze_result_t validateCommandListsParams(CommandListExecutionContext &ctx,
@@ -118,7 +119,7 @@ struct CommandQueueHw : public CommandQueueImp {
                                            uint32_t numCommandLists,
                                            ze_fence_handle_t hFence);
     MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const;
-    inline size_t estimateLinearStreamSizeInitial(const CommandListExecutionContext &ctx,
+    inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx,
                                                   ze_command_list_handle_t *phCommandLists,
                                                   uint32_t numCommandLists);
     inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
@@ -140,11 +140,19 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
     this->getGlobalFenceAndMakeItResident();
     this->getWorkPartitionAndMakeItResident();
     this->getGlobalStatelessHeapAndMakeItResident();
-    this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child);
-    this->csr->programHardwareContext(child);
+    this->makePreemptionAllocationResidentForModeMidThread(ctx.isDevicePreemptionModeMidThread);
+    this->makeSipIsaResidentIfSipKernelUsed(ctx);
+    this->makeDebugSurfaceResidentIfNEODebuggerActive(ctx.isNEODebuggerActive(this->device));
+    this->makeRayTracingBufferResident(neoDevice->getRTMemoryBackedBuffer());
+    this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled);
+    this->makeCsrTagAllocationResident();
+    this->encodeKernelArgsBufferAndMakeItResident();
 
     auto &csrStateProperties = csr->getStreamProperties();
+    if (ctx.globalInit) {
+        this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child);
+        this->csr->programHardwareContext(child);
 
     if (!this->pipelineSelectStateTracking) {
         this->programPipelineSelectIfGpgpuDisabled(child);
     } else {
@@ -162,14 +170,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
     }
     this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child);
     this->programStateSip(ctx.stateSipRequired, child);
-    this->makePreemptionAllocationResidentForModeMidThread(ctx.isDevicePreemptionModeMidThread);
-    this->makeSipIsaResidentIfSipKernelUsed(ctx);
-    this->makeDebugSurfaceResidentIfNEODebuggerActive(ctx.isNEODebuggerActive(this->device));
-    this->makeRayTracingBufferResident(neoDevice->getRTMemoryBackedBuffer());
-
     this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child);
-    this->encodeKernelArgsBufferAndMakeItResident();
-
     bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty;
     this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties);
 
@@ -177,6 +178,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
             auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
             csrHw->dispatchRayTracingStateCommand(child, *neoDevice);
         }
+    }
 
     this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child);
 
@@ -209,15 +211,13 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
     this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, phCommandLists[0]);
 
     this->programStateSipEndWA(ctx.stateSipRequired, child);
+    this->assignCsrTaskCountToFenceIfAvailable(hFence);
+    this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
+    auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
+
     this->csr->setPreemptionMode(ctx.statePreemption);
-    this->assignCsrTaskCountToFenceIfAvailable(hFence);
-
-    this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
-
-    this->makeCsrTagAllocationResident();
-    auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
     this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired);
 
     this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false);
 
     auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer();
@@ -490,11 +490,11 @@ CommandQueueHw<gfxCoreFamily>::CommandListExecutionContext::CommandListExecution
             this->cachedMOCSAllowed = false;
         }
 
-        hasIndirectAccess |= commandList->hasIndirectAllocationsAllowed();
+        this->hasIndirectAccess |= commandList->hasIndirectAllocationsAllowed();
         if (commandList->hasIndirectAllocationsAllowed()) {
-            unifiedMemoryControls.indirectDeviceAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed;
-            unifiedMemoryControls.indirectHostAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectHostAllocationsAllowed;
-            unifiedMemoryControls.indirectSharedAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectSharedAllocationsAllowed;
+            this->unifiedMemoryControls.indirectDeviceAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed;
+            this->unifiedMemoryControls.indirectHostAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectHostAllocationsAllowed;
+            this->unifiedMemoryControls.indirectSharedAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectSharedAllocationsAllowed;
         }
     }
     this->isDevicePreemptionModeMidThread = device->getDevicePreemptionMode() == NEO::PreemptionMode::MidThread;
@@ -512,6 +512,8 @@ CommandQueueHw<gfxCoreFamily>::CommandListExecutionContext::CommandListExecution
     if (this->isMigrationRequested && device->getDriverHandle()->getMemoryManager()->getPageFaultManager() == nullptr) {
         this->isMigrationRequested = false;
     }
+
+    this->globalInit |= (this->isProgramActivePartitionConfigRequired || this->isPreemptionModeInitial || this->stateSipRequired || this->isDebugEnabled);
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -606,7 +608,7 @@ void CommandQueueHw<gfxCoreFamily>::setupCmdListsAndContextParams(
 
 template <GFXCORE_FAMILY gfxCoreFamily>
 size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
-    const CommandListExecutionContext &ctx,
+    CommandListExecutionContext &ctx,
     ze_command_list_handle_t *phCommandLists,
     uint32_t numCommandLists) {
 
@@ -620,7 +622,12 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
         linearStreamSizeEstimate += commandList->getCmdContainer().getCmdBufferAllocations().size();
     }
     linearStreamSizeEstimate *= NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
-    linearStreamSizeEstimate += this->csr->getCmdsSizeForHardwareContext();
+
+    auto hwContextSizeEstimate = this->csr->getCmdsSizeForHardwareContext();
+    if (hwContextSizeEstimate > 0) {
+        linearStreamSizeEstimate += hwContextSizeEstimate;
+        ctx.globalInit |= true;
+    }
 
     if (ctx.isDirectSubmissionEnabled) {
         linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
@@ -638,6 +645,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
 
     if (NEO::DebugManager.flags.EnableSWTags.get()) {
         linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags<GfxFamily>();
+        ctx.globalInit |= true;
     }
 
     linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer<GfxFamily>::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(),
@@ -673,6 +681,7 @@ void CommandQueueHw<gfxCoreFamily>::setFrontEndStateProperties(CommandListExecut
         ctx.engineInstanced = isEngineInstanced;
     }
     ctx.frontEndStateDirty |= csr->getMediaVFEStateDirty();
+    ctx.globalInit |= ctx.frontEndStateDirty;
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -684,6 +693,8 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpaceAndUpdateGSBAStateDirtyFla
                                                       ctx.perThreadScratchSpaceSize, ctx.perThreadPrivateScratchSize);
     ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty();
     ctx.scratchGsba = scratchController->calculateNewGSH();
+
+    ctx.globalInit |= ctx.gsbaStateDirty;
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -694,6 +705,8 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
 
     size_t linearStreamSizeEstimate = 0u;
 
+    ctx.globalInit |= !(csr->getPreambleSetFlag());
+
     linearStreamSizeEstimate += estimateFrontEndCmdSize(ctx.frontEndStateDirty);
     linearStreamSizeEstimate += estimatePipelineSelectCmdSize();
 
@@ -724,6 +737,8 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
         ctx.rtDispatchRequired = true;
         auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
         linearStreamSizeEstimate += csrHw->getCmdSizeForPerDssBackedBuffer(this->device->getHwInfo());
+
+        ctx.globalInit |= true;
     }
 
     return linearStreamSizeEstimate;