[perf] group once per context calls under single condition

Plenty of calls require hw command programming only once per context.
There is no need to visit every method of them every execute call.
Set global init flag only if any of them is true and then visit all of them.
But for regular command list execution it can save time when there is single
global check.

Related-To: NEO-7828

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-04-05 18:49:42 +00:00 committed by Compute-Runtime-Automation
parent 9ce5351d3f
commit 09b58f4a22
2 changed files with 58 additions and 42 deletions

View File

@ -96,6 +96,7 @@ struct CommandQueueHw : public CommandQueueImp {
bool isDispatchTaskCountPostSyncRequired{};
bool hasIndirectAccess{};
bool rtDispatchRequired = false;
bool globalInit = false;
};
ze_result_t validateCommandListsParams(CommandListExecutionContext &ctx,
@ -118,7 +119,7 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t numCommandLists,
ze_fence_handle_t hFence);
MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const;
inline size_t estimateLinearStreamSizeInitial(const CommandListExecutionContext &ctx,
inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx,
ze_command_list_handle_t *phCommandLists,
uint32_t numCommandLists);
inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);

View File

@ -140,42 +140,44 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
this->getGlobalFenceAndMakeItResident();
this->getWorkPartitionAndMakeItResident();
this->getGlobalStatelessHeapAndMakeItResident();
this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child);
this->csr->programHardwareContext(child);
this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled);
auto &csrStateProperties = csr->getStreamProperties();
if (!this->pipelineSelectStateTracking) {
this->programPipelineSelectIfGpgpuDisabled(child);
} else {
// Setting systolic/pipeline select here for 1st command list is to preserve dispatch order of hw commands
auto commandList = CommandList::fromHandle(phCommandLists[0]);
auto &requiredStreamState = commandList->getRequiredStreamState();
// Provide cmdlist required state as cmdlist final state, so csr state does not transition to final
// By preserving required state in csr - keeping csr state not dirty - it will not dispatch 1st command list pipeline select/systolic in main loop
// Csr state will transition to final of 1st command list in main loop
this->programOneCmdListPipelineSelect(commandList, child, csrStateProperties, requiredStreamState, requiredStreamState);
}
this->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(ctx.isDebugEnabled, child);
if (!this->stateBaseAddressTracking) {
this->programStateBaseAddressWithGsbaIfDirty(ctx, phCommandLists[0], child);
}
this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child);
this->programStateSip(ctx.stateSipRequired, child);
this->makePreemptionAllocationResidentForModeMidThread(ctx.isDevicePreemptionModeMidThread);
this->makeSipIsaResidentIfSipKernelUsed(ctx);
this->makeDebugSurfaceResidentIfNEODebuggerActive(ctx.isNEODebuggerActive(this->device));
this->makeRayTracingBufferResident(neoDevice->getRTMemoryBackedBuffer());
this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child);
this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled);
this->makeCsrTagAllocationResident();
this->encodeKernelArgsBufferAndMakeItResident();
bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty;
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties);
auto &csrStateProperties = csr->getStreamProperties();
if (ctx.globalInit) {
this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child);
this->csr->programHardwareContext(child);
if (ctx.rtDispatchRequired) {
auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
csrHw->dispatchRayTracingStateCommand(child, *neoDevice);
if (!this->pipelineSelectStateTracking) {
this->programPipelineSelectIfGpgpuDisabled(child);
} else {
// Setting systolic/pipeline select here for 1st command list is to preserve dispatch order of hw commands
auto commandList = CommandList::fromHandle(phCommandLists[0]);
auto &requiredStreamState = commandList->getRequiredStreamState();
// Provide cmdlist required state as cmdlist final state, so csr state does not transition to final
// By preserving required state in csr - keeping csr state not dirty - it will not dispatch 1st command list pipeline select/systolic in main loop
// Csr state will transition to final of 1st command list in main loop
this->programOneCmdListPipelineSelect(commandList, child, csrStateProperties, requiredStreamState, requiredStreamState);
}
this->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(ctx.isDebugEnabled, child);
if (!this->stateBaseAddressTracking) {
this->programStateBaseAddressWithGsbaIfDirty(ctx, phCommandLists[0], child);
}
this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child);
this->programStateSip(ctx.stateSipRequired, child);
this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child);
bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty;
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties);
if (ctx.rtDispatchRequired) {
auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
csrHw->dispatchRayTracingStateCommand(child, *neoDevice);
}
}
this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child);
@ -209,15 +211,13 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, phCommandLists[0]);
this->programStateSipEndWA(ctx.stateSipRequired, child);
this->assignCsrTaskCountToFenceIfAvailable(hFence);
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
this->csr->setPreemptionMode(ctx.statePreemption);
this->assignCsrTaskCountToFenceIfAvailable(hFence);
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
this->makeCsrTagAllocationResident();
auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired);
this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false);
auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer();
@ -490,11 +490,11 @@ CommandQueueHw<gfxCoreFamily>::CommandListExecutionContext::CommandListExecution
this->cachedMOCSAllowed = false;
}
hasIndirectAccess |= commandList->hasIndirectAllocationsAllowed();
this->hasIndirectAccess |= commandList->hasIndirectAllocationsAllowed();
if (commandList->hasIndirectAllocationsAllowed()) {
unifiedMemoryControls.indirectDeviceAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed;
unifiedMemoryControls.indirectHostAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectHostAllocationsAllowed;
unifiedMemoryControls.indirectSharedAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectSharedAllocationsAllowed;
this->unifiedMemoryControls.indirectDeviceAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed;
this->unifiedMemoryControls.indirectHostAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectHostAllocationsAllowed;
this->unifiedMemoryControls.indirectSharedAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectSharedAllocationsAllowed;
}
}
this->isDevicePreemptionModeMidThread = device->getDevicePreemptionMode() == NEO::PreemptionMode::MidThread;
@ -512,6 +512,8 @@ CommandQueueHw<gfxCoreFamily>::CommandListExecutionContext::CommandListExecution
if (this->isMigrationRequested && device->getDriverHandle()->getMemoryManager()->getPageFaultManager() == nullptr) {
this->isMigrationRequested = false;
}
this->globalInit |= (this->isProgramActivePartitionConfigRequired || this->isPreemptionModeInitial || this->stateSipRequired || this->isDebugEnabled);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -606,7 +608,7 @@ void CommandQueueHw<gfxCoreFamily>::setupCmdListsAndContextParams(
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
const CommandListExecutionContext &ctx,
CommandListExecutionContext &ctx,
ze_command_list_handle_t *phCommandLists,
uint32_t numCommandLists) {
@ -620,7 +622,12 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
linearStreamSizeEstimate += commandList->getCmdContainer().getCmdBufferAllocations().size();
}
linearStreamSizeEstimate *= NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
linearStreamSizeEstimate += this->csr->getCmdsSizeForHardwareContext();
auto hwContextSizeEstimate = this->csr->getCmdsSizeForHardwareContext();
if (hwContextSizeEstimate > 0) {
linearStreamSizeEstimate += hwContextSizeEstimate;
ctx.globalInit |= true;
}
if (ctx.isDirectSubmissionEnabled) {
linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
@ -638,6 +645,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
if (NEO::DebugManager.flags.EnableSWTags.get()) {
linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags<GfxFamily>();
ctx.globalInit |= true;
}
linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer<GfxFamily>::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(),
@ -673,6 +681,7 @@ void CommandQueueHw<gfxCoreFamily>::setFrontEndStateProperties(CommandListExecut
ctx.engineInstanced = isEngineInstanced;
}
ctx.frontEndStateDirty |= csr->getMediaVFEStateDirty();
ctx.globalInit |= ctx.frontEndStateDirty;
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -684,6 +693,8 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpaceAndUpdateGSBAStateDirtyFla
ctx.perThreadScratchSpaceSize, ctx.perThreadPrivateScratchSize);
ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty();
ctx.scratchGsba = scratchController->calculateNewGSH();
ctx.globalInit |= ctx.gsbaStateDirty;
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -694,6 +705,8 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
size_t linearStreamSizeEstimate = 0u;
ctx.globalInit |= !(csr->getPreambleSetFlag());
linearStreamSizeEstimate += estimateFrontEndCmdSize(ctx.frontEndStateDirty);
linearStreamSizeEstimate += estimatePipelineSelectCmdSize();
@ -724,6 +737,8 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
ctx.rtDispatchRequired = true;
auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
linearStreamSizeEstimate += csrHw->getCmdSizeForPerDssBackedBuffer(this->device->getHwInfo());
ctx.globalInit |= true;
}
return linearStreamSizeEstimate;