diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index a33cfb9302..993d57c067 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -96,6 +96,7 @@ struct CommandQueueHw : public CommandQueueImp { bool isDispatchTaskCountPostSyncRequired{}; bool hasIndirectAccess{}; bool rtDispatchRequired = false; + bool globalInit = false; }; ze_result_t validateCommandListsParams(CommandListExecutionContext &ctx, @@ -118,7 +119,7 @@ struct CommandQueueHw : public CommandQueueImp { uint32_t numCommandLists, ze_fence_handle_t hFence); MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const; - inline size_t estimateLinearStreamSizeInitial(const CommandListExecutionContext &ctx, + inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx, ze_command_list_handle_t *phCommandLists, uint32_t numCommandLists); inline void setFrontEndStateProperties(CommandListExecutionContext &ctx); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 64a3579355..e87e243cc4 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -140,42 +140,44 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->getGlobalFenceAndMakeItResident(); this->getWorkPartitionAndMakeItResident(); this->getGlobalStatelessHeapAndMakeItResident(); - this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child); - this->csr->programHardwareContext(child); - this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled); - - auto &csrStateProperties = csr->getStreamProperties(); - if (!this->pipelineSelectStateTracking) { - this->programPipelineSelectIfGpgpuDisabled(child); - } else { - // Setting systolic/pipeline select here for 1st command list is to preserve dispatch order of hw commands - auto commandList = CommandList::fromHandle(phCommandLists[0]); - auto &requiredStreamState = commandList->getRequiredStreamState(); - // Provide cmdlist required state as cmdlist final state, so csr state does not transition to final - // By preserving required state in csr - keeping csr state not dirty - it will not dispatch 1st command list pipeline select/systolic in main loop - // Csr state will transition to final of 1st command list in main loop - this->programOneCmdListPipelineSelect(commandList, child, csrStateProperties, requiredStreamState, requiredStreamState); - } - this->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(ctx.isDebugEnabled, child); - if (!this->stateBaseAddressTracking) { - this->programStateBaseAddressWithGsbaIfDirty(ctx, phCommandLists[0], child); - } - this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child); - this->programStateSip(ctx.stateSipRequired, child); this->makePreemptionAllocationResidentForModeMidThread(ctx.isDevicePreemptionModeMidThread); this->makeSipIsaResidentIfSipKernelUsed(ctx); this->makeDebugSurfaceResidentIfNEODebuggerActive(ctx.isNEODebuggerActive(this->device)); this->makeRayTracingBufferResident(neoDevice->getRTMemoryBackedBuffer()); - - this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child); + this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled); + this->makeCsrTagAllocationResident(); this->encodeKernelArgsBufferAndMakeItResident(); - bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty; - this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties); + auto &csrStateProperties = csr->getStreamProperties(); + if (ctx.globalInit) { + this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child); + this->csr->programHardwareContext(child); - if (ctx.rtDispatchRequired) { - auto csrHw = static_cast *>(this->csr); - csrHw->dispatchRayTracingStateCommand(child, *neoDevice); + if (!this->pipelineSelectStateTracking) { + this->programPipelineSelectIfGpgpuDisabled(child); + } else { + // Setting systolic/pipeline select here for 1st command list is to preserve dispatch order of hw commands + auto commandList = CommandList::fromHandle(phCommandLists[0]); + auto &requiredStreamState = commandList->getRequiredStreamState(); + // Provide cmdlist required state as cmdlist final state, so csr state does not transition to final + // By preserving required state in csr - keeping csr state not dirty - it will not dispatch 1st command list pipeline select/systolic in main loop + // Csr state will transition to final of 1st command list in main loop + this->programOneCmdListPipelineSelect(commandList, child, csrStateProperties, requiredStreamState, requiredStreamState); + } + this->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(ctx.isDebugEnabled, child); + if (!this->stateBaseAddressTracking) { + this->programStateBaseAddressWithGsbaIfDirty(ctx, phCommandLists[0], child); + } + this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child); + this->programStateSip(ctx.stateSipRequired, child); + this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child); + bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty; + this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties); + + if (ctx.rtDispatchRequired) { + auto csrHw = static_cast *>(this->csr); + csrHw->dispatchRayTracingStateCommand(child, *neoDevice); + } } this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child); @@ -209,15 +211,13 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, phCommandLists[0]); this->programStateSipEndWA(ctx.stateSipRequired, child); + this->assignCsrTaskCountToFenceIfAvailable(hFence); + this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); + auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child); this->csr->setPreemptionMode(ctx.statePreemption); - this->assignCsrTaskCountToFenceIfAvailable(hFence); - - this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); - - this->makeCsrTagAllocationResident(); - auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child); this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired); + this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false); auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer(); @@ -490,11 +490,11 @@ CommandQueueHw::CommandListExecutionContext::CommandListExecution this->cachedMOCSAllowed = false; } - hasIndirectAccess |= commandList->hasIndirectAllocationsAllowed(); + this->hasIndirectAccess |= commandList->hasIndirectAllocationsAllowed(); if (commandList->hasIndirectAllocationsAllowed()) { - unifiedMemoryControls.indirectDeviceAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed; - unifiedMemoryControls.indirectHostAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectHostAllocationsAllowed; - unifiedMemoryControls.indirectSharedAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectSharedAllocationsAllowed; + this->unifiedMemoryControls.indirectDeviceAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed; + this->unifiedMemoryControls.indirectHostAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectHostAllocationsAllowed; + this->unifiedMemoryControls.indirectSharedAllocationsAllowed |= commandList->getUnifiedMemoryControls().indirectSharedAllocationsAllowed; } } this->isDevicePreemptionModeMidThread = device->getDevicePreemptionMode() == NEO::PreemptionMode::MidThread; @@ -512,6 +512,8 @@ CommandQueueHw::CommandListExecutionContext::CommandListExecution if (this->isMigrationRequested && device->getDriverHandle()->getMemoryManager()->getPageFaultManager() == nullptr) { this->isMigrationRequested = false; } + + this->globalInit |= (this->isProgramActivePartitionConfigRequired || this->isPreemptionModeInitial || this->stateSipRequired || this->isDebugEnabled); } template @@ -606,7 +608,7 @@ void CommandQueueHw::setupCmdListsAndContextParams( template size_t CommandQueueHw::estimateLinearStreamSizeInitial( - const CommandListExecutionContext &ctx, + CommandListExecutionContext &ctx, ze_command_list_handle_t *phCommandLists, uint32_t numCommandLists) { @@ -620,7 +622,12 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( linearStreamSizeEstimate += commandList->getCmdContainer().getCmdBufferAllocations().size(); } linearStreamSizeEstimate *= NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize(); - linearStreamSizeEstimate += this->csr->getCmdsSizeForHardwareContext(); + + auto hwContextSizeEstimate = this->csr->getCmdsSizeForHardwareContext(); + if (hwContextSizeEstimate > 0) { + linearStreamSizeEstimate += hwContextSizeEstimate; + ctx.globalInit |= true; + } if (ctx.isDirectSubmissionEnabled) { linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize(); @@ -638,6 +645,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( if (NEO::DebugManager.flags.EnableSWTags.get()) { linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags(); + ctx.globalInit |= true; } linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(), @@ -673,6 +681,7 @@ void CommandQueueHw::setFrontEndStateProperties(CommandListExecut ctx.engineInstanced = isEngineInstanced; } ctx.frontEndStateDirty |= csr->getMediaVFEStateDirty(); + ctx.globalInit |= ctx.frontEndStateDirty; } template @@ -684,6 +693,8 @@ void CommandQueueHw::handleScratchSpaceAndUpdateGSBAStateDirtyFla ctx.perThreadScratchSpaceSize, ctx.perThreadPrivateScratchSize); ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty(); ctx.scratchGsba = scratchController->calculateNewGSH(); + + ctx.globalInit |= ctx.gsbaStateDirty; } template @@ -694,6 +705,8 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( size_t linearStreamSizeEstimate = 0u; + ctx.globalInit |= !(csr->getPreambleSetFlag()); + linearStreamSizeEstimate += estimateFrontEndCmdSize(ctx.frontEndStateDirty); linearStreamSizeEstimate += estimatePipelineSelectCmdSize(); @@ -724,6 +737,8 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( ctx.rtDispatchRequired = true; auto csrHw = static_cast *>(this->csr); linearStreamSizeEstimate += csrHw->getCmdSizeForPerDssBackedBuffer(this->device->getHwInfo()); + + ctx.globalInit |= true; } return linearStreamSizeEstimate;