From f82b2e29845fc6698e74c6054bc70ed3ee15c3bb Mon Sep 17 00:00:00 2001 From: Maciej Bielski Date: Fri, 24 Jun 2022 12:58:24 +0000 Subject: [PATCH] executeCommandLists: cleanup and split copy-only vs non-copy-only Split the function into submethods to improve readability, reusability and maintainability (initially it was ~500 lines long!). Also, split the execution into 'copy-only' and 'regular' cases to reduce the number of `if()`s in the code. Resolves: NEO-7118 Signed-off-by: Maciej Bielski --- level_zero/core/source/cmdqueue/cmdqueue_hw.h | 114 +- .../core/source/cmdqueue/cmdqueue_hw.inl | 1342 +++++++++++------ 2 files changed, 971 insertions(+), 485 deletions(-) diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 48b14e46a4..df25053032 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -11,7 +11,7 @@ namespace NEO { class ScratchSpaceController; -} +} // namespace NEO namespace L0 { @@ -29,9 +29,6 @@ struct CommandQueueHw : public CommandQueueImp { void *phCommands, ze_fence_handle_t hFence) override; - void dispatchTaskCountPostSync(NEO::LinearStream &commandStream, const NEO::HardwareInfo &hwInfo); - bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const; - void programStateBaseAddress(uint64_t gsba, bool useLocalMemoryForIndirectHeap, NEO::LinearStream &commandStream, bool cachedMOCSAllowed); size_t estimateStateBaseAddressCmdSize(); MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream); @@ -40,7 +37,7 @@ struct CommandQueueHw : public CommandQueueImp { ze_command_list_handle_t *phCommandLists); size_t estimateFrontEndCmdSize(); size_t estimatePipelineSelect(); - void programPipelineSelect(NEO::LinearStream &commandStream); + void programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &commandStream); 
MOCKABLE_VIRTUAL void handleScratchSpace(NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, @@ -50,6 +47,113 @@ struct CommandQueueHw : public CommandQueueImp { bool getPreemptionCmdProgramming() override; void patchCommands(CommandList &commandList, uint64_t scratchAddress); + + protected: + struct CommandListExecutionContext { + + CommandListExecutionContext(ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists, + NEO::PreemptionMode contextPreemptionMode, + Device *device, + bool debugEnabled, + bool programActivePartitionConfig, + bool performMigration); + + inline bool isNEODebuggerActive(Device *device); + + bool anyCommandListWithCooperativeKernels = false; + bool anyCommandListWithoutCooperativeKernels = false; + bool anyCommandListRequiresDisabledEUFusion = false; + bool cachedMOCSAllowed = true; + bool performMemoryPrefetch = false; + bool containsAnyRegularCmdList = false; + bool gsbaStateDirty = false; + bool frontEndStateDirty = false; + size_t spaceForResidency = 0; + NEO::PreemptionMode preemptionMode{}; + NEO::PreemptionMode statePreemption{}; + uint32_t perThreadScratchSpaceSize = 0; + uint32_t perThreadPrivateScratchSize = 0; + const bool isPreemptionModeInitial{}; + bool isDevicePreemptionModeMidThread{}; + bool isDebugEnabled{}; + bool stateSipRequired{}; + bool isProgramActivePartitionConfigRequired{}; + bool isMigrationRequested{}; + bool isDirectSubmissionEnabled{}; + bool isDispatchTaskCountPostSyncRequired{}; + }; + + ze_result_t validateCommandListsParams(CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists); + inline ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx, + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists, + ze_fence_handle_t hFence); + inline ze_result_t executeCommandListsCopyOnly(CommandListExecutionContext &ctx, + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists, + 
ze_fence_handle_t hFence); + inline size_t computeDebuggerCmdsSize(const CommandListExecutionContext &ctx); + inline size_t computePreemptionSize(CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists); + inline void setupCmdListsAndContextParams(CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists, + ze_fence_handle_t hFence); + inline bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const; + inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists); + inline void setFrontEndStateProperties(CommandListExecutionContext &ctx); + inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx); + inline size_t estimateLinearStreamSizeComplementary(CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists); + inline ze_result_t makeAlignedChildStreamAndSetGpuBase(NEO::LinearStream &child, size_t requiredSize); + inline void allocateGlobalFenceAndMakeItResident(); + inline void allocateWorkPartitionAndMakeItResident(); + inline void allocateTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(NEO::LinearStream &commandStream); + inline void makeSbaTrackingBufferResidentIfL0DebuggerEnabled(bool isDebugEnabled); + inline void programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(bool isDebugEnabled, NEO::LinearStream &commandStream); + inline void programSbaWithUpdatedGsbaIfDirty(CommandListExecutionContext &ctx, + ze_command_list_handle_t hCommandList, + NEO::LinearStream &commandStream); + inline void programCsrBaseAddressIfPreemptionModeInitial(bool isPreemptionModeInitial, NEO::LinearStream &commandStream); + inline void programStateSip(bool isStateSipRequired, NEO::LinearStream &commandStream); + inline void 
updateOneCmdListPreemptionModeAndCtxStatePreemption(CommandListExecutionContext &ctx, + NEO::PreemptionMode commandListPreemption, + NEO::LinearStream &commandStream); + inline void makePreemptionAllocationResidentForModeMidThread(bool isDevicePreemptionModeMidThread); + inline void makeSipIsaResidentIfSipKernelUsed(CommandListExecutionContext &ctx); + inline void makeDebugSurfaceResidentIfNEODebuggerActive(bool isNEODebuggerActive); + inline void makeCsrTagAllocationResident(); + inline void programActivePartitionConfig(bool isProgramActivePartitionConfigRequired, NEO::LinearStream &commandStream); + inline void encodeKernelArgsBufferAndMakeItResident(); + inline void writeCsrStreamInlineIfLogicalStateHelperAvailable(NEO::LinearStream &commandStream); + inline void programOneCmdListFrontEndIfDirty(CommandListExecutionContext &ctx, + CommandList *commandList, + NEO::LinearStream &commandStream); + inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream); + inline void mergeOneCmdListPipelinedState(CommandList *commandList); + inline void programFrontEndAndClearDirtyFlag(bool shouldFrontEndBeProgrammed, + CommandListExecutionContext &ctx, + NEO::LinearStream &commandStream); + inline void collectPrintfContentsFromAllCommandsLists(ze_command_list_handle_t *phCommandLists, uint32_t numCommandLists); + inline void migrateSharedAllocationsIfRequested(bool isMigrationRequested, ze_command_list_handle_t hCommandList); + inline void prefetchMemoryIfRequested(bool &isMemoryPrefetchRequested); + inline void programStateSipEndWA(bool isStateSipRequired, NEO::LinearStream &commandStream); + inline void assignCsrTaskCountToFenceIfAvailable(ze_fence_handle_t hFence); + inline void dispatchTaskCountPostSyncRegular(bool isDispatchTaskCountPostSyncRequired, NEO::LinearStream &commandStream); + inline void dispatchTaskCountPostSyncByMiFlushDw(bool isDispatchTaskCountPostSyncRequired, NEO::LinearStream &commandStream); + inline 
NEO::SubmissionStatus prepareAndSubmitBatchBuffer(CommandListExecutionContext &ctx, NEO::LinearStream &innerCommandStream); + inline void updateTaskCountAndPostSync(bool isDispatchTaskCountPostSyncRequired); + inline ze_result_t waitForCommandQueueCompletionAndCleanHeapContainer(); + inline ze_result_t handleSubmissionAndCompletionResults(NEO::SubmissionStatus submitRet, ze_result_t completionRet); + + size_t alignedChildStreamPadding{}; }; } // namespace L0 diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 667de826b0..0805c1b917 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -75,482 +75,256 @@ ze_result_t CommandQueueHw::executeCommandLists( ze_fence_handle_t hFence, bool performMigration) { - using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; - using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; + auto lockCSR = this->csr->obtainUniqueOwnership(); - auto lockCSR = csr->obtainUniqueOwnership(); + auto ctx = CommandListExecutionContext{phCommandLists, + numCommandLists, + csr->getPreemptionMode(), + device, + NEO::Debugger::isDebugEnabled(internalUsage), + csr->isProgramActivePartitionConfigRequired(), + performMigration}; - auto anyCommandListWithCooperativeKernels = false; - auto anyCommandListWithoutCooperativeKernels = false; - bool anyCommandListRequiresDisabledEUFusion = false; - bool cachedMOCSAllowed = true; - bool performMemoryPrefetch = false; + auto ret = validateCommandListsParams(ctx, phCommandLists, numCommandLists); + if (ret != ZE_RESULT_SUCCESS) { + return ret; + } + + this->device->activateMetricGroups(); + + if (this->isCopyOnlyCommandQueue) { + ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence); + } else { + ret = this->executeCommandListsRegular(ctx, numCommandLists, phCommandLists, hFence); + } + + return ret; +} + +template 
+ze_result_t CommandQueueHw::executeCommandListsRegular( + CommandListExecutionContext &ctx, + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists, + ze_fence_handle_t hFence) { + + this->setupCmdListsAndContextParams(ctx, phCommandLists, numCommandLists, hFence); + ctx.isDirectSubmissionEnabled = this->csr->isDirectSubmissionEnabled(); + + size_t linearStreamSizeEstimate = this->estimateLinearStreamSizeInitial(ctx, phCommandLists, numCommandLists); + + this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency); + + this->handleScratchSpaceAndUpdateGSBAStateDirtyFlag(ctx); + this->setFrontEndStateProperties(ctx); + + linearStreamSizeEstimate += this->estimateLinearStreamSizeComplementary(ctx, phCommandLists, numCommandLists); + linearStreamSizeEstimate += this->computePreemptionSize(ctx, phCommandLists, numCommandLists); + linearStreamSizeEstimate += this->computeDebuggerCmdsSize(ctx); + linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(this->device->getHwInfo()); + + NEO::LinearStream child(nullptr); + if (const auto ret = this->makeAlignedChildStreamAndSetGpuBase(child, linearStreamSizeEstimate); ret != ZE_RESULT_SUCCESS) { + return ret; + } + + this->allocateGlobalFenceAndMakeItResident(); + this->allocateWorkPartitionAndMakeItResident(); + this->allocateTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child); + this->csr->programHardwareContext(child); + this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled); + + this->programPipelineSelectIfGpgpuDisabled(child); + this->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(ctx.isDebugEnabled, child); + this->programSbaWithUpdatedGsbaIfDirty(ctx, phCommandLists[0], child); + this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child); + this->programStateSip(ctx.stateSipRequired, child); + 
this->makePreemptionAllocationResidentForModeMidThread(ctx.isDevicePreemptionModeMidThread); + this->makeSipIsaResidentIfSipKernelUsed(ctx); + this->makeDebugSurfaceResidentIfNEODebuggerActive(ctx.isNEODebuggerActive(this->device)); + + this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child); + this->encodeKernelArgsBufferAndMakeItResident(); + + bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty; + this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child); + + this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child); + + ctx.statePreemption = ctx.preemptionMode; + + for (auto i = 0u; i < numCommandLists; ++i) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + this->updateOneCmdListPreemptionModeAndCtxStatePreemption(ctx, commandList->getCommandListPreemptionMode(), child); + this->programOneCmdListFrontEndIfDirty(ctx, commandList, child); + + this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress()); + this->programOneCmdListBatchBufferStart(commandList, child); + this->mergeOneCmdListPipelinedState(commandList); + } + + this->collectPrintfContentsFromAllCommandsLists(phCommandLists, numCommandLists); + this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, phCommandLists[0]); + this->prefetchMemoryIfRequested(ctx.performMemoryPrefetch); + + this->programStateSipEndWA(ctx.stateSipRequired, child); + + this->csr->setPreemptionMode(ctx.statePreemption); + this->assignCsrTaskCountToFenceIfAvailable(hFence); + + this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); + + this->makeCsrTagAllocationResident(); + auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child); + this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired); + this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false); + + auto completionResult = 
this->waitForCommandQueueCompletionAndCleanHeapContainer(); + ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult); + + this->csr->getResidencyAllocations().clear(); + + return retVal; +} + +template +ze_result_t CommandQueueHw::executeCommandListsCopyOnly( + CommandListExecutionContext &ctx, + uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists, + ze_fence_handle_t hFence) { + + this->setupCmdListsAndContextParams(ctx, phCommandLists, numCommandLists, hFence); + ctx.isDirectSubmissionEnabled = this->csr->isBlitterDirectSubmissionEnabled(); + + size_t linearStreamSizeEstimate = this->estimateLinearStreamSizeInitial(ctx, phCommandLists, numCommandLists); + + this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency); + + this->handleScratchSpaceAndUpdateGSBAStateDirtyFlag(ctx); + this->setFrontEndStateProperties(ctx); + + linearStreamSizeEstimate += NEO::EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); + + NEO::LinearStream child(nullptr); + if (const auto ret = this->makeAlignedChildStreamAndSetGpuBase(child, linearStreamSizeEstimate); ret != ZE_RESULT_SUCCESS) { + return ret; + } + + this->allocateGlobalFenceAndMakeItResident(); + this->allocateWorkPartitionAndMakeItResident(); + this->allocateTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child); + this->csr->programHardwareContext(child); + this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled); + + this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child); + this->encodeKernelArgsBufferAndMakeItResident(); + + this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child); + + for (auto i = 0u; i < numCommandLists; ++i) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + + this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress()); + this->programOneCmdListBatchBufferStart(commandList, child); + 
this->mergeOneCmdListPipelinedState(commandList); + } + this->collectPrintfContentsFromAllCommandsLists(phCommandLists, numCommandLists); + this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, phCommandLists[0]); + this->prefetchMemoryIfRequested(ctx.performMemoryPrefetch); + + this->csr->setPreemptionMode(ctx.statePreemption); + this->assignCsrTaskCountToFenceIfAvailable(hFence); + + this->dispatchTaskCountPostSyncByMiFlushDw(ctx.isDispatchTaskCountPostSyncRequired, child); + + this->makeCsrTagAllocationResident(); + auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child); + this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired); + this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false); + + auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer(); + ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult); + + this->csr->getResidencyAllocations().clear(); + + return retVal; +} + +template +ze_result_t CommandQueueHw::validateCommandListsParams( + CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists) { for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); - if (peekIsCopyOnlyCommandQueue() != commandList->isCopyOnly()) { + if (this->peekIsCopyOnlyCommandQueue() != commandList->isCopyOnly()) { return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE; } if (this->activeSubDevices < commandList->partitionCount) { return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE; } - - if (commandList->containsCooperativeKernels()) { - anyCommandListWithCooperativeKernels = true; - } else { - anyCommandListWithoutCooperativeKernels = true; - } - - if (commandList->getRequiredStreamState().frontEndState.disableEUFusion.value == 1) { - anyCommandListRequiresDisabledEUFusion = true; - } - - // If the Command List has commands that require uncached MOCS, 
then any changes to the commands in the queue requires the uncached MOCS - if (commandList->requiresQueueUncachedMocs && cachedMOCSAllowed == true) { - cachedMOCSAllowed = false; - } - - if (commandList->isMemoryPrefetchRequested()) { - performMemoryPrefetch = true; - } } - bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get(); - if (anyCommandListWithCooperativeKernels && anyCommandListWithoutCooperativeKernels && - (!isMixingRegularAndCooperativeKernelsAllowed)) { + if (ctx.anyCommandListWithCooperativeKernels && + ctx.anyCommandListWithoutCooperativeKernels && + (!NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get())) { return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE; } - size_t spaceForResidency = 0; - size_t preemptionSize = 0u; - size_t debuggerCmdsSize = 0; - constexpr size_t residencyContainerSpaceForPreemption = 2; - constexpr size_t residencyContainerSpaceForTagWrite = 1; - - NEO::Device *neoDevice = device->getNEODevice(); - auto devicePreemption = device->getDevicePreemptionMode(); - auto contextPreemptionMode = csr->getPreemptionMode(); - const bool initialPreemptionMode = contextPreemptionMode == NEO::PreemptionMode::Initial; - NEO::PreemptionMode statePreemption = contextPreemptionMode; - - const bool stateSipRequired = (initialPreemptionMode && devicePreemption == NEO::PreemptionMode::MidThread) || - (neoDevice->getDebugger() && NEO::Debugger::isDebugEnabled(internalUsage)); - - if (!isCopyOnlyCommandQueue) { - if (initialPreemptionMode) { - preemptionSize += NEO::PreemptionHelper::getRequiredPreambleSize(*neoDevice); - } - - if (stateSipRequired) { - preemptionSize += NEO::PreemptionHelper::getRequiredStateSipCmdSize(*neoDevice, csr->isRcs()); - } - } - - if (NEO::Debugger::isDebugEnabled(internalUsage) && !commandQueueDebugCmdsProgrammed) { - if (neoDevice->getSourceLevelDebugger() != nullptr) { - debuggerCmdsSize += 
NEO::PreambleHelper::getKernelDebuggingCommandsSize(true); - } else if (device->getL0Debugger()) { - debuggerCmdsSize += device->getL0Debugger()->getSbaAddressLoadCommandsSize(); - } - } - - if (devicePreemption == NEO::PreemptionMode::MidThread) { - spaceForResidency += residencyContainerSpaceForPreemption; - } - - bool directSubmissionEnabled = isCopyOnlyCommandQueue ? csr->isBlitterDirectSubmissionEnabled() : csr->isDirectSubmissionEnabled(); - bool programActivePartitionConfig = csr->isProgramActivePartitionConfigRequired(); - - L0::Fence *fence = nullptr; - - device->activateMetricGroups(); - - bool containsAnyRegularCmdList = false; - size_t totalCmdBuffers = 0; - uint32_t perThreadScratchSpaceSize = 0; - uint32_t perThreadPrivateScratchSize = 0; - NEO::PageFaultManager *pageFaultManager = nullptr; - if (performMigration) { - pageFaultManager = device->getDriverHandle()->getMemoryManager()->getPageFaultManager(); - if (pageFaultManager == nullptr) { - performMigration = false; - } - } - for (auto i = 0u; i < numCommandLists; i++) { - auto commandList = CommandList::fromHandle(phCommandLists[i]); - - commandList->csr = csr; - commandList->handleIndirectAllocationResidency(); - - containsAnyRegularCmdList |= commandList->cmdListType == CommandList::CommandListType::TYPE_REGULAR; - - totalCmdBuffers += commandList->commandContainer.getCmdBufferAllocations().size(); - spaceForResidency += commandList->commandContainer.getResidencyContainer().size(); - if (!isCopyOnlyCommandQueue) { - auto commandListPreemption = commandList->getCommandListPreemptionMode(); - if (statePreemption != commandListPreemption) { - if (preemptionCmdSyncProgramming) { - preemptionSize += NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(); - } - preemptionSize += NEO::PreemptionHelper::getRequiredCmdStreamSize(commandListPreemption, statePreemption); - statePreemption = commandListPreemption; - } - - perThreadScratchSpaceSize = std::max(perThreadScratchSpaceSize, 
commandList->getCommandListPerThreadScratchSize()); - - perThreadPrivateScratchSize = std::max(perThreadPrivateScratchSize, commandList->getCommandListPerThreadPrivateScratchSize()); - - if (commandList->getCommandListPerThreadScratchSize() != 0 || commandList->getCommandListPerThreadPrivateScratchSize() != 0) { - if (commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE) != nullptr) { - heapContainer.push_back(commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE)->getGraphicsAllocation()); - } - for (auto element : commandList->commandContainer.sshAllocations) { - heapContainer.push_back(element); - } - } - } - - partitionCount = std::max(partitionCount, commandList->partitionCount); - commandList->makeResidentAndMigrate(performMigration); - } - - size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof(MI_BATCH_BUFFER_START); - linearStreamSizeEstimate += csr->getCmdsSizeForHardwareContext(); - if (directSubmissionEnabled) { - linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_START); - } else { - linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_END); - } - - auto csrHw = reinterpret_cast *>(csr); - if (programActivePartitionConfig) { - linearStreamSizeEstimate += csrHw->getCmdSizeForActivePartitionConfig(); - } - const auto &hwInfo = this->device->getHwInfo(); - - spaceForResidency += residencyContainerSpaceForTagWrite; - - csr->getResidencyAllocations().reserve(spaceForResidency); - - auto scratchSpaceController = csr->getScratchSpaceController(); - bool gsbaStateDirty = false; - bool frontEndStateDirty = false; - handleScratchSpace(heapContainer, - scratchSpaceController, - gsbaStateDirty, frontEndStateDirty, - perThreadScratchSpaceSize, perThreadPrivateScratchSize); - - auto &streamProperties = csr->getStreamProperties(); - const auto &hwInfoConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily); - auto disableOverdispatch = hwInfoConfig.isDisableOverdispatchAvailable(hwInfo); - auto isEngineInstanced = 
csr->getOsContext().isEngineInstanced(); - bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); - if (!isPatchingVfeStateAllowed) { - streamProperties.frontEndState.setProperties(anyCommandListWithCooperativeKernels, anyCommandListRequiresDisabledEUFusion, - disableOverdispatch, isEngineInstanced, hwInfo); - } else { - streamProperties.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced); - } - frontEndStateDirty |= (streamProperties.frontEndState.isDirty() && !csr->getLogicalStateHelper()); - - gsbaStateDirty |= csr->getGSBAStateDirty(); - frontEndStateDirty |= csr->getMediaVFEStateDirty(); - bool gpgpuEnabled = csr->getPreambleSetFlag(); - if (!isCopyOnlyCommandQueue) { - - if (!gpgpuEnabled) { - linearStreamSizeEstimate += estimatePipelineSelect(); - } - - linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, numCommandLists, phCommandLists); - - if (gsbaStateDirty) { - linearStreamSizeEstimate += estimateStateBaseAddressCmdSize(); - } - - linearStreamSizeEstimate += preemptionSize + debuggerCmdsSize; - } - - if (NEO::DebugManager.flags.EnableSWTags.get()) { - linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags(); - } - - bool dispatchPostSync = isDispatchTaskCountPostSyncRequired(hFence, containsAnyRegularCmdList); - if (dispatchPostSync) { - linearStreamSizeEstimate += isCopyOnlyCommandQueue ? 
NEO::EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite() - : NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); - } - - linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(csr->getKernelArgsBufferAllocation(), csr->getLogicalStateHelper()); - - size_t alignedSize = alignUp(linearStreamSizeEstimate, minCmdBufferPtrAlign); - size_t padding = alignedSize - linearStreamSizeEstimate; - - const auto waitStatus = reserveLinearStreamSize(alignedSize); - if (waitStatus == NEO::WaitStatus::GpuHang) { - return ZE_RESULT_ERROR_DEVICE_LOST; - } - - NEO::LinearStream child(commandStream->getSpace(alignedSize), alignedSize); - child.setGpuBase(ptrOffset(commandStream->getGpuBase(), commandStream->getUsed() - alignedSize)); - - const auto globalFenceAllocation = csr->getGlobalFenceAllocation(); - if (globalFenceAllocation) { - csr->makeResident(*globalFenceAllocation); - } - - const auto workPartitionAllocation = csr->getWorkPartitionAllocation(); - if (workPartitionAllocation) { - csr->makeResident(*workPartitionAllocation); - } - - if (NEO::DebugManager.flags.EnableSWTags.get()) { - NEO::SWTagsManager *tagsManager = neoDevice->getRootDeviceEnvironment().tagsManager.get(); - UNRECOVERABLE_IF(tagsManager == nullptr); - csr->makeResident(*tagsManager->getBXMLHeapAllocation()); - csr->makeResident(*tagsManager->getSWTagHeapAllocation()); - tagsManager->insertBXMLHeapAddress(child); - tagsManager->insertSWTagHeapAddress(child); - } - - csr->programHardwareContext(child); - - if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getL0Debugger()) { - csr->makeResident(*device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId())); - } - - if (!isCopyOnlyCommandQueue) { - if (!gpgpuEnabled) { - programPipelineSelect(child); - csr->setPreambleSetFlag(true); - } - - if (NEO::Debugger::isDebugEnabled(internalUsage) && !commandQueueDebugCmdsProgrammed) { - if (neoDevice->getSourceLevelDebugger()) { - 
NEO::PreambleHelper::programKernelDebugging(&child); - commandQueueDebugCmdsProgrammed = true; - } else if (device->getL0Debugger()) { - device->getL0Debugger()->programSbaAddressLoad(child, - device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId())->getGpuAddress()); - commandQueueDebugCmdsProgrammed = true; - } - } - - if (gsbaStateDirty) { - auto indirectHeap = CommandList::fromHandle(phCommandLists[0])->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); - programStateBaseAddress(scratchSpaceController->calculateNewGSH(), indirectHeap->getGraphicsAllocation()->isAllocatedInLocalMemoryPool(), child, cachedMOCSAllowed); - } - - if (initialPreemptionMode) { - NEO::PreemptionHelper::programCsrBaseAddress(child, *neoDevice, csr->getPreemptionAllocation(), csr->getLogicalStateHelper()); - } - - if (stateSipRequired) { - NEO::PreemptionHelper::programStateSip(child, *neoDevice, csr->getLogicalStateHelper()); - } - - const bool sipKernelUsed = devicePreemption == NEO::PreemptionMode::MidThread || - (neoDevice->getDebugger() != nullptr && NEO::Debugger::isDebugEnabled(internalUsage)); - - if (devicePreemption == NEO::PreemptionMode::MidThread) { - csr->makeResident(*csr->getPreemptionAllocation()); - } - - if (sipKernelUsed) { - auto sipIsa = NEO::SipKernel::getSipKernel(*neoDevice).getSipAllocation(); - csr->makeResident(*sipIsa); - } - - if (NEO::Debugger::isDebugEnabled(internalUsage) && neoDevice->getDebugger()) { - UNRECOVERABLE_IF(device->getDebugSurface() == nullptr); - csr->makeResident(*device->getDebugSurface()); - } - } - - if (programActivePartitionConfig) { - csrHw->programActivePartitionConfig(child); - } - - NEO::EncodeKernelArgsBuffer::encodeKernelArgsBufferCmds(csr->getKernelArgsBufferAllocation(), csr->getLogicalStateHelper()); - - if (csr->getKernelArgsBufferAllocation()) { - csr->makeResident(*csr->getKernelArgsBufferAllocation()); - } - - if (csr->getLogicalStateHelper()) { - if (frontEndStateDirty && 
!isCopyOnlyCommandQueue) { - programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child); - frontEndStateDirty = false; - } - - csr->getLogicalStateHelper()->writeStreamInline(child, false); - } - - statePreemption = contextPreemptionMode; - - for (auto i = 0u; i < numCommandLists; ++i) { - auto commandList = CommandList::fromHandle(phCommandLists[i]); - auto &cmdBufferAllocations = commandList->commandContainer.getCmdBufferAllocations(); - auto cmdBufferCount = cmdBufferAllocations.size(); - bool immediateMode = (commandList->cmdListType == CommandList::CommandListType::TYPE_IMMEDIATE) ? true : false; - - if (!isCopyOnlyCommandQueue) { - auto commandListPreemption = commandList->getCommandListPreemptionMode(); - if (statePreemption != commandListPreemption) { - if (NEO::DebugManager.flags.EnableSWTags.get()) { - neoDevice->getRootDeviceEnvironment().tagsManager->insertTag( - child, - *neoDevice, - "ComandList Preemption Mode update", 0u); - } - - if (preemptionCmdSyncProgramming) { - NEO::PipeControlArgs args; - NEO::MemorySynchronizationCommands::addSingleBarrier(child, args); - } - NEO::PreemptionHelper::programCmdStream(child, - commandListPreemption, - statePreemption, - csr->getPreemptionAllocation()); - statePreemption = commandListPreemption; - } - - bool programVfe = frontEndStateDirty; - if (isPatchingVfeStateAllowed) { - auto &requiredStreamState = commandList->getRequiredStreamState(); - streamProperties.frontEndState.setProperties(requiredStreamState.frontEndState); - programVfe |= streamProperties.frontEndState.isDirty(); - } - - if (programVfe) { - programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child); - frontEndStateDirty = false; - } - - if (isPatchingVfeStateAllowed) { - auto &finalStreamState = commandList->getFinalStreamState(); - 
streamProperties.frontEndState.setProperties(finalStreamState.frontEndState); - } - } - - patchCommands(*commandList, scratchSpaceController->getScratchPatchAddress()); - - for (size_t iter = 0; iter < cmdBufferCount; iter++) { - auto allocation = cmdBufferAllocations[iter]; - uint64_t startOffset = allocation->getGpuAddress(); - if (immediateMode && (iter == (cmdBufferCount - 1))) { - startOffset = ptrOffset(allocation->getGpuAddress(), commandList->commandContainer.currentLinearStreamStartOffset); - } - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&child, startOffset, true); - } - - printfFunctionContainer.insert(printfFunctionContainer.end(), - commandList->getPrintfFunctionContainer().begin(), - commandList->getPrintfFunctionContainer().end()); - - auto commandListImp = static_cast(commandList); - if (!immediateMode && commandListImp->getLogicalStateHelper()) { - csr->getLogicalStateHelper()->mergePipelinedState(*commandListImp->getLogicalStateHelper()); - } - } - - if (performMigration) { - auto commandList = CommandList::fromHandle(phCommandLists[0]); - commandList->migrateSharedAllocations(); - } - - if (performMemoryPrefetch) { - auto prefetchManager = device->getDriverHandle()->getMemoryManager()->getPrefetchManager(); - prefetchManager->migrateAllocationsToGpu(*this->device->getDriverHandle()->getSvmAllocsManager(), *this->device->getNEODevice()); - performMemoryPrefetch = false; - } - - if (!isCopyOnlyCommandQueue && stateSipRequired) { - NEO::PreemptionHelper::programStateSipEndWa(child, *neoDevice); - } - - csr->setPreemptionMode(statePreemption); - - if (hFence) { - fence = Fence::fromHandle(hFence); - fence->assignTaskCountFromCsr(); - } - if (dispatchPostSync) { - dispatchTaskCountPostSync(child, hwInfo); - } - - csr->makeResident(*csr->getTagAllocation()); - void *endingCmd = nullptr; - if (directSubmissionEnabled) { - auto offset = ptrDiff(child.getCpuBase(), commandStream->getCpuBase()) + child.getUsed(); - uint64_t startAddress = 
commandStream->getGraphicsAllocation()->getGpuAddress() + offset; - if (NEO::DebugManager.flags.BatchBufferStartPrepatchingWaEnabled.get() == 0) { - startAddress = 0; - } - - endingCmd = child.getSpace(0); - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&child, startAddress, false); - } else { - MI_BATCH_BUFFER_END cmd = GfxFamily::cmdInitBatchBufferEnd; - auto buffer = child.getSpaceForCmd(); - *(MI_BATCH_BUFFER_END *)buffer = cmd; - } - - if (padding) { - void *paddingPtr = child.getSpace(padding); - memset(paddingPtr, 0, padding); - } - - auto ret = submitBatchBuffer(ptrDiff(child.getCpuBase(), commandStream->getCpuBase()), csr->getResidencyAllocations(), endingCmd, - anyCommandListWithCooperativeKernels); - - this->taskCount = csr->peekTaskCount(); - if (dispatchPostSync) { - csr->setLatestFlushedTaskCount(this->taskCount); - } - - csr->makeSurfacePackNonResident(csr->getResidencyAllocations(), false); - - ze_result_t retVal = ZE_RESULT_SUCCESS; - if (getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) { - const auto synchronizeResult = this->synchronize(std::numeric_limits::max()); - if (synchronizeResult == ZE_RESULT_ERROR_DEVICE_LOST) { - retVal = ZE_RESULT_ERROR_DEVICE_LOST; - } - } else { - csr->pollForCompletion(); - } - this->heapContainer.clear(); - - if ((ret != NEO::SubmissionStatus::SUCCESS) || (retVal == ZE_RESULT_ERROR_DEVICE_LOST)) { - for (auto &gfx : csr->getResidencyAllocations()) { - if (csr->peekLatestFlushedTaskCount() == 0) { - gfx->releaseUsageInOsContext(csr->getOsContext().getContextId()); - } else { - gfx->updateTaskCount(csr->peekLatestFlushedTaskCount(), csr->getOsContext().getContextId()); - } - } - if (retVal != ZE_RESULT_ERROR_DEVICE_LOST) { - retVal = ZE_RESULT_ERROR_UNKNOWN; - } - if (ret == NEO::SubmissionStatus::OUT_OF_MEMORY) { - retVal = ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; - } - } - - csr->getResidencyAllocations().clear(); - return retVal; + return ZE_RESULT_SUCCESS; } template -void 
CommandQueueHw::programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream) { +void CommandQueueHw::programOneCmdListFrontEndIfDirty( + CommandListExecutionContext &ctx, + CommandList *commandList, + NEO::LinearStream &cmdStream) { + + bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); + auto &streamProperties = this->csr->getStreamProperties(); + bool shouldProgramVfe = ctx.frontEndStateDirty; + + if (isPatchingVfeStateAllowed) { + auto &requiredStreamState = commandList->getRequiredStreamState(); + streamProperties.frontEndState.setProperties(requiredStreamState.frontEndState); + shouldProgramVfe |= streamProperties.frontEndState.isDirty(); + } + + this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, cmdStream); + + if (isPatchingVfeStateAllowed) { + auto &finalStreamState = commandList->getFinalStreamState(); + streamProperties.frontEndState.setProperties(finalStreamState.frontEndState); + } +} + +template +void CommandQueueHw::programFrontEndAndClearDirtyFlag( + bool shouldFrontEndBeProgrammed, + CommandListExecutionContext &ctx, + NEO::LinearStream &cmdStream) { + + if (!shouldFrontEndBeProgrammed) { + return; + } + auto scratchSpaceController = this->csr->getScratchSpaceController(); + programFrontEnd(scratchSpaceController->getScratchPatchAddress(), + scratchSpaceController->getPerThreadScratchSpaceSize(), + cmdStream); + ctx.frontEndStateDirty = false; +} + +template +void CommandQueueHw::programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &cmdStream) { UNRECOVERABLE_IF(csr == nullptr); auto &hwInfo = device->getHwInfo(); auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); auto engineGroupType = hwHelper.getEngineGroupType(csr->getOsContext().getEngineType(), csr->getOsContext().getEngineUsage(), hwInfo); - auto pVfeState = NEO::PreambleHelper::getSpaceForVfeState(&commandStream, hwInfo, 
engineGroupType); + auto pVfeState = NEO::PreambleHelper::getSpaceForVfeState(&cmdStream, hwInfo, engineGroupType); NEO::PreambleHelper::programVfeState(pVfeState, hwInfo, perThreadScratchSpaceSize, @@ -601,9 +375,13 @@ size_t CommandQueueHw::estimatePipelineSelect() { } template -void CommandQueueHw::programPipelineSelect(NEO::LinearStream &commandStream) { - NEO::PipelineSelectArgs args = {0, 0}; - NEO::PreambleHelper::programPipelineSelect(&commandStream, args, device->getHwInfo()); +void CommandQueueHw::programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &cmdStream) { + bool gpgpuEnabled = this->csr->getPreambleSetFlag(); + if (!gpgpuEnabled) { + NEO::PipelineSelectArgs args = {0, 0}; + NEO::PreambleHelper::programPipelineSelect(&cmdStream, args, device->getHwInfo()); + this->csr->setPreambleSetFlag(true); + } } template @@ -611,34 +389,638 @@ bool CommandQueueHw::isDispatchTaskCountPostSyncRequired(ze_fence return containsAnyRegularCmdList || !csr->isUpdateTagFromWaitEnabled() || hFence != nullptr || getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; } -template -void CommandQueueHw::dispatchTaskCountPostSync(NEO::LinearStream &commandStream, const NEO::HardwareInfo &hwInfo) { - uint64_t postSyncAddress = csr->getTagAllocation()->getGpuAddress(); - uint32_t postSyncData = csr->peekTaskCount() + 1; - - if (isCopyOnlyCommandQueue) { - NEO::MiFlushArgs args; - args.commandWithPostSync = true; - args.notifyEnable = csr->isUsedNotifyEnableForPostSync(); - NEO::EncodeMiFlushDW::programMiFlushDw(commandStream, postSyncAddress, postSyncData, args, hwInfo); - } else { - NEO::PipeControlArgs args; - args.dcFlushEnable = NEO::MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); - args.workloadPartitionOffset = partitionCount > 1; - args.notifyEnable = csr->isUsedNotifyEnableForPostSync(); - NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( - commandStream, - NEO::PostSyncMode::ImmediateData, - postSyncAddress, - postSyncData, - 
hwInfo, - args); - } -} - template bool CommandQueueHw::getPreemptionCmdProgramming() { return NEO::PreemptionHelper::getRequiredCmdStreamSize(NEO::PreemptionMode::MidThread, NEO::PreemptionMode::Initial) > 0u; } +template +CommandQueueHw::CommandListExecutionContext::CommandListExecutionContext( + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists, + NEO::PreemptionMode contextPreemptionMode, + Device *device, + bool debugEnabled, + bool programActivePartitionConfig, + bool performMigration) : preemptionMode{contextPreemptionMode}, + statePreemption{contextPreemptionMode}, + isPreemptionModeInitial{contextPreemptionMode == NEO::PreemptionMode::Initial}, + isDebugEnabled{debugEnabled}, + isProgramActivePartitionConfigRequired{programActivePartitionConfig}, + isMigrationRequested{performMigration} { + + constexpr size_t residencyContainerSpaceForPreemption = 2; + constexpr size_t residencyContainerSpaceForTagWrite = 1; + + for (auto i = 0u; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + + if (commandList->containsCooperativeKernels()) { + this->anyCommandListWithCooperativeKernels = true; + } else { + this->anyCommandListWithoutCooperativeKernels = true; + } + + if (commandList->getRequiredStreamState().frontEndState.disableEUFusion.value == 1) { + this->anyCommandListRequiresDisabledEUFusion = true; + } + + // If the Command List has commands that require uncached MOCS, then any changes to the commands in the queue requires the uncached MOCS + if (commandList->requiresQueueUncachedMocs && this->cachedMOCSAllowed == true) { + this->cachedMOCSAllowed = false; + } + + if (commandList->isMemoryPrefetchRequested()) { + this->performMemoryPrefetch = true; + } + } + this->isDevicePreemptionModeMidThread = device->getDevicePreemptionMode() == NEO::PreemptionMode::MidThread; + this->stateSipRequired = (this->isPreemptionModeInitial && this->isDevicePreemptionModeMidThread) || + 
this->isNEODebuggerActive(device); + + if (this->isDevicePreemptionModeMidThread) { + this->spaceForResidency += residencyContainerSpaceForPreemption; + } + this->spaceForResidency += residencyContainerSpaceForTagWrite; + + if (this->isMigrationRequested && device->getDriverHandle()->getMemoryManager()->getPageFaultManager() == nullptr) { + this->isMigrationRequested = false; + } +} + +template +bool CommandQueueHw::CommandListExecutionContext::isNEODebuggerActive(Device *device) { + return device->getNEODevice()->getDebugger() && this->isDebugEnabled; +} + +template +size_t CommandQueueHw::computeDebuggerCmdsSize(const CommandListExecutionContext &ctx) { + size_t debuggerCmdsSize = 0; + + if (ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed) { + if (this->device->getNEODevice()->getSourceLevelDebugger()) { + debuggerCmdsSize += NEO::PreambleHelper::getKernelDebuggingCommandsSize(true); + } else if (this->device->getL0Debugger()) { + debuggerCmdsSize += device->getL0Debugger()->getSbaAddressLoadCommandsSize(); + } + } + + return debuggerCmdsSize; +} + +template +size_t CommandQueueHw::computePreemptionSize( + CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists) { + + size_t preemptionSize = 0u; + NEO::Device *neoDevice = this->device->getNEODevice(); + + if (ctx.isPreemptionModeInitial) { + preemptionSize += NEO::PreemptionHelper::getRequiredPreambleSize(*neoDevice); + } + + if (ctx.stateSipRequired) { + preemptionSize += NEO::PreemptionHelper::getRequiredStateSipCmdSize(*neoDevice, this->csr->isRcs()); + } + + for (auto i = 0u; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + auto commandListPreemption = commandList->getCommandListPreemptionMode(); + + if (ctx.statePreemption != commandListPreemption) { + if (this->preemptionCmdSyncProgramming) { + preemptionSize += NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(); + } + preemptionSize += 
NEO::PreemptionHelper::getRequiredCmdStreamSize(commandListPreemption, ctx.statePreemption); + ctx.statePreemption = commandListPreemption; + } + } + + return preemptionSize; +} + +template +void CommandQueueHw::setupCmdListsAndContextParams( + CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists, + ze_fence_handle_t hFence) { + + for (auto i = 0u; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + + commandList->csr = this->csr; + commandList->handleIndirectAllocationResidency(); + + ctx.containsAnyRegularCmdList |= commandList->cmdListType == CommandList::CommandListType::TYPE_REGULAR; + ctx.spaceForResidency += commandList->commandContainer.getResidencyContainer().size(); + if (!isCopyOnlyCommandQueue) { + ctx.perThreadScratchSpaceSize = std::max(ctx.perThreadScratchSpaceSize, commandList->getCommandListPerThreadScratchSize()); + ctx.perThreadPrivateScratchSize = std::max(ctx.perThreadPrivateScratchSize, commandList->getCommandListPerThreadPrivateScratchSize()); + + if (commandList->getCommandListPerThreadScratchSize() != 0 || commandList->getCommandListPerThreadPrivateScratchSize() != 0) { + if (commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE) != nullptr) { + heapContainer.push_back(commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE)->getGraphicsAllocation()); + } + for (auto element : commandList->commandContainer.sshAllocations) { + heapContainer.push_back(element); + } + } + } + + this->partitionCount = std::max(this->partitionCount, commandList->partitionCount); + commandList->makeResidentAndMigrate(ctx.isMigrationRequested); + } + + ctx.isDispatchTaskCountPostSyncRequired = isDispatchTaskCountPostSyncRequired(hFence, ctx.containsAnyRegularCmdList); +} + +template +size_t CommandQueueHw::estimateLinearStreamSizeInitial( + CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t 
numCommandLists) { + + using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; + using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; + + size_t linearStreamSizeEstimate = 0u; + + for (auto i = 0u; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + linearStreamSizeEstimate += commandList->commandContainer.getCmdBufferAllocations().size(); + } + linearStreamSizeEstimate *= sizeof(MI_BATCH_BUFFER_START); + linearStreamSizeEstimate += this->csr->getCmdsSizeForHardwareContext(); + + if (ctx.isDirectSubmissionEnabled) { + linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_START); + } else { + linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_END); + } + + auto csrHw = reinterpret_cast *>(this->csr); + if (ctx.isProgramActivePartitionConfigRequired) { + linearStreamSizeEstimate += csrHw->getCmdSizeForActivePartitionConfig(); + } + + if (NEO::DebugManager.flags.EnableSWTags.get()) { + linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags(); + } + + linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(), + this->csr->getLogicalStateHelper()); + + return linearStreamSizeEstimate; +} + +template +void CommandQueueHw::setFrontEndStateProperties(CommandListExecutionContext &ctx) { + const auto &hwInfo = this->device->getHwInfo(); + const auto &hwInfoConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily); + auto disableOverdispatch = hwInfoConfig.isDisableOverdispatchAvailable(hwInfo); + + auto isEngineInstanced = csr->getOsContext().isEngineInstanced(); + auto &streamProperties = this->csr->getStreamProperties(); + bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); + if (!isPatchingVfeStateAllowed) { + streamProperties.frontEndState.setProperties(ctx.anyCommandListWithCooperativeKernels, ctx.anyCommandListRequiresDisabledEUFusion, + disableOverdispatch, 
isEngineInstanced, hwInfo); + } else { + streamProperties.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced); + } + ctx.frontEndStateDirty |= (streamProperties.frontEndState.isDirty() && !this->csr->getLogicalStateHelper()); + ctx.frontEndStateDirty |= csr->getMediaVFEStateDirty(); +} + +template +void CommandQueueHw::handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx) { + handleScratchSpace(this->heapContainer, + this->csr->getScratchSpaceController(), + ctx.gsbaStateDirty, ctx.frontEndStateDirty, + ctx.perThreadScratchSpaceSize, ctx.perThreadPrivateScratchSize); + ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty(); +} + +template +size_t CommandQueueHw::estimateLinearStreamSizeComplementary( + CommandListExecutionContext &ctx, + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists) { + + size_t linearStreamSizeEstimate = 0u; + bool gpgpuEnabled = csr->getPreambleSetFlag(); + + if (!gpgpuEnabled) { + linearStreamSizeEstimate += estimatePipelineSelect(); + } + + linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(ctx.frontEndStateDirty, numCommandLists, phCommandLists); + + if (ctx.gsbaStateDirty) { + linearStreamSizeEstimate += estimateStateBaseAddressCmdSize(); + } + + return linearStreamSizeEstimate; +} + +template +ze_result_t CommandQueueHw::makeAlignedChildStreamAndSetGpuBase(NEO::LinearStream &child, size_t requiredSize) { + + size_t alignedSize = alignUp(requiredSize, this->minCmdBufferPtrAlign); + + if (const auto waitStatus = this->reserveLinearStreamSize(alignedSize); waitStatus == NEO::WaitStatus::GpuHang) { + return ZE_RESULT_ERROR_DEVICE_LOST; + } + + child.replaceBuffer(this->commandStream->getSpace(alignedSize), alignedSize); + child.setGpuBase(ptrOffset(this->commandStream->getGpuBase(), this->commandStream->getUsed() - alignedSize)); + this->alignedChildStreamPadding = alignedSize - requiredSize; + return ZE_RESULT_SUCCESS; +} + +template +void 
CommandQueueHw::allocateGlobalFenceAndMakeItResident() { + const auto globalFenceAllocation = this->csr->getGlobalFenceAllocation(); + if (globalFenceAllocation) { + this->csr->makeResident(*globalFenceAllocation); + } +} + +template +void CommandQueueHw::allocateWorkPartitionAndMakeItResident() { + const auto workPartitionAllocation = this->csr->getWorkPartitionAllocation(); + if (workPartitionAllocation) { + this->csr->makeResident(*workPartitionAllocation); + } +} + +template +void CommandQueueHw::allocateTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(NEO::LinearStream &cmdStream) { + if (NEO::DebugManager.flags.EnableSWTags.get()) { + NEO::Device *neoDevice = this->device->getNEODevice(); + NEO::SWTagsManager *tagsManager = neoDevice->getRootDeviceEnvironment().tagsManager.get(); + UNRECOVERABLE_IF(tagsManager == nullptr); + this->csr->makeResident(*tagsManager->getBXMLHeapAllocation()); + this->csr->makeResident(*tagsManager->getSWTagHeapAllocation()); + tagsManager->insertBXMLHeapAddress(cmdStream); + tagsManager->insertSWTagHeapAddress(cmdStream); + } +} + +template +void CommandQueueHw::makeSbaTrackingBufferResidentIfL0DebuggerEnabled(bool isDebugEnabled) { + if (isDebugEnabled && this->device->getL0Debugger()) { + this->csr->makeResident(*this->device->getL0Debugger()->getSbaTrackingBuffer(this->csr->getOsContext().getContextId())); + } +} + +template +void CommandQueueHw::programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(bool isDebugEnabled, NEO::LinearStream &cmdStream) { + if (isDebugEnabled && !this->commandQueueDebugCmdsProgrammed) { + NEO::Device *neoDevice = device->getNEODevice(); + if (neoDevice->getSourceLevelDebugger()) { + NEO::PreambleHelper::programKernelDebugging(&cmdStream); + this->commandQueueDebugCmdsProgrammed = true; + } else if (this->device->getL0Debugger()) { + this->device->getL0Debugger()->programSbaAddressLoad(cmdStream, + 
device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId())->getGpuAddress()); + this->commandQueueDebugCmdsProgrammed = true; + } + } +} + +template +void CommandQueueHw::programSbaWithUpdatedGsbaIfDirty( + CommandListExecutionContext &ctx, + ze_command_list_handle_t hCommandList, + NEO::LinearStream &cmdStream) { + + if (!ctx.gsbaStateDirty) { + return; + } + auto indirectHeap = CommandList::fromHandle(hCommandList)->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); + auto scratchSpaceController = this->csr->getScratchSpaceController(); + programStateBaseAddress(scratchSpaceController->calculateNewGSH(), + indirectHeap->getGraphicsAllocation()->isAllocatedInLocalMemoryPool(), + cmdStream, + ctx.cachedMOCSAllowed); +} + +template +void CommandQueueHw::programCsrBaseAddressIfPreemptionModeInitial(bool isPreemptionModeInitial, NEO::LinearStream &cmdStream) { + if (!isPreemptionModeInitial) { + return; + } + NEO::Device *neoDevice = this->device->getNEODevice(); + NEO::PreemptionHelper::programCsrBaseAddress(cmdStream, + *neoDevice, + this->csr->getPreemptionAllocation(), + this->csr->getLogicalStateHelper()); +} + +template +void CommandQueueHw::programStateSip(bool isStateSipRequired, NEO::LinearStream &cmdStream) { + if (!isStateSipRequired) { + return; + } + NEO::Device *neoDevice = this->device->getNEODevice(); + NEO::PreemptionHelper::programStateSip(cmdStream, *neoDevice, this->csr->getLogicalStateHelper()); +} + +template +void CommandQueueHw::programStateSipEndWA(bool isStateSipRequired, NEO::LinearStream &cmdStream) { + if (!isStateSipRequired) { + return; + } + NEO::Device *neoDevice = this->device->getNEODevice(); + NEO::PreemptionHelper::programStateSipEndWa(cmdStream, *neoDevice); +} + +template +void CommandQueueHw::updateOneCmdListPreemptionModeAndCtxStatePreemption( + CommandListExecutionContext &ctx, + NEO::PreemptionMode commandListPreemption, + NEO::LinearStream &cmdStream) { + + NEO::Device *neoDevice = 
this->device->getNEODevice(); + if (ctx.statePreemption != commandListPreemption) { + if (NEO::DebugManager.flags.EnableSWTags.get()) { + neoDevice->getRootDeviceEnvironment().tagsManager->insertTag( + cmdStream, + *neoDevice, + "ComandList Preemption Mode update", 0u); + } + + if (this->preemptionCmdSyncProgramming) { + NEO::PipeControlArgs args; + NEO::MemorySynchronizationCommands::addSingleBarrier(cmdStream, args); + } + NEO::PreemptionHelper::programCmdStream(cmdStream, + commandListPreemption, + ctx.statePreemption, + this->csr->getPreemptionAllocation()); + ctx.statePreemption = commandListPreemption; + } +} + +template +void CommandQueueHw::makePreemptionAllocationResidentForModeMidThread(bool isDevicePreemptionModeMidThread) { + if (isDevicePreemptionModeMidThread) { + this->csr->makeResident(*this->csr->getPreemptionAllocation()); + } +} + +template +void CommandQueueHw::makeSipIsaResidentIfSipKernelUsed(CommandListExecutionContext &ctx) { + NEO::Device *neoDevice = this->device->getNEODevice(); + if (ctx.isDevicePreemptionModeMidThread || ctx.isNEODebuggerActive(this->device)) { + auto sipIsa = NEO::SipKernel::getSipKernel(*neoDevice).getSipAllocation(); + this->csr->makeResident(*sipIsa); + } +} + +template +void CommandQueueHw::makeDebugSurfaceResidentIfNEODebuggerActive(bool isNEODebuggerActive) { + if (!isNEODebuggerActive) { + return; + } + UNRECOVERABLE_IF(this->device->getDebugSurface() == nullptr); + this->csr->makeResident(*this->device->getDebugSurface()); +} + +template +void CommandQueueHw::programActivePartitionConfig( + bool isProgramActivePartitionConfigRequired, + NEO::LinearStream &cmdStream) { + + if (!isProgramActivePartitionConfigRequired) { + return; + } + auto csrHw = reinterpret_cast *>(this->csr); + csrHw->programActivePartitionConfig(cmdStream); +} + +template +void CommandQueueHw::encodeKernelArgsBufferAndMakeItResident() { + NEO::EncodeKernelArgsBuffer::encodeKernelArgsBufferCmds(this->csr->getKernelArgsBufferAllocation(), + 
this->csr->getLogicalStateHelper()); + if (this->csr->getKernelArgsBufferAllocation()) { + this->csr->makeResident(*this->csr->getKernelArgsBufferAllocation()); + } +} + +template +void CommandQueueHw::writeCsrStreamInlineIfLogicalStateHelperAvailable(NEO::LinearStream &cmdStream) { + if (this->csr->getLogicalStateHelper()) { + this->csr->getLogicalStateHelper()->writeStreamInline(cmdStream, false); + } +} + +template +void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &cmdStream) { + + auto &cmdBufferAllocations = commandList->commandContainer.getCmdBufferAllocations(); + auto cmdBufferCount = cmdBufferAllocations.size(); + bool isCommandListImmediate = (commandList->cmdListType == CommandList::CommandListType::TYPE_IMMEDIATE) ? true : false; + + for (size_t iter = 0; iter < cmdBufferCount; iter++) { + auto allocation = cmdBufferAllocations[iter]; + uint64_t startOffset = allocation->getGpuAddress(); + if (isCommandListImmediate && (iter == (cmdBufferCount - 1))) { + startOffset = ptrOffset(allocation->getGpuAddress(), commandList->commandContainer.currentLinearStreamStartOffset); + } + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&cmdStream, startOffset, true); + } +} + +template +void CommandQueueHw::mergeOneCmdListPipelinedState(CommandList *commandList) { + + bool isCommandListImmediate = (commandList->cmdListType == CommandList::CommandListType::TYPE_IMMEDIATE) ? 
true : false; + auto commandListImp = static_cast(commandList); + if (!isCommandListImmediate && commandListImp->getLogicalStateHelper()) { + this->csr->getLogicalStateHelper()->mergePipelinedState(*commandListImp->getLogicalStateHelper()); + } +} + +template +void CommandQueueHw::collectPrintfContentsFromAllCommandsLists( + ze_command_list_handle_t *phCommandLists, + uint32_t numCommandLists) { + + for (auto i = 0u; i < numCommandLists; ++i) { + auto commandList = CommandList::fromHandle(phCommandLists[i]); + this->printfFunctionContainer.insert(this->printfFunctionContainer.end(), + commandList->getPrintfFunctionContainer().begin(), + commandList->getPrintfFunctionContainer().end()); + } +} + +template +void CommandQueueHw::migrateSharedAllocationsIfRequested( + bool isMigrationRequested, + ze_command_list_handle_t hCommandList) { + + if (isMigrationRequested) { + CommandList::fromHandle(hCommandList)->migrateSharedAllocations(); + } +} + +template +void CommandQueueHw::prefetchMemoryIfRequested(bool &isMemoryPrefetchRequested) { + if (isMemoryPrefetchRequested) { + auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager(); + prefetchManager->migrateAllocationsToGpu(*this->device->getDriverHandle()->getSvmAllocsManager(), + *this->device->getNEODevice()); + isMemoryPrefetchRequested = false; + } +} + +template +void CommandQueueHw::assignCsrTaskCountToFenceIfAvailable(ze_fence_handle_t hFence) { + if (hFence) { + Fence::fromHandle(hFence)->assignTaskCountFromCsr(); + } +} + +template +void CommandQueueHw::dispatchTaskCountPostSyncByMiFlushDw( + bool isDispatchTaskCountPostSyncRequired, + NEO::LinearStream &cmdStream) { + + if (!isDispatchTaskCountPostSyncRequired) { + return; + } + + uint64_t postSyncAddress = this->csr->getTagAllocation()->getGpuAddress(); + uint32_t postSyncData = this->csr->peekTaskCount() + 1; + const auto &hwInfo = this->device->getHwInfo(); + + NEO::MiFlushArgs args; + args.commandWithPostSync = true; 
+ args.notifyEnable = this->csr->isUsedNotifyEnableForPostSync(); + + NEO::EncodeMiFlushDW::programMiFlushDw(cmdStream, postSyncAddress, postSyncData, args, hwInfo); +} + +template +void CommandQueueHw::dispatchTaskCountPostSyncRegular( + bool isDispatchTaskCountPostSyncRequired, + NEO::LinearStream &cmdStream) { + + if (!isDispatchTaskCountPostSyncRequired) { + return; + } + + uint64_t postSyncAddress = this->csr->getTagAllocation()->getGpuAddress(); + uint32_t postSyncData = this->csr->peekTaskCount() + 1; + const auto &hwInfo = this->device->getHwInfo(); + + NEO::PipeControlArgs args; + args.dcFlushEnable = NEO::MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); + args.workloadPartitionOffset = this->partitionCount > 1; + args.notifyEnable = this->csr->isUsedNotifyEnableForPostSync(); + NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + cmdStream, + NEO::PostSyncMode::ImmediateData, + postSyncAddress, + postSyncData, + hwInfo, + args); +} + +template +void CommandQueueHw::makeCsrTagAllocationResident() { + this->csr->makeResident(*this->csr->getTagAllocation()); +} + +template +NEO::SubmissionStatus CommandQueueHw::prepareAndSubmitBatchBuffer( + CommandListExecutionContext &ctx, + NEO::LinearStream &innerCommandStream) { + + using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; + + auto &outerCommandStream = *this->commandStream; + + void *endingCmd = nullptr; + if (ctx.isDirectSubmissionEnabled) { + auto offset = ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()) + innerCommandStream.getUsed(); + uint64_t startAddress = outerCommandStream.getGraphicsAllocation()->getGpuAddress() + offset; + if (NEO::DebugManager.flags.BatchBufferStartPrepatchingWaEnabled.get() == 0) { + startAddress = 0; + } + + endingCmd = innerCommandStream.getSpace(0); + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&innerCommandStream, startAddress, false); + } else { + auto buffer = 
innerCommandStream.getSpaceForCmd(); + *(MI_BATCH_BUFFER_END *)buffer = GfxFamily::cmdInitBatchBufferEnd; + } + + if (this->alignedChildStreamPadding) { + void *paddingPtr = innerCommandStream.getSpace(this->alignedChildStreamPadding); + memset(paddingPtr, 0, this->alignedChildStreamPadding); + } + + return submitBatchBuffer(ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()), + csr->getResidencyAllocations(), + endingCmd, + ctx.anyCommandListWithCooperativeKernels); +} + +template +void CommandQueueHw::updateTaskCountAndPostSync(bool isDispatchTaskCountPostSyncRequired) { + + if (!isDispatchTaskCountPostSyncRequired) { + return; + } + this->taskCount = this->csr->peekTaskCount(); + this->csr->setLatestFlushedTaskCount(this->taskCount); +} + +template +ze_result_t CommandQueueHw::waitForCommandQueueCompletionAndCleanHeapContainer() { + + ze_result_t ret = ZE_RESULT_SUCCESS; + + if (this->getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) { + if (const auto syncRet = this->synchronize(std::numeric_limits::max()); syncRet == ZE_RESULT_ERROR_DEVICE_LOST) { + ret = syncRet; + } + } else { + this->csr->pollForCompletion(); + } + this->heapContainer.clear(); + + return ret; +} + +template +ze_result_t CommandQueueHw::handleSubmissionAndCompletionResults( + NEO::SubmissionStatus submitRet, + ze_result_t completionRet) { + + if ((submitRet != NEO::SubmissionStatus::SUCCESS) || (completionRet == ZE_RESULT_ERROR_DEVICE_LOST)) { + for (auto &gfx : this->csr->getResidencyAllocations()) { + if (this->csr->peekLatestFlushedTaskCount() == 0) { + gfx->releaseUsageInOsContext(this->csr->getOsContext().getContextId()); + } else { + gfx->updateTaskCount(this->csr->peekLatestFlushedTaskCount(), this->csr->getOsContext().getContextId()); + } + } + if (completionRet != ZE_RESULT_ERROR_DEVICE_LOST) { + completionRet = ZE_RESULT_ERROR_UNKNOWN; + } + if (submitRet == NEO::SubmissionStatus::OUT_OF_MEMORY) { + completionRet = 
ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; + } + } + + return completionRet; +} + } // namespace L0