/*
 * Copyright (C) 2019-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/experimental_command_buffer.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/scratch_space_controller_base.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/page_table_mngr.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/flat_batch_buffer_helper_hw.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/tag_allocator.h"

#include "command_stream_receiver_hw_ext.inl"
#include "pipe_control_args.h"

namespace NEO {

template <typename GfxFamily>
CommandStreamReceiverHw<GfxFamily>::~CommandStreamReceiverHw() = default;

// Sets up platform defaults obtained from HwHelper and then applies debug-flag
// overrides. For every overridable setting a flag value of -1 leaves the
// platform default untouched.
template <typename GfxFamily>
CommandStreamReceiverHw<GfxFamily>::CommandStreamReceiverHw(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex)
    : CommandStreamReceiver(executionEnvironment, rootDeviceIndex) {

    auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily);
    localMemoryEnabled = hwHelper.getEnableLocalMemory(peekHwInfo());

    requiredThreadArbitrationPolicy = hwHelper.getDefaultThreadArbitrationPolicy();
    resetKmdNotifyHelper(new KmdNotifyHelper(&peekHwInfo().capabilityTable.kmdNotifyProperties));
    flatBatchBufferHelper.reset(new FlatBatchBufferHelperHw<GfxFamily>(executionEnvironment));
    defaultSshSize = getSshHeapSize();

    timestampPacketWriteEnabled = hwHelper.timestampPacketWriteSupported();
    if (DebugManager.flags.EnableTimestampPacket.get() != -1) {
        timestampPacketWriteEnabled = !!DebugManager.flags.EnableTimestampPacket.get();
    }
    createScratchSpaceController();

    useNewResourceImplicitFlush = checkPlatformSupportsNewResourceImplicitFlush();
    int32_t overrideNewResourceImplicitFlush = DebugManager.flags.PerformImplicitFlushForNewResource.get();
    if (overrideNewResourceImplicitFlush != -1) {
        useNewResourceImplicitFlush = overrideNewResourceImplicitFlush != 0;
    }

    useGpuIdleImplicitFlush = checkPlatformSupportsGpuIdleImplicitFlush();
    int32_t overrideGpuIdleImplicitFlush = DebugManager.flags.PerformImplicitFlushForIdleGpu.get();
    if (overrideGpuIdleImplicitFlush != -1) {
        useGpuIdleImplicitFlush = overrideGpuIdleImplicitFlush != 0;
    }
}

// This implementation submits nothing and always reports success.
template <typename GfxFamily>
bool CommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
    return true;
}

// Appends an MI_BATCH_BUFFER_END to commandStream.
// patchLocation is optional; when non-null it receives a pointer to the emitted command.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::addBatchBufferEnd(LinearStream &commandStream, void **patchLocation) {
    using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;

    auto pCmd = commandStream.getSpaceForCmd<MI_BATCH_BUFFER_END>();
    *pCmd = GfxFamily::cmdInitBatchBufferEnd;
    if (patchLocation) {
        *patchLocation = pCmd;
    }
}

// Terminates commandStream. With direct submission enabled an MI_BATCH_BUFFER_START
// programmed with address 0 is emitted; otherwise an MI_BATCH_BUFFER_END is emitted.
// patchLocation is optional on both paths; when non-null it receives a pointer to
// the emitted command.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled) {
    if (directSubmissionEnabled) {
        auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
        // Guard the out-parameter the same way addBatchBufferEnd does.
        if (patchLocation) {
            *patchLocation = bbStart;
        }
        MI_BATCH_BUFFER_START cmd = {};
        addBatchBufferStart(&cmd, 0ull, false);
        *bbStart = cmd;
    } else {
        this->addBatchBufferEnd(commandStream, patchLocation);
    }
}

// Writes an MI_BATCH_BUFFER_START that targets startAddress (PPGTT address space)
// into commandBufferMemory. When `secondary` is set the command is marked as a
// second-level batch. If FlattenBatchBufferForAUBDump is enabled, the location of
// commandBufferMemory is registered with the flat batch buffer helper.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress, bool secondary) {
    MI_BATCH_BUFFER_START cmd = GfxFamily::cmdInitBatchBufferStart;

    cmd.setBatchBufferStartAddressGraphicsaddress472(startAddress);
    cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);
    if (secondary) {
        cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
    }
    if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) {
        flatBatchBufferHelper->registerBatchBufferStartAddress(reinterpret_cast<uint64_t>(commandBufferMemory), startAddress);
    }
    *commandBufferMemory = cmd;
}

// Zero-pads commandStream until its used size is a multiple of
// MemoryConstants::cacheLineSize. The mask arithmetic relies on that constant
// being a power of two.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::alignToCacheLine(LinearStream &commandStream) {
    auto used = commandStream.getUsed();
    auto alignment = MemoryConstants::cacheLineSize;
    auto partialCacheline = used & (alignment - 1);
    if (partialCacheline) {
        auto amountToPad = alignment - partialCacheline;
        auto pCmd = commandStream.getSpace(amountToPad);
        memset(pCmd, 0, amountToPad);
    }
}
template inline size_t CommandStreamReceiverHw::getRequiredCmdSizeForPreamble(Device &device) const { size_t size = 0; if (mediaVfeStateDirty) { size += PreambleHelper::getVFECommandsSize(); } if (!this->isPreambleSent) { size += PreambleHelper::getAdditionalCommandsSize(device); } if (!this->isPreambleSent || this->lastSentThreadArbitrationPolicy != this->requiredThreadArbitrationPolicy) { size += PreambleHelper::getThreadArbitrationCommandsSize(); } if (DebugManager.flags.ForcePerDssBackedBufferProgramming.get()) { if (!this->isPreambleSent) { size += PreambleHelper::getPerDssBackedBufferCommandsSize(device.getHardwareInfo()); } } if (!this->isPreambleSent) { if (DebugManager.flags.ForceSemaphoreDelayBetweenWaits.get() > -1) { size += PreambleHelper::getSemaphoreDelayCommandSize(); } } return size; } template inline void CommandStreamReceiverHw::addPipeControlCmd( LinearStream &commandStream, PipeControlArgs &args) { MemorySynchronizationCommands::addPipeControl(commandStream, args); } template void CommandStreamReceiverHw::programHardwareContext(LinearStream &cmdStream) { programEnginePrologue(cmdStream); } template size_t CommandStreamReceiverHw::getCmdsSizeForHardwareContext() const { return getCmdSizeForPrologue(); } template CompletionStamp CommandStreamReceiverHw::flushTask( LinearStream &commandStreamTask, size_t commandStreamStartTask, const IndirectHeap &dsh, const IndirectHeap &ioh, const IndirectHeap &ssh, uint32_t taskLevel, DispatchFlags &dispatchFlags, Device &device) { typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START; typedef typename GfxFamily::MI_BATCH_BUFFER_END MI_BATCH_BUFFER_END; typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL; typedef typename GfxFamily::STATE_BASE_ADDRESS STATE_BASE_ADDRESS; DEBUG_BREAK_IF(&commandStreamTask == &commandStream); DEBUG_BREAK_IF(!(dispatchFlags.preemptionMode == PreemptionMode::Disabled ? 
device.getPreemptionMode() == PreemptionMode::Disabled : true)); DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskLevel", taskLevel); auto levelClosed = false; bool implicitFlush = dispatchFlags.implicitFlush || dispatchFlags.blocking || DebugManager.flags.ForceImplicitFlush.get(); void *currentPipeControlForNooping = nullptr; void *epiloguePipeControlLocation = nullptr; if (DebugManager.flags.ForceCsrFlushing.get()) { flushBatchedSubmissions(); } if (detectInitProgrammingFlagsRequired(dispatchFlags)) { initProgrammingFlags(); } if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) { if (this->dispatchMode == DispatchMode::ImmediateDispatch) { //for ImmediateDispatch we will send this right away, therefore this pipe control will close the level //for BatchedSubmissions it will be nooped and only last ppc in batch will be emitted. levelClosed = true; //if we guard with ppc, flush dc as well to speed up completion latency if (dispatchFlags.guardCommandBufferWithPipeControl) { dispatchFlags.dcFlush = true; } } epiloguePipeControlLocation = ptrOffset(commandStreamTask.getCpuBase(), commandStreamTask.getUsed()); if ((dispatchFlags.outOfOrderExecutionAllowed || timestampPacketWriteEnabled) && !dispatchFlags.dcFlush) { currentPipeControlForNooping = epiloguePipeControlLocation; } auto address = getTagAllocation()->getGpuAddress(); PipeControlArgs args(dispatchFlags.dcFlush); MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( commandStreamTask, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, taskCount + 1, peekHwInfo(), args); this->latestSentTaskCount = taskCount + 1; DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount()); if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, 
PatchInfoAllocationType::TagAddress, commandStreamTask.getGraphicsAllocation()->getGpuAddress(), commandStreamTask.getUsed() - 2 * sizeof(uint64_t), PatchInfoAllocationType::Default)); flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, PatchInfoAllocationType::TagValue, commandStreamTask.getGraphicsAllocation()->getGpuAddress(), commandStreamTask.getUsed() - sizeof(uint64_t), PatchInfoAllocationType::Default)); } } if (DebugManager.flags.ForceSLML3Config.get()) { dispatchFlags.useSLM = true; } if (DebugManager.flags.OverrideThreadArbitrationPolicy.get() != -1) { dispatchFlags.threadArbitrationPolicy = static_cast(DebugManager.flags.OverrideThreadArbitrationPolicy.get()); } auto newL3Config = PreambleHelper::getL3Config(peekHwInfo(), dispatchFlags.useSLM); csrSizeRequestFlags.l3ConfigChanged = this->lastSentL3Config != newL3Config; csrSizeRequestFlags.coherencyRequestChanged = this->lastSentCoherencyRequest != static_cast(dispatchFlags.requiresCoherency); csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode; csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast(dispatchFlags.pipelineSelectArgs.mediaSamplerRequired); csrSizeRequestFlags.specialPipelineSelectModeChanged = this->lastSpecialPipelineSelectMode != dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode; if (dispatchFlags.numGrfRequired == GrfConfig::NotApplicable) { dispatchFlags.numGrfRequired = lastSentNumGrfRequired; } csrSizeRequestFlags.numGrfRequiredChanged = this->lastSentNumGrfRequired != dispatchFlags.numGrfRequired; lastSentNumGrfRequired = dispatchFlags.numGrfRequired; if (dispatchFlags.threadArbitrationPolicy != ThreadArbitrationPolicy::NotPresent) { this->requiredThreadArbitrationPolicy = dispatchFlags.threadArbitrationPolicy; } auto force32BitAllocations = getMemoryManager()->peekForce32BitAllocations(); bool stateBaseAddressDirty = false; bool checkVfeStateDirty = false; if 
(requiredScratchSize || requiredPrivateScratchSize) { scratchSpaceController->setRequiredScratchSpace(ssh.getCpuBase(), 0u, requiredScratchSize, requiredPrivateScratchSize, this->taskCount, *this->osContext, stateBaseAddressDirty, checkVfeStateDirty); if (checkVfeStateDirty) { setMediaVFEStateDirty(true); } if (scratchSpaceController->getScratchSpaceAllocation()) { makeResident(*scratchSpaceController->getScratchSpaceAllocation()); } if (scratchSpaceController->getPrivateScratchSpaceAllocation()) { makeResident(*scratchSpaceController->getPrivateScratchSpaceAllocation()); } } if (dispatchFlags.usePerDssBackedBuffer) { if (!perDssBackedBuffer) { createPerDssBackedBuffer(device); } makeResident(*perDssBackedBuffer); } if (dispatchFlags.additionalKernelExecInfo != AdditionalKernelExecInfo::NotApplicable && lastAdditionalKernelExecInfo != dispatchFlags.additionalKernelExecInfo) { setMediaVFEStateDirty(true); } auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device)); auto commandStreamStartCSR = commandStreamCSR.getUsed(); TimestampPacketHelper::programCsrDependencies(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices()); if (stallingPipeControlOnNextFlushRequired) { programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags); } programEngineModeCommands(commandStreamCSR, dispatchFlags); if (executionEnvironment.rootDeviceEnvironments[device.getRootDeviceIndex()]->pageTableManager.get() && !pageTableManagerInitialized) { pageTableManagerInitialized = executionEnvironment.rootDeviceEnvironments[device.getRootDeviceIndex()]->pageTableManager->initPageTableManagerRegisters(this); } programHardwareContext(commandStreamCSR); programComputeMode(commandStreamCSR, dispatchFlags); programPipelineSelect(commandStreamCSR, dispatchFlags.pipelineSelectArgs); programL3(commandStreamCSR, dispatchFlags, newL3Config); programPreamble(commandStreamCSR, device, dispatchFlags, newL3Config); 
programMediaSampler(commandStreamCSR, dispatchFlags); if (this->lastSentThreadArbitrationPolicy != this->requiredThreadArbitrationPolicy) { PreambleHelper::programThreadArbitration(&commandStreamCSR, this->requiredThreadArbitrationPolicy); this->lastSentThreadArbitrationPolicy = this->requiredThreadArbitrationPolicy; } stateBaseAddressDirty |= ((GSBAFor32BitProgrammed ^ dispatchFlags.gsba32BitRequired) && force32BitAllocations); programVFEState(commandStreamCSR, dispatchFlags, device.getDeviceInfo().maxFrontEndThreads); programPreemption(commandStreamCSR, dispatchFlags); bool dshDirty = dshState.updateAndCheck(&dsh); bool iohDirty = iohState.updateAndCheck(&ioh); bool sshDirty = sshState.updateAndCheck(&ssh); auto isStateBaseAddressDirty = dshDirty || iohDirty || sshDirty || stateBaseAddressDirty; auto mocsIndex = latestSentStatelessMocsConfig; auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily); if (dispatchFlags.l3CacheSettings != L3CachingSettings::NotApplicable) { auto l3On = dispatchFlags.l3CacheSettings != L3CachingSettings::l3CacheOff; auto l1On = dispatchFlags.l3CacheSettings == L3CachingSettings::l3AndL1On; mocsIndex = hwHelper.getMocsIndex(*device.getGmmHelper(), l3On, l1On); } if (mocsIndex != latestSentStatelessMocsConfig) { isStateBaseAddressDirty = true; latestSentStatelessMocsConfig = mocsIndex; } bool sourceLevelDebuggerActive = device.getSourceLevelDebugger() != nullptr ? 
true : false; //Reprogram state base address if required if (isStateBaseAddressDirty || sourceLevelDebuggerActive) { addPipeControlBeforeStateBaseAddress(commandStreamCSR); programAdditionalPipelineSelect(commandStreamCSR, dispatchFlags.pipelineSelectArgs, true); uint64_t newGSHbase = 0; GSBAFor32BitProgrammed = false; if (is64bit && scratchSpaceController->getScratchSpaceAllocation() && !force32BitAllocations) { newGSHbase = scratchSpaceController->calculateNewGSH(); } else if (is64bit && force32BitAllocations && dispatchFlags.gsba32BitRequired) { bool useLocalMemory = scratchSpaceController->getScratchSpaceAllocation() ? scratchSpaceController->getScratchSpaceAllocation()->isAllocatedInLocalMemoryPool() : false; newGSHbase = getMemoryManager()->getExternalHeapBaseAddress(rootDeviceIndex, useLocalMemory); GSBAFor32BitProgrammed = true; } auto stateBaseAddressCmdOffset = commandStreamCSR.getUsed(); auto pCmd = static_cast(commandStreamCSR.getSpace(sizeof(STATE_BASE_ADDRESS))); STATE_BASE_ADDRESS cmd; auto instructionHeapBaseAddress = getMemoryManager()->getInternalHeapBaseAddress(rootDeviceIndex, !hwHelper.useSystemMemoryPlacementForISA(peekHwInfo())); StateBaseAddressHelper::programStateBaseAddress( &cmd, &dsh, &ioh, &ssh, newGSHbase, true, mocsIndex, getMemoryManager()->getInternalHeapBaseAddress(rootDeviceIndex, ioh.getGraphicsAllocation()->isAllocatedInLocalMemoryPool()), instructionHeapBaseAddress, true, device.getGmmHelper(), isMultiOsContextCapable()); *pCmd = cmd; if (sshDirty) { bindingTableBaseAddressRequired = true; } if (bindingTableBaseAddressRequired) { StateBaseAddressHelper::programBindingTableBaseAddress(commandStreamCSR, ssh, device.getGmmHelper()); bindingTableBaseAddressRequired = false; } programAdditionalPipelineSelect(commandStreamCSR, dispatchFlags.pipelineSelectArgs, false); programStateSip(commandStreamCSR, device); if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { 
collectStateBaseAddresPatchInfo(commandStream.getGraphicsAllocation()->getGpuAddress(), stateBaseAddressCmdOffset, dsh, ioh, ssh, newGSHbase); } } DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskLevel", (uint32_t)this->taskLevel); if (executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo()->workaroundTable.waSamplerCacheFlushBetweenRedescribedSurfaceReads) { if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) { PipeControlArgs args; args.textureCacheInvalidationEnable = true; addPipeControlCmd(commandStreamCSR, args); if (this->samplerCacheFlushRequired == SamplerCacheFlushState::samplerCacheFlushBefore) { this->samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushAfter; } else { this->samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired; } } } if (experimentalCmdBuffer.get() != nullptr) { size_t startingOffset = experimentalCmdBuffer->programExperimentalCommandBuffer(); experimentalCmdBuffer->injectBufferStart(commandStreamCSR, startingOffset); } if (requiresInstructionCacheFlush) { PipeControlArgs args; args.instructionCacheInvalidateEnable = true; MemorySynchronizationCommands::addPipeControl(commandStreamCSR, args); requiresInstructionCacheFlush = false; } // Add a PC if we have a dependency on a previous walker to avoid concurrency issues. 
if (taskLevel > this->taskLevel) { if (!timestampPacketWriteEnabled) { PipeControlArgs args; MemorySynchronizationCommands::addPipeControl(commandStreamCSR, args); } this->taskLevel = taskLevel; DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskCount", peekTaskCount()); } if (DebugManager.flags.ForcePipeControlPriorToWalker.get()) { forcePipeControl(commandStreamCSR); } auto dshAllocation = dsh.getGraphicsAllocation(); auto iohAllocation = ioh.getGraphicsAllocation(); auto sshAllocation = ssh.getGraphicsAllocation(); this->makeResident(*dshAllocation); dshAllocation->setEvictable(false); this->makeResident(*iohAllocation); this->makeResident(*sshAllocation); iohAllocation->setEvictable(false); this->makeResident(*tagAllocation); if (globalFenceAllocation) { makeResident(*globalFenceAllocation); } if (preemptionAllocation) { makeResident(*preemptionAllocation); } if (dispatchFlags.preemptionMode == PreemptionMode::MidThread || sourceLevelDebuggerActive) { makeResident(*SipKernel::getSipKernelAllocation(device)); if (debugSurface) { makeResident(*debugSurface); } } if (experimentalCmdBuffer.get() != nullptr) { experimentalCmdBuffer->makeResidentAllocations(); } // If the CSR has work in its CS, flush it before the task bool submitTask = commandStreamStartTask != commandStreamTask.getUsed(); bool submitCSR = (commandStreamStartCSR != commandStreamCSR.getUsed()) || this->isMultiOsContextCapable(); bool submitCommandStreamFromCsr = false; void *bbEndLocation = nullptr; auto bbEndPaddingSize = this->dispatchMode == DispatchMode::ImmediateDispatch ? 
0 : sizeof(MI_BATCH_BUFFER_START) - sizeof(MI_BATCH_BUFFER_END); size_t chainedBatchBufferStartOffset = 0; GraphicsAllocation *chainedBatchBuffer = nullptr; bool directSubmissionEnabled = isDirectSubmissionEnabled(); if (submitTask) { programEndingCmd(commandStreamTask, &bbEndLocation, directSubmissionEnabled); this->emitNoop(commandStreamTask, bbEndPaddingSize); this->alignToCacheLine(commandStreamTask); if (submitCSR) { chainedBatchBufferStartOffset = commandStreamCSR.getUsed(); chainedBatchBuffer = commandStreamTask.getGraphicsAllocation(); // Add MI_BATCH_BUFFER_START to chain from CSR -> Task auto pBBS = reinterpret_cast(commandStreamCSR.getSpace(sizeof(MI_BATCH_BUFFER_START))); addBatchBufferStart(pBBS, ptrOffset(commandStreamTask.getGraphicsAllocation()->getGpuAddress(), commandStreamStartTask), false); if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) { flatBatchBufferHelper->registerCommandChunk(commandStreamTask.getGraphicsAllocation()->getGpuAddress(), reinterpret_cast(commandStreamTask.getCpuBase()), commandStreamStartTask, static_cast(ptrDiff(bbEndLocation, commandStreamTask.getGraphicsAllocation()->getGpuAddress())) + sizeof(MI_BATCH_BUFFER_START)); } auto commandStreamAllocation = commandStreamTask.getGraphicsAllocation(); DEBUG_BREAK_IF(commandStreamAllocation == nullptr); this->makeResident(*commandStreamAllocation); this->alignToCacheLine(commandStreamCSR); submitCommandStreamFromCsr = true; } else if (dispatchFlags.epilogueRequired) { this->makeResident(*commandStreamCSR.getGraphicsAllocation()); } this->programEpilogue(commandStreamCSR, &bbEndLocation, dispatchFlags); } else if (submitCSR) { programEndingCmd(commandStreamCSR, &bbEndLocation, directSubmissionEnabled); this->emitNoop(commandStreamCSR, bbEndPaddingSize); this->alignToCacheLine(commandStreamCSR); DEBUG_BREAK_IF(commandStreamCSR.getUsed() > commandStreamCSR.getMaxAvailableSpace()); submitCommandStreamFromCsr = true; } size_t startOffset = submitCommandStreamFromCsr ? 
commandStreamStartCSR : commandStreamStartTask; auto &streamToSubmit = submitCommandStreamFromCsr ? commandStreamCSR : commandStreamTask; BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, chainedBatchBuffer, dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount, streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation}; if (submitCSR | submitTask) { if (this->dispatchMode == DispatchMode::ImmediateDispatch) { this->flush(batchBuffer, this->getResidencyAllocations()); this->latestFlushedTaskCount = this->taskCount + 1; this->makeSurfacePackNonResident(this->getResidencyAllocations()); } else { auto commandBuffer = new CommandBuffer(device); commandBuffer->batchBuffer = batchBuffer; commandBuffer->surfaces.swap(this->getResidencyAllocations()); commandBuffer->batchBufferEndLocation = bbEndLocation; commandBuffer->taskCount = this->taskCount + 1; commandBuffer->flushStamp->replaceStampObject(dispatchFlags.flushStampReference); commandBuffer->pipeControlThatMayBeErasedLocation = currentPipeControlForNooping; commandBuffer->epiloguePipeControlLocation = epiloguePipeControlLocation; this->submissionAggregator->recordCommandBuffer(commandBuffer); } } else { this->makeSurfacePackNonResident(this->getResidencyAllocations()); } //check if we are not over the budget, if we are do implicit flush if (getMemoryManager()->isMemoryBudgetExhausted()) { if (this->totalMemoryUsed >= device.getDeviceInfo().globalMemSize / 4) { implicitFlush = true; } } if (DebugManager.flags.PerformImplicitFlushEveryEnqueueCount.get() != -1) { if ((taskCount + 1) % DebugManager.flags.PerformImplicitFlushEveryEnqueueCount.get() == 0) { implicitFlush = true; } } if (this->newResources) { implicitFlush = true; this->newResources = false; } implicitFlush |= checkImplicitFlushForGpuIdle(); if (this->dispatchMode == DispatchMode::BatchedDispatch && implicitFlush) { 
this->flushBatchedSubmissions(); } ++taskCount; DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount()); DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", tagAddress ? *tagAddress : 0); CompletionStamp completionStamp = { taskCount, this->taskLevel, flushStamp->peekStamp()}; this->taskLevel += levelClosed ? 1 : 0; return completionStamp; } template void CommandStreamReceiverHw::forcePipeControl(NEO::LinearStream &commandStreamCSR) { PipeControlArgs args; MemorySynchronizationCommands::addPipeControlWithCSStallOnly(commandStreamCSR, args); MemorySynchronizationCommands::addPipeControl(commandStreamCSR, args); } template inline void CommandStreamReceiverHw::programStallingPipeControlForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags) { stallingPipeControlOnNextFlushRequired = false; auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes; if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) { auto barrierTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*dispatchFlags.barrierTimestampPacketNodes->peekNodes()[0]); PipeControlArgs args(true); MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, barrierTimestampPacketGpuAddress, 0, peekHwInfo(), args); dispatchFlags.barrierTimestampPacketNodes->makeResident(*this); } else { PipeControlArgs args; MemorySynchronizationCommands::addPipeControl(cmdStream, args); } } template inline bool CommandStreamReceiverHw::flushBatchedSubmissions() { if (this->dispatchMode == DispatchMode::ImmediateDispatch) { return true; } typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START; typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL; std::unique_lock lockGuard(ownershipMutex); bool submitResult = true; auto &commandBufferList = 
this->submissionAggregator->peekCmdBufferList(); if (!commandBufferList.peekIsEmpty()) { const auto totalMemoryBudget = static_cast(commandBufferList.peekHead()->device.getDeviceInfo().globalMemSize / 2); ResidencyContainer surfacesForSubmit; ResourcePackage resourcePackage; auto pipeControlLocationSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(peekHwInfo()); void *currentPipeControlForNooping = nullptr; void *epiloguePipeControlLocation = nullptr; while (!commandBufferList.peekIsEmpty()) { size_t totalUsedSize = 0u; this->submissionAggregator->aggregateCommandBuffers(resourcePackage, totalUsedSize, totalMemoryBudget, osContext->getContextId()); auto primaryCmdBuffer = commandBufferList.removeFrontOne(); auto nextCommandBuffer = commandBufferList.peekHead(); auto currentBBendLocation = primaryCmdBuffer->batchBufferEndLocation; auto lastTaskCount = primaryCmdBuffer->taskCount; FlushStampUpdateHelper flushStampUpdateHelper; flushStampUpdateHelper.insert(primaryCmdBuffer->flushStamp->getStampReference()); currentPipeControlForNooping = primaryCmdBuffer->pipeControlThatMayBeErasedLocation; epiloguePipeControlLocation = primaryCmdBuffer->epiloguePipeControlLocation; if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) { flatBatchBufferHelper->registerCommandChunk(primaryCmdBuffer.get()->batchBuffer, sizeof(MI_BATCH_BUFFER_START)); } while (nextCommandBuffer && nextCommandBuffer->inspectionId == primaryCmdBuffer->inspectionId) { //noop pipe control if (currentPipeControlForNooping) { if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { flatBatchBufferHelper->removePipeControlData(pipeControlLocationSize, currentPipeControlForNooping, peekHwInfo()); } memset(currentPipeControlForNooping, 0, pipeControlLocationSize); } //obtain next candidate for nooping currentPipeControlForNooping = nextCommandBuffer->pipeControlThatMayBeErasedLocation; //track epilogue pipe control epiloguePipeControlLocation = 
nextCommandBuffer->epiloguePipeControlLocation; flushStampUpdateHelper.insert(nextCommandBuffer->flushStamp->getStampReference()); auto nextCommandBufferAddress = nextCommandBuffer->batchBuffer.commandBufferAllocation->getGpuAddress(); auto offsetedCommandBuffer = (uint64_t)ptrOffset(nextCommandBufferAddress, nextCommandBuffer->batchBuffer.startOffset); addBatchBufferStart((MI_BATCH_BUFFER_START *)currentBBendLocation, offsetedCommandBuffer, false); if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) { flatBatchBufferHelper->registerCommandChunk(nextCommandBuffer->batchBuffer, sizeof(MI_BATCH_BUFFER_START)); } currentBBendLocation = nextCommandBuffer->batchBufferEndLocation; lastTaskCount = nextCommandBuffer->taskCount; nextCommandBuffer = nextCommandBuffer->next; commandBufferList.removeFrontOne(); } surfacesForSubmit.reserve(resourcePackage.size() + 1); for (auto &surface : resourcePackage) { surfacesForSubmit.push_back(surface); } //make sure we flush DC if needed if (epiloguePipeControlLocation) { bool flushDcInEpilogue = true; if (DebugManager.flags.DisableDcFlushInEpilogue.get()) { flushDcInEpilogue = false; } ((PIPE_CONTROL *)epiloguePipeControlLocation)->setDcFlushEnable(flushDcInEpilogue); } primaryCmdBuffer->batchBuffer.endCmdPtr = currentBBendLocation; if (!this->flush(primaryCmdBuffer->batchBuffer, surfacesForSubmit)) { submitResult = false; break; } //after flush task level is closed this->taskLevel++; flushStampUpdateHelper.updateAll(flushStamp->peekStamp()); this->latestFlushedTaskCount = lastTaskCount; this->makeSurfacePackNonResident(surfacesForSubmit); resourcePackage.clear(); } this->totalMemoryUsed = 0; } return submitResult; } template size_t CommandStreamReceiverHw::getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device) { size_t size = getRequiredCmdStreamSize(dispatchFlags, device); return alignUp(size, MemoryConstants::cacheLineSize); } template size_t 
CommandStreamReceiverHw::getRequiredCmdStreamSize(const DispatchFlags &dispatchFlags, Device &device) { size_t size = getRequiredCmdSizeForPreamble(device); size += getRequiredStateBaseAddressSize(); if (!this->isStateSipSent || device.isDebuggerActive()) { size += PreemptionHelper::getRequiredStateSipCmdSize(device); } size += MemorySynchronizationCommands::getSizeForSinglePipeControl(); size += sizeof(typename GfxFamily::MI_BATCH_BUFFER_START); size += getCmdSizeForL3Config(); size += getCmdSizeForComputeMode(); size += getCmdSizeForMediaSampler(dispatchFlags.pipelineSelectArgs.mediaSamplerRequired); size += getCmdSizeForPipelineSelect(); size += getCmdSizeForPreemption(dispatchFlags); size += getCmdSizeForEpilogue(dispatchFlags); size += getCmdsSizeForHardwareContext(); if (executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo()->workaroundTable.waSamplerCacheFlushBetweenRedescribedSurfaceReads) { if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) { size += sizeof(typename GfxFamily::PIPE_CONTROL); } } if (experimentalCmdBuffer.get() != nullptr) { size += experimentalCmdBuffer->getRequiredInjectionSize(); } size += TimestampPacketHelper::getRequiredCmdStreamSize(dispatchFlags.csrDependencies); if (stallingPipeControlOnNextFlushRequired) { auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes; if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() > 0) { size += MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(peekHwInfo()); } else { size += sizeof(typename GfxFamily::PIPE_CONTROL); } } if (requiresInstructionCacheFlush) { size += sizeof(typename GfxFamily::PIPE_CONTROL); } if (DebugManager.flags.ForcePipeControlPriorToWalker.get()) { size += 2 * sizeof(PIPE_CONTROL); } return size; } template inline size_t CommandStreamReceiverHw::getCmdSizeForPipelineSelect() const { size_t size = 0; if 
((csrSizeRequestFlags.mediaSamplerConfigChanged || csrSizeRequestFlags.specialPipelineSelectModeChanged || !isPreambleSent) && !isPipelineSelectAlreadyProgrammed()) { size += PreambleHelper::getCmdSizeForPipelineSelect(peekHwInfo()); } return size; } template inline void CommandStreamReceiverHw::emitNoop(LinearStream &commandStream, size_t bytesToUpdate) { if (bytesToUpdate) { auto ptr = commandStream.getSpace(bytesToUpdate); memset(ptr, 0, bytesToUpdate); } } template inline void CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { int64_t waitTimeout = 0; bool enableTimeout = kmdNotifyHelper->obtainTimeoutParams(waitTimeout, useQuickKmdSleep, *getTagAddress(), taskCountToWait, flushStampToWait, forcePowerSavingMode); PRINT_DEBUG_STRING(DebugManager.flags.LogWaitingForCompletion.get(), stdout, "\nWaiting for task count %u at location %p. Current value: %u\n", taskCountToWait, getTagAddress(), *getTagAddress()); auto status = waitForCompletionWithTimeout(enableTimeout, waitTimeout, taskCountToWait); if (!status) { waitForFlushStamp(flushStampToWait); //now call blocking wait, this is to ensure that task count is reached waitForCompletionWithTimeout(false, 0, taskCountToWait); } UNRECOVERABLE_IF(*getTagAddress() < taskCountToWait); if (kmdNotifyHelper->quickKmdSleepForSporadicWaitsEnabled()) { kmdNotifyHelper->updateLastWaitForCompletionTimestamp(); } PRINT_DEBUG_STRING(DebugManager.flags.LogWaitingForCompletion.get(), stdout, "\nWaiting completed. 
Current value: %u\n", *getTagAddress()); } template inline const HardwareInfo &CommandStreamReceiverHw::peekHwInfo() const { return *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo(); } template inline void CommandStreamReceiverHw::programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags) { PreemptionHelper::programCmdStream(csr, dispatchFlags.preemptionMode, this->lastPreemptionMode, preemptionAllocation); this->lastPreemptionMode = dispatchFlags.preemptionMode; } template inline size_t CommandStreamReceiverHw::getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const { return PreemptionHelper::getRequiredCmdStreamSize(dispatchFlags.preemptionMode, this->lastPreemptionMode); } template inline void CommandStreamReceiverHw::programStateSip(LinearStream &cmdStream, Device &device) { if (!this->isStateSipSent || device.isDebuggerActive()) { PreemptionHelper::programStateSip(cmdStream, device); this->isStateSipSent = true; } } template inline void CommandStreamReceiverHw::programPreamble(LinearStream &csr, Device &device, DispatchFlags &dispatchFlags, uint32_t &newL3Config) { if (!this->isPreambleSent) { GraphicsAllocation *perDssBackedBufferToUse = dispatchFlags.usePerDssBackedBuffer ? 
this->perDssBackedBuffer : nullptr; PreambleHelper::programPreamble(&csr, device, newL3Config, this->requiredThreadArbitrationPolicy, this->preemptionAllocation, perDssBackedBufferToUse); this->isPreambleSent = true; this->lastSentL3Config = newL3Config; this->lastSentThreadArbitrationPolicy = this->requiredThreadArbitrationPolicy; } } template inline void CommandStreamReceiverHw::programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads) { if (mediaVfeStateDirty) { if (dispatchFlags.additionalKernelExecInfo != AdditionalKernelExecInfo::NotApplicable) { lastAdditionalKernelExecInfo = dispatchFlags.additionalKernelExecInfo; } auto commandOffset = PreambleHelper::programVFEState(&csr, peekHwInfo(), requiredScratchSize, getScratchPatchAddress(), maxFrontEndThreads, getOsContext().getEngineType(), lastAdditionalKernelExecInfo); if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { flatBatchBufferHelper->collectScratchSpacePatchInfo(getScratchPatchAddress(), commandOffset, csr); } setMediaVFEStateDirty(false); } } template void CommandStreamReceiverHw::programMediaSampler(LinearStream &commandStream, DispatchFlags &dispatchFlags) { } template size_t CommandStreamReceiverHw::getCmdSizeForMediaSampler(bool mediaSamplerRequired) const { return 0; } template void CommandStreamReceiverHw::collectStateBaseAddresPatchInfo( uint64_t baseAddress, uint64_t commandOffset, const LinearStream &dsh, const LinearStream &ioh, const LinearStream &ssh, uint64_t generalStateBase) { typedef typename GfxFamily::STATE_BASE_ADDRESS STATE_BASE_ADDRESS; PatchInfoData dynamicStatePatchInfo = {dsh.getGraphicsAllocation()->getGpuAddress(), 0u, PatchInfoAllocationType::DynamicStateHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::DYNAMICSTATEBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default}; PatchInfoData generalStatePatchInfo = {generalStateBase, 0u, PatchInfoAllocationType::GeneralStateHeap, baseAddress, commandOffset + 
STATE_BASE_ADDRESS::PATCH_CONSTANTS::GENERALSTATEBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default}; PatchInfoData surfaceStatePatchInfo = {ssh.getGraphicsAllocation()->getGpuAddress(), 0u, PatchInfoAllocationType::SurfaceStateHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::SURFACESTATEBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default}; PatchInfoData indirectObjectPatchInfo = {ioh.getGraphicsAllocation()->getGpuAddress(), 0u, PatchInfoAllocationType::IndirectObjectHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::INDIRECTOBJECTBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default}; flatBatchBufferHelper->setPatchInfoData(dynamicStatePatchInfo); flatBatchBufferHelper->setPatchInfoData(generalStatePatchInfo); flatBatchBufferHelper->setPatchInfoData(surfaceStatePatchInfo); flatBatchBufferHelper->setPatchInfoData(indirectObjectPatchInfo); } template void CommandStreamReceiverHw::resetKmdNotifyHelper(KmdNotifyHelper *newHelper) { kmdNotifyHelper.reset(newHelper); kmdNotifyHelper->updateAcLineStatus(); if (kmdNotifyHelper->quickKmdSleepForSporadicWaitsEnabled()) { kmdNotifyHelper->updateLastWaitForCompletionTimestamp(); } } template void CommandStreamReceiverHw::addClearSLMWorkAround(typename GfxFamily::PIPE_CONTROL *pCmd) { } template uint64_t CommandStreamReceiverHw::getScratchPatchAddress() { return scratchSpaceController->getScratchPatchAddress(); } template bool CommandStreamReceiverHw::detectInitProgrammingFlagsRequired(const DispatchFlags &dispatchFlags) const { return DebugManager.flags.ForceCsrReprogramming.get(); } template uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) { using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; using MI_FLUSH_DW = typename GfxFamily::MI_FLUSH_DW; auto lock = obtainUniqueOwnership(); auto &commandStream = 
getCS(BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, profilingEnabled, PauseOnGpuProperties::featureEnabled(DebugManager.flags.PauseOnBlitCopy.get()), *this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex])); auto commandStreamStart = commandStream.getUsed(); auto newTaskCount = taskCount + 1; latestSentTaskCount = newTaskCount; if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::BeforeWorkload)) { BlitCommandsHelper::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(), DebugPauseState::waitingForUserStartConfirmation, DebugPauseState::hasUserStartConfirmation); } programEnginePrologue(commandStream); for (auto &blitProperties : blitPropertiesContainer) { TimestampPacketHelper::programCsrDependencies(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices()); if (blitProperties.outputTimestampPacket && profilingEnabled) { auto timestampContextStartGpuAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextStart); auto timestampGlobalStartAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].globalStart); EncodeStoreMMIO::encode(commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress); EncodeStoreMMIO::encode(commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress); } BlitCommandsHelper::dispatchBlitCommands(blitProperties, commandStream, *this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]); if (blitProperties.outputTimestampPacket) { if (profilingEnabled) { auto timestampContextEndGpuAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd); auto timestampGlobalEndAddress = blitProperties.outputTimestampPacket->getGpuAddress() + 
offsetof(TimestampPacketStorage, packets[0].globalEnd); EncodeStoreMMIO::encode(commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress); EncodeStoreMMIO::encode(commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress); } else { auto timestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*blitProperties.outputTimestampPacket); EncodeMiFlushDW::programMiFlushDw(commandStream, timestampPacketGpuAddress, 0, true, true); } makeResident(*blitProperties.outputTimestampPacket->getBaseGraphicsAllocation()); } blitProperties.csrDependencies.makeResident(*this); makeResident(*blitProperties.srcAllocation); makeResident(*blitProperties.dstAllocation); } MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); EncodeMiFlushDW::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), newTaskCount, false, true); MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) { BlitCommandsHelper::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(), DebugPauseState::waitingForUserEndConfirmation, DebugPauseState::hasUserEndConfirmation); } auto batchBufferEnd = reinterpret_cast(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_END))); *batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd; alignToCacheLine(commandStream); makeResident(*tagAllocation); if (globalFenceAllocation) { makeResident(*globalFenceAllocation); } BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, commandStream.getUsed(), &commandStream, nullptr}; flush(batchBuffer, getResidencyAllocations()); makeSurfacePackNonResident(getResidencyAllocations()); 
latestFlushedTaskCount = newTaskCount; taskCount = newTaskCount; auto flushStampToWait = flushStamp->peekStamp(); lock.unlock(); if (blocking) { waitForTaskCountWithKmdNotifyFallback(newTaskCount, flushStampToWait, false, false); internalAllocationStorage->cleanAllocationList(newTaskCount, TEMPORARY_ALLOCATION); } return newTaskCount; } template inline void CommandStreamReceiverHw::programAdditionalPipelineSelect(LinearStream &csr, PipelineSelectArgs &pipelineSelectArgs, bool is3DPipeline) { auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily); if (hwHelper.is3DPipelineSelectWARequired(peekHwInfo()) && isRcs()) { auto localPipelineSelectArgs = pipelineSelectArgs; localPipelineSelectArgs.is3DPipelineRequired = is3DPipeline; PreambleHelper::programPipelineSelect(&csr, localPipelineSelectArgs, peekHwInfo()); } } template inline bool CommandStreamReceiverHw::isComputeModeNeeded() const { return false; } template inline bool CommandStreamReceiverHw::isPipelineSelectAlreadyProgrammed() const { auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily); return isComputeModeNeeded() && hwHelper.is3DPipelineSelectWARequired(peekHwInfo()) && isRcs(); } template inline void CommandStreamReceiverHw::programEpilogue(LinearStream &csr, void **batchBufferEndLocation, DispatchFlags &dispatchFlags) { if (dispatchFlags.epilogueRequired) { auto currentOffset = ptrDiff(csr.getSpace(0u), csr.getCpuBase()); auto gpuAddress = ptrOffset(csr.getGraphicsAllocation()->getGpuAddress(), currentOffset); addBatchBufferStart(reinterpret_cast(*batchBufferEndLocation), gpuAddress, false); this->programEpliogueCommands(csr, dispatchFlags); programEndingCmd(csr, batchBufferEndLocation, isDirectSubmissionEnabled()); this->alignToCacheLine(csr); } } template inline size_t CommandStreamReceiverHw::getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const { if (dispatchFlags.epilogueRequired) { size_t terminateCmd = sizeof(typename GfxFamily::MI_BATCH_BUFFER_END); if 
(isDirectSubmissionEnabled()) { terminateCmd = sizeof(typename GfxFamily::MI_BATCH_BUFFER_START); } auto size = getCmdSizeForEpilogueCommands(dispatchFlags) + terminateCmd; return alignUp(size, MemoryConstants::cacheLineSize); } return 0u; } template inline void CommandStreamReceiverHw::programEnginePrologue(LinearStream &csr) { } template inline size_t CommandStreamReceiverHw::getCmdSizeForPrologue() const { return 0u; } template inline bool CommandStreamReceiverHw::initDirectSubmission(Device &device, OsContext &osContext) { bool ret = true; if (DebugManager.flags.EnableDirectSubmission.get() == 1) { auto contextEngineType = osContext.getEngineType(); const DirectSubmissionProperties &directSubmissionProperty = device.getHardwareInfo().capabilityTable.directSubmissionEngines.data[contextEngineType]; bool startDirect = true; if (!osContext.isDefaultContext()) { startDirect = directSubmissionProperty.useNonDefault; } if (osContext.isLowPriority()) { startDirect = directSubmissionProperty.useLowPriority; } if (osContext.isInternalEngine()) { startDirect = directSubmissionProperty.useInternal; } if (osContext.isRootDevice()) { startDirect = directSubmissionProperty.useRootDevice; } bool submitOnInit = directSubmissionProperty.submitOnInit; bool engineSupported = checkDirectSubmissionSupportsEngine(directSubmissionProperty, contextEngineType, submitOnInit); if (engineSupported && startDirect) { if (contextEngineType == aub_stream::ENGINE_BCS) { blitterDirectSubmission = DirectSubmissionHw>::create(device, osContext); ret = blitterDirectSubmission->initialize(submitOnInit); } else { directSubmission = DirectSubmissionHw>::create(device, osContext); ret = directSubmission->initialize(submitOnInit); this->dispatchMode = DispatchMode::ImmediateDispatch; } } } return ret; } template inline bool CommandStreamReceiverHw::checkDirectSubmissionSupportsEngine(const DirectSubmissionProperties &directSubmissionProperty, aub_stream::EngineType contextEngineType, bool &startOnInit) 
{ bool supported = directSubmissionProperty.engineSupported; startOnInit = directSubmissionProperty.submitOnInit; if (contextEngineType == aub_stream::ENGINE_BCS) { int32_t blitterOverrideKey = DebugManager.flags.DirectSubmissionOverrideBlitterSupport.get(); if (blitterOverrideKey != -1) { supported = blitterOverrideKey == 0 ? false : true; startOnInit = blitterOverrideKey == 1 ? true : false; } } else if (contextEngineType == aub_stream::ENGINE_RCS) { int32_t renderOverrideKey = DebugManager.flags.DirectSubmissionOverrideRenderSupport.get(); if (renderOverrideKey != -1) { supported = renderOverrideKey == 0 ? false : true; startOnInit = renderOverrideKey == 1 ? true : false; } } else { //assume else is CCS int32_t computeOverrideKey = DebugManager.flags.DirectSubmissionOverrideComputeSupport.get(); if (computeOverrideKey != -1) { supported = computeOverrideKey == 0 ? false : true; startOnInit = computeOverrideKey == 1 ? true : false; } } return supported; } } // namespace NEO