/*
 * Copyright (c) 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "runtime/command_stream/command_stream_receiver_hw.h"
#include "runtime/command_stream/linear_stream.h"
#include "runtime/device/device.h"
#include "runtime/gtpin/gtpin_notify.h"
#include "runtime/helpers/cache_policy.h"
#include "runtime/helpers/flat_batch_buffer_helper_hw.h"
#include "runtime/helpers/preamble.h"
#include "runtime/helpers/ptr_math.h"
#include "runtime/helpers/state_base_address.h"
#include "runtime/helpers/options.h"
#include "runtime/memory_manager/memory_manager.h"
#include "runtime/os_interface/debug_settings_manager.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "command_stream_receiver_hw.h"

// NOTE(review): in this copy of the file every angle-bracketed template
// argument list appears to be missing (bare `template`, `CommandStreamReceiverHw::`,
// `reinterpret_cast(...)`, `static_cast(...)`, `std::unique_ptr(...)`,
// `TakeOwnershipWrapper deviceOwnership`). The code below refers to `GfxFamily`
// throughout, so the definitions are presumably templates over a GfxFamily
// parameter - TODO confirm against the upstream file and restore the arguments.

namespace OCLRT {

// Constructor.
// Keeps a reference/copy of hwInfoIn in hwInfo, initialises the required thread
// arbitration policy from PreambleHelper's default, installs a KmdNotifyHelper
// built from hwInfoIn.capabilityTable.kmdNotifyProperties, and creates the flat
// batch buffer helper bound to this->memoryManager.
template
CommandStreamReceiverHw::CommandStreamReceiverHw(const HardwareInfo &hwInfoIn) : hwInfo(hwInfoIn) {
    requiredThreadArbitrationPolicy = PreambleHelper::getDefaultThreadArbitrationPolicy();
    resetKmdNotifyHelper(new KmdNotifyHelper(&(hwInfoIn.capabilityTable.kmdNotifyProperties)));
    flatBatchBufferHelper.reset(new FlatBatchBufferHelperHw(this->memoryManager));
}

// Base flush implementation: none of the arguments are used here; the function
// only returns the value currently held by flushStamp.
// NOTE(review): presumably overridden by OS/backend specific receivers - confirm.
template
FlushStamp CommandStreamReceiverHw::flush(BatchBuffer &batchBuffer, EngineType engineType, ResidencyContainer *allocationsForResidency) {
    return flushStamp->peekStamp();
}

// Appends an MI_BATCH_BUFFER_END command to commandStream.
//   commandStream  - stream that receives the command.
//   patchLocation  - optional out-parameter; when non-null it receives the CPU
//                    address at which the command was written.
template
inline void CommandStreamReceiverHw::addBatchBufferEnd(LinearStream &commandStream, void **patchLocation) {
    typedef typename GfxFamily::MI_BATCH_BUFFER_END MI_BATCH_BUFFER_END;

    auto pCmd = (MI_BATCH_BUFFER_END *)commandStream.getSpace(sizeof(MI_BATCH_BUFFER_END));
    *pCmd = GfxFamily::cmdInitBatchBufferEnd;
    if (patchLocation) {
        *patchLocation = pCmd;
    }
}

// Writes an MI_BATCH_BUFFER_START command in place at commandBufferMemory.
//   commandBufferMemory - destination of the command; its previous content is
//                         overwritten with GfxFamily::cmdInitBatchBufferStart.
//   startAddress        - address programmed as the batch buffer start address.
// The address space indicator is set to PPGTT. When the FlattenBatchBufferForAUBDump
// debug flag is set, the (command location, startAddress) pair is also registered
// with flatBatchBufferHelper.
template
inline void CommandStreamReceiverHw::addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress) {
    *commandBufferMemory = GfxFamily::cmdInitBatchBufferStart;
    commandBufferMemory->setBatchBufferStartAddressGraphicsaddress472(startAddress);
    commandBufferMemory->setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);
    if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) {
        flatBatchBufferHelper->registerBatchBufferStartAddress(reinterpret_cast(commandBufferMemory), startAddress);
    }
}

// Pads commandStream with zero bytes until its used size is a multiple of
// MemoryConstants::cacheLineSize. Nothing is written when it is already aligned.
template
inline void CommandStreamReceiverHw::alignToCacheLine(LinearStream &commandStream) {
    auto used = commandStream.getUsed();
    auto alignment = MemoryConstants::cacheLineSize;
    // Mask-based remainder: assumes cacheLineSize is a power of two - TODO confirm.
    auto partialCacheline = used & (alignment - 1);
    if (partialCacheline) {
        auto amountToPad = alignment - partialCacheline;
        auto pCmd = commandStream.getSpace(amountToPad);
        memset(pCmd, 0, amountToPad);
    }
}

// Returns the number of bytes budgeted for the preamble part of the CSR stream:
// one PIPE_CONTROL, one MEDIA_VFE_STATE, PreambleHelper's additional commands,
// plus the thread arbitration commands when the preamble has not been sent yet
// or the required arbitration policy differs from the last one sent.
template
inline size_t CommandStreamReceiverHw::getRequiredCmdSizeForPreamble() const {
    size_t size = sizeof(typename GfxFamily::PIPE_CONTROL) + sizeof(typename GfxFamily::MEDIA_VFE_STATE);
    size += PreambleHelper::getAdditionalCommandsSize(*memoryManager->device);
    if (!this->isPreambleSent || this->lastSentThreadArbitrationPolicy != this->requiredThreadArbitrationPolicy) {
        size += PreambleHelper::getThreadArbitrationCommandsSize();
    }
    return size;
}

// Appends a PIPE_CONTROL initialised from GfxFamily::cmdInitPipeControl with the
// command streamer stall bit enabled, and returns a pointer to it so the caller
// can set further fields.
template
inline typename GfxFamily::PIPE_CONTROL *CommandStreamReceiverHw::addPipeControlCmd(LinearStream &commandStream) {
    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
    auto pCmd = reinterpret_cast(commandStream.getSpace(sizeof(PIPE_CONTROL)));
    *pCmd = GfxFamily::cmdInitPipeControl;
    pCmd->setCommandStreamerStallEnable(true);
    return pCmd;
}

// Programs pipeline select into commandStream when the media sampler
// configuration changed (csrSizeRequestFlags) or the preamble has not been sent,
// and records dispatchFlags.mediaSamplerRequired as the last sent configuration.
// The condition must stay in sync with getCmdSizeForPipelineSelect().
template
void CommandStreamReceiverHw::programPipelineSelect(LinearStream &commandStream, DispatchFlags &dispatchFlags) {
    if (csrSizeRequestFlags.mediaSamplerConfigChanged || !isPreambleSent) {
        PreambleHelper::programPipelineSelect(&commandStream, dispatchFlags.mediaSamplerRequired);
        this->lastMediaSamplerConfig = dispatchFlags.mediaSamplerRequired;
    }
}

// Size counterpart of programPipelineSelect(): sizeof(PIPELINE_SELECT) under the
// same condition, otherwise 0.
template
inline size_t CommandStreamReceiverHw::getCmdSizeForPipelineSelect() const {
    if (csrSizeRequestFlags.mediaSamplerConfigChanged || !isPreambleSent) {
        return sizeof(typename GfxFamily::PIPELINE_SELECT);
    }
    return 0;
}

// Finalises one task and either submits it or records it for batched submission.
//
// Parameters:
//   commandStreamTask      - stream holding the task's commands; must not be the
//                            receiver's own commandStream (debug-checked).
//   commandStreamStartTask - offset in commandStreamTask where this task starts.
//   dsh / ioh / ssh        - heaps used by the task; used for state base address
//                            programming and made resident.
//   taskLevel              - level of the task; a pipe control is inserted when it
//                            is greater than the receiver's current level.
//   dispatchFlags          - per-dispatch options. NOTE: this function writes to
//                            it (dcFlush, useSLM, implicitFlush may be set).
//
// Outline, in execution order:
//   1. Optionally append an epilogue PIPE_CONTROL with a tag write of
//      taskCount + 1 to the task stream.
//   2. Compute which pieces of state changed and program them into the CSR
//      stream (preemption, coherency, L3, pipeline select, preamble, media
//      sampler, thread arbitration, VFE state, state base address, sampler
//      cache flush WA, level-dependency pipe control).
//   3. Make the required allocations resident.
//   4. Terminate / chain the streams and build the BatchBuffer.
//   5. Flush immediately (ImmediateDispatch) or record a CommandBuffer in the
//      submission aggregator (otherwise), possibly followed by
//      flushBatchedSubmissions().
//
// Returns a CompletionStamp holding the incremented taskCount, the receiver's
// taskLevel as it was before the optional level-close increment, the current
// flush stamp value, 0, and the device engine type.
template
CompletionStamp CommandStreamReceiverHw::flushTask(
    LinearStream &commandStreamTask,
    size_t commandStreamStartTask,
    const IndirectHeap &dsh,
    const IndirectHeap &ioh,
    const IndirectHeap &ssh,
    uint32_t taskLevel,
    DispatchFlags &dispatchFlags) {
    typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START;
    typedef typename GfxFamily::MI_BATCH_BUFFER_END MI_BATCH_BUFFER_END;
    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
    typedef typename GfxFamily::STATE_BASE_ADDRESS STATE_BASE_ADDRESS;

    DEBUG_BREAK_IF(&commandStreamTask == &commandStream);
    // If the dispatch asks for preemption disabled, the device must have it disabled too.
    DEBUG_BREAK_IF(!(dispatchFlags.preemptionMode == PreemptionMode::Disabled ? getMemoryManager()->device->getPreemptionMode() == PreemptionMode::Disabled : true));
    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);

    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskLevel", taskLevel);

    auto levelClosed = false;
    void *currentPipeControlForNooping = nullptr;
    void *epiloguePipeControlLocation = nullptr;
    Device *device = this->getMemoryManager()->device;

    if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) {
        if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
            //for ImmediateDispatch we will send this right away, therefore this pipe control will close the level
            //for BatchedSubmissions it will be nooped and only last ppc in batch will be emitted.
            levelClosed = true;
            //if we guard with ppc, flush dc as well to speed up completion latency
            if (dispatchFlags.guardCommandBufferWithPipeControl) {
                dispatchFlags.dcFlush = true;
            }
        }

        // Recorded before anything is appended, so it points at the first byte
        // of the commands emitted below (WA, if any, followed by the pipe control).
        epiloguePipeControlLocation = ptrOffset(commandStreamTask.getCpuBase(), commandStreamTask.getUsed());

        // Only a pipe control without DC flush in an out-of-order capable
        // dispatch is a candidate for being zeroed by flushBatchedSubmissions().
        if (dispatchFlags.outOfOrderExecutionAllowed && !dispatchFlags.dcFlush) {
            currentPipeControlForNooping = epiloguePipeControlLocation;
        }

        //Some architectures (SKL) require a pipe control prior to the pipe control with tag write, add it here
        addPipeControlWA(commandStreamTask, dispatchFlags.dcFlush);

        auto pCmd = addPipeControlCmd(commandStreamTask);
        pCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);

        //Some architectures (BDW) require at least one flush bit to be set
        addDcFlushToPipeControl(pCmd, dispatchFlags.dcFlush);

        if (DebugManager.flags.FlushAllCaches.get()) {
            pCmd->setDcFlushEnable(true);
            pCmd->setRenderTargetCacheFlushEnable(true);
            pCmd->setInstructionCacheInvalidateEnable(true);
            pCmd->setTextureCacheInvalidationEnable(true);
            pCmd->setPipeControlFlushEnable(true);
            pCmd->setVfCacheInvalidationEnable(true);
            pCmd->setConstantCacheInvalidationEnable(true);
            pCmd->setStateCacheInvalidationEnable(true);
        }

        // Post-sync write: store taskCount + 1 at the tag address (split into
        // high and low 32-bit halves for the command fields).
        auto address = reinterpret_cast(getTagAddress());
        pCmd->setAddressHigh(address >> 32);
        pCmd->setAddress(address & (0xffffffff));
        pCmd->setImmediateData(taskCount + 1);

        this->latestSentTaskCount = taskCount + 1;
        DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", taskCount);
        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            // NOTE(review): the offsets (used - 16, used - 8) presumably address the
            // last two 64-bit fields of the pipe control just written - confirm
            // against the PIPE_CONTROL layout.
            flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, PatchInfoAllocationType::TagAddress, commandStreamTask.getGraphicsAllocation()->getGpuAddress(), commandStreamTask.getUsed() - 2 * sizeof(uint64_t), PatchInfoAllocationType::Default));
            flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, PatchInfoAllocationType::TagValue, commandStreamTask.getGraphicsAllocation()->getGpuAddress(), commandStreamTask.getUsed() - sizeof(uint64_t), PatchInfoAllocationType::Default));
        }
    }

    // Debug overrides.
    if (DebugManager.flags.ForceSLML3Config.get()) {
        dispatchFlags.useSLM = true;
    }
    if (DebugManager.flags.OverrideThreadArbitrationPolicy.get() != -1) {
        requestThreadArbitrationPolicy(static_cast(DebugManager.flags.OverrideThreadArbitrationPolicy.get()));
    }

    auto newL3Config = PreambleHelper::getL3Config(peekHwInfo(), dispatchFlags.useSLM);

    // These flags are consumed by the getCmdSizeFor*() / program*() helpers, so
    // they have to be set before the CSR stream size is requested below.
    csrSizeRequestFlags.l3ConfigChanged = this->lastSentL3Config != newL3Config;
    csrSizeRequestFlags.coherencyRequestChanged = this->lastSentCoherencyRequest != static_cast(dispatchFlags.requiresCoherency);
    csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode;
    csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast(dispatchFlags.mediaSamplerRequired);

    auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags));
    auto commandStreamStartCSR = commandStreamCSR.getUsed();

    initPageTableManagerRegisters(commandStreamCSR);
    programPreemption(commandStreamCSR, dispatchFlags);
    programCoherency(commandStreamCSR, dispatchFlags);
    programL3(commandStreamCSR, dispatchFlags, newL3Config);
    programPipelineSelect(commandStreamCSR, dispatchFlags);
    programPreamble(commandStreamCSR, dispatchFlags, newL3Config);
    programMediaSampler(commandStreamCSR, dispatchFlags);

    size_t requiredScratchSizeInBytes = requiredScratchSize * (hwInfo.pSysInfo->MaxSubSlicesSupported * hwInfo.pSysInfo->MaxEuPerSubSlice * hwInfo.pSysInfo->ThreadCount / hwInfo.pSysInfo->EUCount);

    auto force32BitAllocations = getMemoryManager()->peekForce32BitAllocations();

    bool stateBaseAddressDirty = false;

    // (Re)allocate scratch space when it is needed and missing or too small.
    if (requiredScratchSize && (!scratchAllocation || scratchAllocation->getUnderlyingBufferSize() < requiredScratchSizeInBytes)) {
        if (scratchAllocation) {
            // Hand the old allocation to the memory manager as a temporary
            // allocation stamped with the current task count.
            scratchAllocation->taskCount = this->taskCount;
            getMemoryManager()->storeAllocation(std::unique_ptr(scratchAllocation), TEMPORARY_ALLOCATION);
        }
        scratchAllocation = getMemoryManager()->createGraphicsAllocationWithRequiredBitness(requiredScratchSizeInBytes, nullptr);
        overrideMediaVFEStateDirty(true);
        if (is64bit && !force32BitAllocations) {
            stateBaseAddressDirty = true;
        }
    }

    if (this->lastSentThreadArbitrationPolicy != this->requiredThreadArbitrationPolicy) {
        PreambleHelper::programThreadArbitration(&commandStreamCSR, this->requiredThreadArbitrationPolicy);
        this->lastSentThreadArbitrationPolicy = this->requiredThreadArbitrationPolicy;
    }

    // With forced 32-bit allocations, a change in whether the 32-bit general
    // state base is required also forces state base address reprogramming.
    stateBaseAddressDirty |= ((GSBAFor32BitProgrammed ^ dispatchFlags.GSBA32BitRequired) && force32BitAllocations);

    programVFEState(commandStreamCSR, dispatchFlags);

    // All three updateAndCheck() calls are made unconditionally (no short-circuit).
    bool dshDirty = dshState.updateAndCheck(&dsh);
    bool iohDirty = iohState.updateAndCheck(&ioh);
    bool sshDirty = sshState.updateAndCheck(&ssh);

    auto isStateBaseAddressDirty = dshDirty || iohDirty || sshDirty || stateBaseAddressDirty;

    // disableL3Cache is a one-shot request: it is consumed (reset) here.
    auto requiredL3Index = CacheSettings::l3CacheOn;
    if (this->disableL3Cache) {
        requiredL3Index = CacheSettings::l3CacheOff;
        this->disableL3Cache = false;
    }

    if (requiredL3Index != latestSentStatelessMocsConfig) {
        isStateBaseAddressDirty = true;
    }

    //Reprogram state base address if required
    if (isStateBaseAddressDirty) {
        auto pCmd = addPipeControlCmd(commandStreamCSR);
        pCmd->setTextureCacheInvalidationEnable(true);
        pCmd->setDcFlushEnable(true);

        uint64_t newGSHbase = 0;
        GSBAFor32BitProgrammed = false;
        if (is64bit && scratchAllocation && !force32BitAllocations) {
            newGSHbase = (uint64_t)scratchAllocation->getUnderlyingBuffer() - PreambleHelper::getScratchSpaceOffsetFor64bit();
        } else if (is64bit && force32BitAllocations && dispatchFlags.GSBA32BitRequired) {
            newGSHbase = memoryManager->allocator32Bit->getBase();
            GSBAFor32BitProgrammed = true;
        }

        // Offset of the STATE_BASE_ADDRESS command, needed for patch info below.
        auto stateBaseAddressCmdOffset = commandStreamCSR.getUsed();

        StateBaseAddressHelper::programStateBaseAddress(
            commandStreamCSR,
            dsh,
            ioh,
            ssh,
            newGSHbase,
            requiredL3Index,
            memoryManager->getInternalHeapBaseAddress());

        latestSentStatelessMocsConfig = requiredL3Index;

        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            collectStateBaseAddresPatchInfo(commandStream.getGraphicsAllocation()->getGpuAddress(), stateBaseAddressCmdOffset, dsh, ioh, ssh, newGSHbase);
        }
    }

    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskLevel", (uint32_t)this->taskLevel);

    // Workaround path: emit a texture-cache-invalidating pipe control and advance
    // the samplerCacheFlushRequired state (Before -> After -> NotRequired).
    if (getMemoryManager()->device->getWaTable()->waSamplerCacheFlushBetweenRedescribedSurfaceReads) {
        if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) {
            auto pCmd = addPipeControlCmd(commandStreamCSR);
            pCmd->setTextureCacheInvalidationEnable(true);
            if (this->samplerCacheFlushRequired == SamplerCacheFlushState::samplerCacheFlushBefore) {
                this->samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushAfter;
            } else {
                this->samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired;
            }
        }
    }

    // Add a PC if we have a dependency on a previous walker to avoid concurrency issues.
    if (taskLevel > this->taskLevel) {
        addPipeControl(commandStreamCSR, false);
        this->taskLevel = taskLevel;
        DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskCount", this->taskCount);
    }

    auto dshAllocation = dsh.getGraphicsAllocation();
    auto iohAllocation = ioh.getGraphicsAllocation();
    auto sshAllocation = ssh.getGraphicsAllocation();

    // Residency. Note that only the DSH and IOH allocations are marked
    // non-evictable here; the SSH allocation is not.
    this->makeResident(*dshAllocation);
    dshAllocation->setEvictable(false);
    this->makeResident(*iohAllocation);
    this->makeResident(*sshAllocation);
    iohAllocation->setEvictable(false);

    this->makeResident(*tagAllocation);

    // NOTE(review): scratchAllocation is dereferenced without a null check;
    // this relies on the allocation above having succeeded - confirm.
    if (requiredScratchSize)
        makeResident(*scratchAllocation);

    if (preemptionCsrAllocation)
        makeResident(*preemptionCsrAllocation);

    if (dispatchFlags.preemptionMode == PreemptionMode::MidThread) {
        auto sipType = SipKernel::getSipKernelType(device->getHardwareInfo().pPlatform->eRenderCoreFamily, device->isSourceLevelDebuggerActive());
        makeResident(*BuiltIns::getInstance().getSipKernel(sipType, *device).getSipAllocation());
    }

    // If the CSR has work in its CS, flush it before the task
    bool submitTask = commandStreamStartTask != commandStreamTask.getUsed();
    bool submitCSR = commandStreamStartCSR != commandStreamCSR.getUsed();
    bool submitCommandStreamFromCsr = false;
    void *bbEndLocation = nullptr;
    // Outside ImmediateDispatch, space is left after the batch buffer end so the
    // slot is as large as an MI_BATCH_BUFFER_START: flushBatchedSubmissions()
    // later writes a batch buffer start at batchBufferEndLocation to chain buffers.
    auto bbEndPaddingSize = this->dispatchMode == DispatchMode::ImmediateDispatch ? 0 : sizeof(MI_BATCH_BUFFER_START) - sizeof(MI_BATCH_BUFFER_END);
    size_t chainedBatchBufferStartOffset = 0;
    GraphicsAllocation *chainedBatchBuffer = nullptr;

    if (submitTask) {
        this->addBatchBufferEnd(commandStreamTask, &bbEndLocation);
        this->emitNoop(commandStreamTask, bbEndPaddingSize);
        this->alignToCacheLine(commandStreamTask);

        if (submitCSR) {
            chainedBatchBufferStartOffset = commandStreamCSR.getUsed();
            chainedBatchBuffer = commandStreamTask.getGraphicsAllocation();
            // Add MI_BATCH_BUFFER_START to chain from CSR -> Task
            auto pBBS = reinterpret_cast(commandStreamCSR.getSpace(sizeof(MI_BATCH_BUFFER_START)));
            addBatchBufferStart(pBBS, ptrOffset(commandStreamTask.getGraphicsAllocation()->getGpuAddress(), commandStreamStartTask));
            if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) {
                // NOTE(review): bbEndLocation is a CPU pointer (from getSpace) while
                // the second ptrDiff operand is a GPU address - verify this is intended.
                flatBatchBufferHelper->registerCommandChunk(commandStreamTask.getGraphicsAllocation()->getGpuAddress(),
                                                            reinterpret_cast(commandStreamTask.getCpuBase()),
                                                            commandStreamStartTask,
                                                            static_cast(ptrDiff(bbEndLocation, commandStreamTask.getGraphicsAllocation()->getGpuAddress())) + sizeof(MI_BATCH_BUFFER_START));
            }

            auto commandStreamAllocation = commandStreamTask.getGraphicsAllocation();
            DEBUG_BREAK_IF(commandStreamAllocation == nullptr);

            this->makeResident(*commandStreamAllocation);
            this->alignToCacheLine(commandStreamCSR);
            submitCommandStreamFromCsr = true;
        }
    } else if (submitCSR) {
        // Only CSR-side commands exist: terminate the CSR stream itself.
        this->addBatchBufferEnd(commandStreamCSR, &bbEndLocation);
        this->emitNoop(commandStreamCSR, bbEndPaddingSize);
        this->alignToCacheLine(commandStreamCSR);
        DEBUG_BREAK_IF(commandStreamCSR.getUsed() > commandStreamCSR.getMaxAvailableSpace());
        submitCommandStreamFromCsr = true;
    }

    size_t startOffset = submitCommandStreamFromCsr ? commandStreamStartCSR : commandStreamStartTask;
    auto &streamToSubmit = submitCommandStreamFromCsr ? commandStreamCSR : commandStreamTask;
    BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, chainedBatchBuffer, dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, streamToSubmit.getUsed(), &streamToSubmit};
    EngineType engineType = device->getEngineType();

    if (submitCSR | submitTask) {
        if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
            flushStamp->setStamp(this->flush(batchBuffer, engineType, nullptr));
            this->latestFlushedTaskCount = this->taskCount + 1;
            this->makeSurfacePackNonResident(nullptr);
        } else {
            // Record the submission for later; the memory manager's current
            // residency list is moved into the command buffer via swap().
            auto commandBuffer = new CommandBuffer;
            commandBuffer->batchBuffer = batchBuffer;
            commandBuffer->surfaces.swap(getMemoryManager()->getResidencyAllocations());
            commandBuffer->batchBufferEndLocation = bbEndLocation;
            commandBuffer->taskCount = this->taskCount + 1;
            commandBuffer->flushStamp->replaceStampObject(dispatchFlags.flushStampReference);
            commandBuffer->pipeControlThatMayBeErasedLocation = currentPipeControlForNooping;
            commandBuffer->epiloguePipeControlLocation = epiloguePipeControlLocation;
            // NOTE(review): presumably the aggregator takes ownership of the raw
            // pointer - confirm in SubmissionAggregator::recordCommandBuffer.
            this->submissionAggregator->recordCommandBuffer(commandBuffer);
        }
    } else {
        this->makeSurfacePackNonResident(nullptr);
    }

    //check if we are not over the budget, if we are do implicit flush
    if (getMemoryManager()->isMemoryBudgetExhausted()) {
        if (this->totalMemoryUsed >= device->getDeviceInfo().globalMemSize / 4) {
            dispatchFlags.implicitFlush = true;
        }
    }

    if (this->dispatchMode == DispatchMode::BatchedDispatch && (dispatchFlags.blocking || dispatchFlags.implicitFlush)) {
        this->flushBatchedSubmissions();
    }

    ++taskCount;
    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", taskCount);
    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", tagAddress ? *tagAddress : 0);

    // The stamp captures taskLevel before it is bumped for a closed level below.
    CompletionStamp completionStamp = {
        taskCount,
        this->taskLevel,
        flushStamp->peekStamp(),
        0,
        engineType};

    this->taskLevel += levelClosed ? 1 : 0;

    if (gtpinIsGTPinInitialized()) {
        gtpinNotifyFlushTask(completionStamp.taskCount);
    }

    return completionStamp;
}

// Submits every command buffer recorded in the submission aggregator.
// Does nothing in ImmediateDispatch mode. Otherwise, while holding ownership of
// the device, it repeatedly:
//   - asks the aggregator to aggregate command buffers (budget: half of the
//     device's globalMemSize),
//   - takes the front buffer as the primary one and chains every following
//     buffer that shares its inspectionId by overwriting the previous buffer's
//     batch-buffer-end slot with an MI_BATCH_BUFFER_START,
//   - zeroes intermediate pipe controls that were marked as erasable,
//   - forces DC flush on the last epilogue pipe control,
//   - flushes the primary batch buffer, bumps taskLevel and updates flush stamps.
// totalMemoryUsed is reset to 0 once the list has been drained.
template
inline void CommandStreamReceiverHw::flushBatchedSubmissions() {
    if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
        return;
    }
    typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START;
    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
    Device *device = this->getMemoryManager()->device;
    TakeOwnershipWrapper deviceOwnership(*device);
    EngineType engineType = device->getEngineType();

    auto &commandBufferList = this->submissionAggregator->peekCmdBufferList();
    if (!commandBufferList.peekIsEmpty()) {

        ResidencyContainer surfacesForSubmit;
        ResourcePackage resourcePackage;
        auto pipeControlLocationSize = getRequiredPipeControlSize();
        void *currentPipeControlForNooping = nullptr;
        void *epiloguePipeControlLocation = nullptr;

        while (!commandBufferList.peekIsEmpty()) {
            size_t totalUsedSize = 0u;
            this->submissionAggregator->aggregateCommandBuffers(resourcePackage, totalUsedSize, (size_t)device->getDeviceInfo().globalMemSize * 5 / 10);
            auto primaryCmdBuffer = commandBufferList.removeFrontOne();
            auto nextCommandBuffer = commandBufferList.peekHead();
            auto currentBBendLocation = primaryCmdBuffer->batchBufferEndLocation;
            auto lastTaskCount = primaryCmdBuffer->taskCount;

            FlushStampUpdateHelper flushStampUpdateHelper;
            flushStampUpdateHelper.insert(primaryCmdBuffer->flushStamp->getStampReference());

            currentPipeControlForNooping = primaryCmdBuffer->pipeControlThatMayBeErasedLocation;
            epiloguePipeControlLocation = primaryCmdBuffer->epiloguePipeControlLocation;

            if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) {
                flatBatchBufferHelper->registerCommandChunk(primaryCmdBuffer.get()->batchBuffer, sizeof(MI_BATCH_BUFFER_START));
            }

            // Chain all following buffers that were aggregated with the primary one.
            while (nextCommandBuffer && nextCommandBuffer->inspectionId == primaryCmdBuffer->inspectionId) {
                //noop pipe control
                if (currentPipeControlForNooping) {
                    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
                        flatBatchBufferHelper->removePipeControlData(pipeControlLocationSize, currentPipeControlForNooping);
                    }
                    memset(currentPipeControlForNooping, 0, pipeControlLocationSize);
                }
                //obtain next candidate for nooping
                currentPipeControlForNooping = nextCommandBuffer->pipeControlThatMayBeErasedLocation;
                //track epilogue pipe control
                epiloguePipeControlLocation = nextCommandBuffer->epiloguePipeControlLocation;

                flushStampUpdateHelper.insert(nextCommandBuffer->flushStamp->getStampReference());
                // NOTE(review): the chain target is derived from getUnderlyingBuffer()
                // (a CPU pointer) rather than a GPU address - presumably the two
                // coincide for these allocations; confirm.
                auto nextCommandBufferAddress = nextCommandBuffer->batchBuffer.commandBufferAllocation->getUnderlyingBuffer();
                auto offsetedCommandBuffer = (uint64_t)ptrOffset(nextCommandBufferAddress, nextCommandBuffer->batchBuffer.startOffset);
                // Overwrite the previous buffer's end slot with a jump to the next buffer.
                addBatchBufferStart((MI_BATCH_BUFFER_START *)currentBBendLocation, offsetedCommandBuffer);
                if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) {
                    flatBatchBufferHelper->registerCommandChunk(nextCommandBuffer->batchBuffer, sizeof(MI_BATCH_BUFFER_START));
                }
                currentBBendLocation = nextCommandBuffer->batchBufferEndLocation;
                lastTaskCount = nextCommandBuffer->taskCount;
                // Advance before removing the front element from the list; the
                // value returned by removeFrontOne() is discarded here.
                nextCommandBuffer = nextCommandBuffer->next;

                commandBufferList.removeFrontOne();
            }
            surfacesForSubmit.reserve(resourcePackage.size() + 1);
            for (auto &surface : resourcePackage) {
                surfacesForSubmit.push_back(surface);
            }

            //make sure we flush DC
            if (epiloguePipeControlLocation) {
                ((PIPE_CONTROL *)epiloguePipeControlLocation)->setDcFlushEnable(true);
            }

            // Local flushStamp (the returned stamp value) shadows the member of
            // the same name; the member is updated explicitly below.
            auto flushStamp = this->flush(primaryCmdBuffer->batchBuffer, engineType, &surfacesForSubmit);

            //after flush task level is closed
            this->taskLevel++;

            flushStampUpdateHelper.updateAll(flushStamp);

            this->latestFlushedTaskCount = lastTaskCount;
            this->flushStamp->setStamp(flushStamp);
            this->makeSurfacePackNonResident(&surfacesForSubmit);
            resourcePackage.clear();
        }
        this->totalMemoryUsed = 0;
    }
}

// Appends a stalling PIPE_CONTROL (preceded by the pipe control workaround) to
// commandStream.
//   dcFlush - forwarded to addPipeControlWA() only. Note that
//             addDcFlushToPipeControl() is always called with `true` here,
//             regardless of this argument.
// When the FlushAllCaches debug flag is set, all cache flush/invalidate bits are
// enabled on the command.
template
void CommandStreamReceiverHw::addPipeControl(LinearStream &commandStream, bool dcFlush) {
    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
    addPipeControlWA(commandStream, dcFlush);

    // Add a PIPE_CONTROL w/ CS_stall
    auto pCmd = reinterpret_cast(commandStream.getSpace(sizeof(PIPE_CONTROL)));
    *pCmd = GfxFamily::cmdInitPipeControl;
    pCmd->setCommandStreamerStallEnable(true);
    //Some architectures (BDW) require at least one flush bit to be set
    addDcFlushToPipeControl(pCmd, true);

    if (DebugManager.flags.FlushAllCaches.get()) {
        pCmd->setDcFlushEnable(true);
        pCmd->setRenderTargetCacheFlushEnable(true);
        pCmd->setInstructionCacheInvalidateEnable(true);
        pCmd->setTextureCacheInvalidationEnable(true);
        pCmd->setPipeControlFlushEnable(true);
        pCmd->setVfCacheInvalidationEnable(true);
        pCmd->setConstantCacheInvalidationEnable(true);
        pCmd->setStateCacheInvalidationEnable(true);
    }
}

// Returns the scratch address to program into Media VFE State:
//   - 0 when no scratch space is required,
//   - PreambleHelper's scratch-space offset for 64-bit when running 64-bit
//     without forced 32-bit allocations,
//   - otherwise the scratch allocation's GPU address to patch.
template
uint64_t CommandStreamReceiverHw::getScratchPatchAddress() {
    //for 32 bit scratch space pointer is being programmed in Media VFE State and is relative to 0 as General State Base Address
    //for 64 bit, scratch space pointer is being programmed as "General State Base Address - scratchSpaceOffsetFor64bit"
    //            and "0 + scratchSpaceOffsetFor64bit" is being programmed in Media VFE state
    uint64_t scratchAddress = 0;
    if (requiredScratchSize) {
        scratchAddress = scratchAllocation->getGpuAddressToPatch();
        if (is64bit && !getMemoryManager()->peekForce32BitAllocations()) {
            //this is to avoid scratch allocation offset "0"
            scratchAddress = PreambleHelper::getScratchSpaceOffsetFor64bit();
        }
    }
    return scratchAddress;
}

// getRequiredCmdStreamSize() rounded up to a multiple of the cache line size.
template
size_t CommandStreamReceiverHw::getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags) {
    size_t size = getRequiredCmdStreamSize(dispatchFlags);
    return alignUp(size, MemoryConstants::cacheLineSize);
}

// Returns the number of bytes reserved in the CSR stream for one flushTask()
// call: preamble, STATE_BASE_ADDRESS plus its preceding PIPE_CONTROL, the
// required pipe control size, one MI_BATCH_BUFFER_START, and the conditional
// L3 / coherency / media sampler / pipeline select / preemption commands, plus
// one PIPE_CONTROL when the sampler cache flush workaround is pending.
// Relies on csrSizeRequestFlags having been updated by the caller.
template
size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const DispatchFlags &dispatchFlags) {
    size_t size = getRequiredCmdSizeForPreamble();
    size += sizeof(typename GfxFamily::STATE_BASE_ADDRESS) + sizeof(PIPE_CONTROL);
    size += getRequiredPipeControlSize();
    size += sizeof(typename GfxFamily::MI_BATCH_BUFFER_START);

    size += getCmdSizeForL3Config();
    size += getCmdSizeForCoherency();
    size += getCmdSizeForMediaSampler(dispatchFlags.mediaSamplerRequired);
    size += getCmdSizeForPipelineSelect();
    size += getCmdSizeForPreemption(dispatchFlags);

    if (getMemoryManager()->device->getWaTable()->waSamplerCacheFlushBetweenRedescribedSurfaceReads) {
        if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) {
            size += sizeof(typename GfxFamily::PIPE_CONTROL);
        }
    }
    return size;
}

// Appends bytesToUpdate zero bytes to commandStream; does nothing for 0.
template
inline void CommandStreamReceiverHw::emitNoop(LinearStream &commandStream, size_t bytesToUpdate) {
    if (bytesToUpdate) {
        auto ptr = commandStream.getSpace(bytesToUpdate);
        memset(ptr, 0, bytesToUpdate);
    }
}

// Waits until the tag value reaches taskCountToWait.
//   taskCountToWait  - task count to wait for.
//   flushStampToWait - flush stamp used for the fallback wait.
//   useQuickKmdSleep - forwarded to KmdNotifyHelper when choosing timeout params.
// First waits with the timeout parameters chosen by kmdNotifyHelper; if that
// wait reports failure, waits on the flush stamp and then performs a wait
// without timeout. Aborts (UNRECOVERABLE_IF) if the tag is still below the
// requested count afterwards.
template
inline void CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
    int64_t waitTimeout = 0;
    bool enableTimeout = kmdNotifyHelper->obtainTimeoutParams(waitTimeout, useQuickKmdSleep, *getTagAddress(), taskCountToWait, flushStampToWait);

    auto status = waitForCompletionWithTimeout(enableTimeout, waitTimeout, taskCountToWait);
    if (!status) {
        waitForFlushStamp(flushStampToWait);
        //now call blocking wait, this is to ensure that task count is reached
        waitForCompletionWithTimeout(false, 0, taskCountToWait);
    }
    UNRECOVERABLE_IF(*getTagAddress() < taskCountToWait);

    if (kmdNotifyHelper->quickKmdSleepForSporadicWaitsEnabled()) {
        kmdNotifyHelper->updateLastWaitForCompletionTimestamp();
    }
}

// Delegates preemption programming to PreemptionHelper (passing the requested
// and the previously programmed mode) and records the requested mode as the
// last one programmed.
template
inline void CommandStreamReceiverHw::programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags) {
    PreemptionHelper::programCmdStream(csr, dispatchFlags.preemptionMode, this->lastPreemptionMode, preemptionCsrAllocation, *memoryManager->device);
    this->lastPreemptionMode = dispatchFlags.preemptionMode;
}

// Size counterpart of programPreemption(), as reported by PreemptionHelper.
template
inline size_t CommandStreamReceiverHw::getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const {
    return PreemptionHelper::getRequiredCmdStreamSize(dispatchFlags.preemptionMode, this->lastPreemptionMode);
}

// Reprograms the L3 configuration when it changed and the preamble was already
// sent: emits a stalling PIPE_CONTROL with DC flush, then the L3 programming,
// and records newL3Config as the last sent value. When the preamble has not
// been sent yet nothing is emitted here (programPreamble() receives newL3Config
// and records it in that case). dispatchFlags is not used.
template
inline void CommandStreamReceiverHw::programL3(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t &newL3Config) {
    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
    if (csrSizeRequestFlags.l3ConfigChanged && this->isPreambleSent) {
        // Add a PIPE_CONTROL w/ CS_stall
        auto pCmd = (PIPE_CONTROL *)csr.getSpace(sizeof(PIPE_CONTROL));
        *pCmd = GfxFamily::cmdInitPipeControl;
        pCmd->setCommandStreamerStallEnable(true);
        pCmd->setDcFlushEnable(true);

        PreambleHelper::programL3(&csr, newL3Config);
        this->lastSentL3Config = newL3Config;
    }
}

// Bytes budgeted for L3 programming: one MI_LOAD_REGISTER_IMM before the
// preamble is sent; MI_LOAD_REGISTER_IMM plus PIPE_CONTROL when the config
// changed afterwards; otherwise 0.
template
inline size_t CommandStreamReceiverHw::getCmdSizeForL3Config() const {
    if (!this->isPreambleSent) {
        return sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
    } else if (csrSizeRequestFlags.l3ConfigChanged) {
        return sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) + sizeof(typename GfxFamily::PIPE_CONTROL);
    }
    return 0;
}

// Programs the preamble once per receiver: on the first call it delegates to
// PreambleHelper::programPreamble and records that the preamble, the L3 config
// and the thread arbitration policy have been sent. Later calls are no-ops.
// dispatchFlags is not used.
template
inline void CommandStreamReceiverHw::programPreamble(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t &newL3Config) {
    if (!this->isPreambleSent) {
        PreambleHelper::programPreamble(&csr, *memoryManager->device, newL3Config, this->requiredThreadArbitrationPolicy, this->preemptionCsrAllocation);
        this->isPreambleSent = true;
        this->lastSentL3Config = newL3Config;
        this->lastSentThreadArbitrationPolicy = this->requiredThreadArbitrationPolicy;
    }
}

// Programs the VFE state when mediaVfeStateDirty is set, using the current
// scratch size and scratch patch address, then clears the dirty flag.
// dispatchFlags is not used.
template
inline void CommandStreamReceiverHw::programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags) {
    if (mediaVfeStateDirty) {
        PreambleHelper::programVFEState(&csr, hwInfo, requiredScratchSize, getScratchPatchAddress());
        overrideMediaVFEStateDirty(false);
    }
}

// Generic implementation: emits nothing.
// NOTE(review): presumably specialised for families that need media sampler
// programming - confirm.
template
void CommandStreamReceiverHw::programMediaSampler(LinearStream &commandStream, DispatchFlags &dispatchFlags) {
}

// Size counterpart of the generic programMediaSampler(): always 0.
template
size_t CommandStreamReceiverHw::getCmdSizeForMediaSampler(bool mediaSamplerRequired) const {
    return 0;
}

// Registers patch info for the four base addresses of a STATE_BASE_ADDRESS
// command with flatBatchBufferHelper.
//   baseAddress      - GPU address of the allocation containing the command.
//   commandOffset    - offset of the STATE_BASE_ADDRESS command in that allocation.
//   dsh / ioh / ssh  - heaps whose GPU addresses are the patch sources.
//   generalStateBase - value used as the general state base address.
// Each entry targets commandOffset plus the matching *_BYTEOFFSET constant from
// STATE_BASE_ADDRESS::PATCH_CONSTANTS. Entries are registered in the order:
// dynamic state, general state, surface state, indirect object.
template
void CommandStreamReceiverHw::collectStateBaseAddresPatchInfo(
    uint64_t baseAddress,
    uint64_t commandOffset,
    const LinearStream &dsh,
    const LinearStream &ioh,
    const LinearStream &ssh,
    uint64_t generalStateBase) {

    typedef typename GfxFamily::STATE_BASE_ADDRESS STATE_BASE_ADDRESS;

    PatchInfoData dynamicStatePatchInfo = {dsh.getGraphicsAllocation()->getGpuAddress(), 0u, PatchInfoAllocationType::DynamicStateHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::DYNAMICSTATEBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default};
    PatchInfoData generalStatePatchInfo = {generalStateBase, 0u, PatchInfoAllocationType::GeneralStateHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::GENERALSTATEBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default};
    PatchInfoData surfaceStatePatchInfo = {ssh.getGraphicsAllocation()->getGpuAddress(), 0u, PatchInfoAllocationType::SurfaceStateHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::SURFACESTATEBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default};
    PatchInfoData indirectObjectPatchInfo = {ioh.getGraphicsAllocation()->getGpuAddress(), 0u, PatchInfoAllocationType::IndirectObjectHeap, baseAddress, commandOffset + STATE_BASE_ADDRESS::PATCH_CONSTANTS::INDIRECTOBJECTBASEADDRESS_BYTEOFFSET, PatchInfoAllocationType::Default};

    flatBatchBufferHelper->setPatchInfoData(dynamicStatePatchInfo);
    flatBatchBufferHelper->setPatchInfoData(generalStatePatchInfo);
    flatBatchBufferHelper->setPatchInfoData(surfaceStatePatchInfo);
    flatBatchBufferHelper->setPatchInfoData(indirectObjectPatchInfo);
}

// Replaces the KMD notify helper with newHelper (kmdNotifyHelper.reset takes
// the raw pointer), refreshes its AC line status and, when quick KMD sleep for
// sporadic waits is enabled, updates the last wait-for-completion timestamp.
// newHelper must be non-null: it is dereferenced right after the reset.
template
void CommandStreamReceiverHw::resetKmdNotifyHelper(KmdNotifyHelper *newHelper) {
    kmdNotifyHelper.reset(newHelper);
    kmdNotifyHelper->updateAcLineStatus();
    if (kmdNotifyHelper->quickKmdSleepForSporadicWaitsEnabled()) {
        kmdNotifyHelper->updateLastWaitForCompletionTimestamp();
    }
}

} // namespace OCLRT