From 10d610d16336175ed5c9120052f4c5c3182343a1 Mon Sep 17 00:00:00 2001 From: Kamil Kopryk Date: Fri, 23 Feb 2024 04:57:45 +0000 Subject: [PATCH] refactor: move process barrier with post sync to a function Related-To: NEO-7824 Signed-off-by: Kamil Kopryk --- .../command_stream_receiver_hw.h | 8 ++ .../command_stream_receiver_hw_base.inl | 108 ++++++++++-------- 2 files changed, 66 insertions(+), 50 deletions(-) diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index cb360695d1..11ae4926b2 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -241,6 +241,14 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { bool areMultipleSubDevicesInContext, bool setGeneralStateBaseAddress); + inline void processBarrierWithPostSync(LinearStream &commandStreamTask, + DispatchFlags &dispatchFlags, + bool &levelClosed, + void *&currentPipeControlForNooping, + void *&epiloguePipeControlLocation, + bool &hasStallingCmdsOnTaskStream, + PipeControlArgs &args); + inline CompletionStamp updateTaskCountAndGetCompletionStamp(bool levelClosed); inline void programSamplerCacheFlushBetweenRedescribedSurfaceReads(LinearStream &commandStreamCSR); bool bcsRelaxedOrderingAllowed(const BlitPropertiesContainer &blitPropertiesContainer, bool hasStallingCmds) const; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 6fe56d22ac..e0e53a2e2e 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -366,8 +366,6 @@ CompletionStamp CommandStreamReceiverHw::flushTask( using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - auto &rootDeviceEnvironment = 
this->peekRootDeviceEnvironment(); - DEBUG_BREAK_IF(&commandStreamTask == &commandStream); DEBUG_BREAK_IF(!(dispatchFlags.preemptionMode == PreemptionMode::Disabled ? device.getPreemptionMode() == PreemptionMode::Disabled : true)); DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); @@ -393,55 +391,9 @@ CompletionStamp CommandStreamReceiverHw::flushTask( bool hasStallingCmdsOnTaskStream = false; if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl || this->heapStorageRequiresRecyclingTag) { - if (this->dispatchMode == DispatchMode::immediateDispatch) { - // for ImmediateDispatch we will send this right away, therefore this pipe control will close the level - // for BatchedSubmissions it will be nooped and only last ppc in batch will be emitted. - levelClosed = true; - // if we guard with ppc, flush dc as well to speed up completion latency - if (dispatchFlags.guardCommandBufferWithPipeControl || this->heapStorageRequiresRecyclingTag || dispatchFlags.blocking) { - dispatchFlags.dcFlush = this->dcFlushSupport; - } - } - this->heapStorageRequiresRecyclingTag = false; - epiloguePipeControlLocation = ptrOffset(commandStreamTask.getCpuBase(), commandStreamTask.getUsed()); - - if ((dispatchFlags.outOfOrderExecutionAllowed || timestampPacketWriteEnabled) && - !dispatchFlags.dcFlush) { - currentPipeControlForNooping = epiloguePipeControlLocation; - } - - hasStallingCmdsOnTaskStream = true; - - auto address = getTagAllocation()->getGpuAddress(); - - args.dcFlushEnable = getDcFlushRequired(dispatchFlags.dcFlush); - args.notifyEnable = isUsedNotifyEnableForPostSync(); - args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; - args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; - args.workloadPartitionOffset = isMultiTileOperationEnabled(); - args.stateCacheInvalidationEnable = dispatchFlags.stateCacheInvalidation; - MemorySynchronizationCommands::addBarrierWithPostSyncOperation( - 
commandStreamTask, - PostSyncMode::immediateData, - address, - taskCount + 1, - rootDeviceEnvironment, - args); - - DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount()); - if (debugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { - flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, - PatchInfoAllocationType::tagAddress, - commandStreamTask.getGraphicsAllocation()->getGpuAddress(), - commandStreamTask.getUsed() - 2 * sizeof(uint64_t), - PatchInfoAllocationType::defaultType)); - flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, - PatchInfoAllocationType::tagValue, - commandStreamTask.getGraphicsAllocation()->getGpuAddress(), - commandStreamTask.getUsed() - sizeof(uint64_t), - PatchInfoAllocationType::defaultType)); - } + processBarrierWithPostSync(commandStreamTask, dispatchFlags, levelClosed, currentPipeControlForNooping, + epiloguePipeControlLocation, hasStallingCmdsOnTaskStream, args); } this->latestSentTaskCount = taskCount + 1; @@ -1829,6 +1781,62 @@ inline void CommandStreamReceiverHw::programStateBaseAddressCommon( } } +template +inline void CommandStreamReceiverHw::processBarrierWithPostSync(LinearStream &commandStreamTask, DispatchFlags &dispatchFlags, bool &levelClosed, void *&currentPipeControlForNooping, void *&epiloguePipeControlLocation, bool &hasStallingCmdsOnTaskStream, PipeControlArgs &args) { + + if (this->dispatchMode == DispatchMode::immediateDispatch) { + // for ImmediateDispatch we will send this right away, therefore this pipe control will close the level + // for BatchedSubmissions it will be nooped and only last ppc in batch will be emitted. 
+ levelClosed = true; + // if we guard with ppc, flush dc as well to speed up completion latency + if (dispatchFlags.guardCommandBufferWithPipeControl || this->heapStorageRequiresRecyclingTag || dispatchFlags.blocking) { + dispatchFlags.dcFlush = this->dcFlushSupport; + } + } + + this->heapStorageRequiresRecyclingTag = false; + epiloguePipeControlLocation = ptrOffset(commandStreamTask.getCpuBase(), commandStreamTask.getUsed()); + + if ((dispatchFlags.outOfOrderExecutionAllowed || timestampPacketWriteEnabled) && + !dispatchFlags.dcFlush) { + currentPipeControlForNooping = epiloguePipeControlLocation; + } + + hasStallingCmdsOnTaskStream = true; + + auto address = getTagAllocation()->getGpuAddress(); + auto &rootDeviceEnvironment = this->peekRootDeviceEnvironment(); + + args.dcFlushEnable = getDcFlushRequired(dispatchFlags.dcFlush); + args.notifyEnable = isUsedNotifyEnableForPostSync(); + args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; + args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; + args.workloadPartitionOffset = isMultiTileOperationEnabled(); + args.stateCacheInvalidationEnable = dispatchFlags.stateCacheInvalidation; + + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + commandStreamTask, + PostSyncMode::immediateData, + address, + taskCount + 1, + rootDeviceEnvironment, + args); + + DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount()); + if (debugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { + flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, + PatchInfoAllocationType::tagAddress, + commandStreamTask.getGraphicsAllocation()->getGpuAddress(), + commandStreamTask.getUsed() - 2 * sizeof(uint64_t), + PatchInfoAllocationType::defaultType)); + flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, + PatchInfoAllocationType::tagValue, + commandStreamTask.getGraphicsAllocation()->getGpuAddress(), + commandStreamTask.getUsed() - sizeof(uint64_t), 
+ PatchInfoAllocationType::defaultType)); + } +} + template inline CompletionStamp CommandStreamReceiverHw::updateTaskCountAndGetCompletionStamp(bool levelClosed) {