RelaxedOrdering: Improve dependencies tracking

Avoid unnecessary scheduler programming.

Related-To: NEO-7458
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2025-12-21 09:14:47 +08:00 · 2022-11-26 20:10:32 +00:00
parent ad6237478f
commit 3f962bf3e8
23 changed files with 734 additions and 161 deletions
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@@ -615,7 +615,8 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    auto &streamToSubmit = submitCommandStreamFromCsr ? commandStreamCSR : commandStreamTask;
    BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer,
                            dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount,
-                            streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice, (submitCSR || dispatchFlags.hasStallingCmds)};
+                            streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice, (submitCSR || dispatchFlags.hasStallingCmds),
+                            dispatchFlags.hasRelaxedOrderingDependencies};
    streamToSubmit.getGraphicsAllocation()->updateTaskCount(this->taskCount + 1, this->osContext->getContextId());
    streamToSubmit.getGraphicsAllocation()->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId());

@@ -1178,7 +1179,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropertiesCo
    uint64_t taskStartAddress = commandStream.getGpuBase() + commandStreamStart;

    BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, taskStartAddress, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
-                            commandStream.getUsed(), &commandStream, endingCmdPtr, false, false};
+                            commandStream.getUsed(), &commandStream, endingCmdPtr, false, false, false};

    commandStream.getGraphicsAllocation()->updateTaskCount(newTaskCount, this->osContext->getContextId());
    commandStream.getGraphicsAllocation()->updateResidencyTaskCount(newTaskCount, this->osContext->getContextId());
@@ -1290,7 +1291,7 @@ SubmissionStatus CommandStreamReceiverHw<GfxFamily>::flushSmallTask(LinearStream

    BatchBuffer batchBuffer{commandStreamTask.getGraphicsAllocation(), commandStreamStartTask, 0, taskStartAddress,
                            nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
-                            commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false, true};
+                            commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false, true, false};

    this->latestSentTaskCount = taskCount + 1;
    auto submissionStatus = flushHandler(batchBuffer, getResidencyAllocations());
--- a/shared/source/command_stream/csr_definitions.h
+++ b/shared/source/command_stream/csr_definitions.h
@@ -57,36 +57,37 @@ struct DispatchFlags {
                  uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP,
                  bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP,
                  bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush,
-                  bool hasStallingCmds) : csrDependencies(csrDependenciesP),
-                                          barrierTimestampPacketNodes(barrierTimestampPacketNodesP),
-                                          pipelineSelectArgs(pipelineSelectArgsP),
-                                          flushStampReference(flushStampReferenceP),
-                                          throttle(throttleP),
-                                          preemptionMode(preemptionModeP),
-                                          numGrfRequired(numGrfRequiredP),
-                                          l3CacheSettings(l3CacheSettingsP),
-                                          threadArbitrationPolicy(threadArbitrationPolicyP),
-                                          additionalKernelExecInfo(additionalKernelExecInfoP),
-                                          kernelExecutionType(kernelExecutionTypeP),
-                                          memoryCompressionState(memoryCompressionStateP),
-                                          sliceCount(sliceCountP),
-                                          blocking(blockingP),
-                                          dcFlush(dcFlushP),
-                                          useSLM(useSLMP),
-                                          guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP),
-                                          gsba32BitRequired(gsba32BitRequiredP),
-                                          requiresCoherency(requiresCoherencyP),
-                                          lowPriority(lowPriorityP),
-                                          implicitFlush(implicitFlushP),
-                                          outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP),
-                                          epilogueRequired(epilogueRequiredP),
-                                          usePerDssBackedBuffer(usePerDSSbackedBufferP),
-                                          useSingleSubdevice(useSingleSubdeviceP),
-                                          useGlobalAtomics(useGlobalAtomicsP),
-                                          areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP),
-                                          memoryMigrationRequired(memoryMigrationRequiredP),
-                                          textureCacheFlush(textureCacheFlush),
-                                          hasStallingCmds(hasStallingCmds){};
+                  bool hasStallingCmds, bool hasRelaxedOrderingDependencies) : csrDependencies(csrDependenciesP),
+                                                                               barrierTimestampPacketNodes(barrierTimestampPacketNodesP),
+                                                                               pipelineSelectArgs(pipelineSelectArgsP),
+                                                                               flushStampReference(flushStampReferenceP),
+                                                                               throttle(throttleP),
+                                                                               preemptionMode(preemptionModeP),
+                                                                               numGrfRequired(numGrfRequiredP),
+                                                                               l3CacheSettings(l3CacheSettingsP),
+                                                                               threadArbitrationPolicy(threadArbitrationPolicyP),
+                                                                               additionalKernelExecInfo(additionalKernelExecInfoP),
+                                                                               kernelExecutionType(kernelExecutionTypeP),
+                                                                               memoryCompressionState(memoryCompressionStateP),
+                                                                               sliceCount(sliceCountP),
+                                                                               blocking(blockingP),
+                                                                               dcFlush(dcFlushP),
+                                                                               useSLM(useSLMP),
+                                                                               guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP),
+                                                                               gsba32BitRequired(gsba32BitRequiredP),
+                                                                               requiresCoherency(requiresCoherencyP),
+                                                                               lowPriority(lowPriorityP),
+                                                                               implicitFlush(implicitFlushP),
+                                                                               outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP),
+                                                                               epilogueRequired(epilogueRequiredP),
+                                                                               usePerDssBackedBuffer(usePerDSSbackedBufferP),
+                                                                               useSingleSubdevice(useSingleSubdeviceP),
+                                                                               useGlobalAtomics(useGlobalAtomicsP),
+                                                                               areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP),
+                                                                               memoryMigrationRequired(memoryMigrationRequiredP),
+                                                                               textureCacheFlush(textureCacheFlush),
+                                                                               hasStallingCmds(hasStallingCmds),
+                                                                               hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies){};

    CsrDependencies csrDependencies;
    TimestampPacketContainer *barrierTimestampPacketNodes = nullptr;
@@ -119,6 +120,7 @@ struct DispatchFlags {
    bool memoryMigrationRequired = false;
    bool textureCacheFlush = false;
    bool hasStallingCmds = false;
+    bool hasRelaxedOrderingDependencies = false;
    bool disableEUFusion = false;
 };

--- a/shared/source/command_stream/submissions_aggregator.cpp
+++ b/shared/source/command_stream/submissions_aggregator.cpp
@@ -100,14 +100,15 @@ void NEO::SubmissionAggregator::aggregateCommandBuffers(ResourcePackage &resourc

 NEO::BatchBuffer::BatchBuffer(GraphicsAllocation *commandBufferAllocation, size_t startOffset,
                              size_t chainedBatchBufferStartOffset, uint64_t taskStartAddress, GraphicsAllocation *chainedBatchBuffer,
-                              bool requiresCoherency, bool lowPriority,
-                              QueueThrottle throttle, uint64_t sliceCount,
-                              size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice, bool hasStallingCmds)
+                              bool requiresCoherency, bool lowPriority, QueueThrottle throttle, uint64_t sliceCount,
+                              size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice, bool hasStallingCmds,
+                              bool hasRelaxedOrderingDependencies)
    : commandBufferAllocation(commandBufferAllocation), startOffset(startOffset),
      chainedBatchBufferStartOffset(chainedBatchBufferStartOffset), taskStartAddress(taskStartAddress), chainedBatchBuffer(chainedBatchBuffer),
      requiresCoherency(requiresCoherency), low_priority(lowPriority),
      throttle(throttle), sliceCount(sliceCount),
-      usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice), hasStallingCmds(hasStallingCmds) {}
+      usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice), hasStallingCmds(hasStallingCmds),
+      hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies) {}

 NEO::CommandBuffer::CommandBuffer(Device &device) : device(device) {
    flushStamp.reset(new FlushStampTracker(false));
--- a/shared/source/command_stream/submissions_aggregator.h
+++ b/shared/source/command_stream/submissions_aggregator.h
@@ -32,7 +32,8 @@ struct BatchBuffer {
                LinearStream *stream,
                void *endCmdPtr,
                bool useSingleSubdevice,
-                bool hasStallingCmds);
+                bool hasStallingCmds,
+                bool hasRelaxedOrderingDependencies);
    BatchBuffer() {}
    GraphicsAllocation *commandBufferAllocation = nullptr;
    size_t startOffset = 0u;
@@ -52,6 +53,7 @@ struct BatchBuffer {

    bool useSingleSubdevice = false;
    bool hasStallingCmds = false;
+    bool hasRelaxedOrderingDependencies = false;
    bool ringBufferRestartRequest = false;
 };

--- a/shared/source/direct_submission/direct_submission_hw.h
+++ b/shared/source/direct_submission/direct_submission_hw.h
@@ -114,12 +114,12 @@ class DirectSubmissionHw {

    void cpuCachelineFlush(void *ptr, size_t size);

-    void dispatchSemaphoreSection(uint32_t value, bool firstSubmission);
-    size_t getSizeSemaphoreSection(bool firstSubmission);
+    void dispatchSemaphoreSection(uint32_t value);
+    size_t getSizeSemaphoreSection(bool relaxedOrderingSchedulerRequired);

-    void dispatchRelaxedOrderingSchedulerSection(uint32_t value);
+    MOCKABLE_VIRTUAL void dispatchRelaxedOrderingSchedulerSection(uint32_t value);

-    void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr);
+    void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr, bool hasRelaxedOrderingDependencies);

    void dispatchStartSection(uint64_t gpuStartAddress);
    size_t getSizeStartSection();
@@ -127,10 +127,10 @@ class DirectSubmissionHw {
    void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
    size_t getSizeSwitchRingBufferSection();

-    void dispatchRelaxedOrderingQueueStall();
+    MOCKABLE_VIRTUAL void dispatchRelaxedOrderingQueueStall();
    size_t getSizeDispatchRelaxedOrderingQueueStall();

-    void dispatchTaskStoreSection(uint64_t taskStartSectionVa);
+    MOCKABLE_VIRTUAL void dispatchTaskStoreSection(uint64_t taskStartSectionVa);
    MOCKABLE_VIRTUAL void preinitializeRelaxedOrderingSections();

    void initRelaxedOrderingRegisters();
@@ -138,7 +138,7 @@ class DirectSubmissionHw {
    void setReturnAddress(void *returnCmd, uint64_t returnAddress);

    void *dispatchWorkloadSection(BatchBuffer &batchBuffer);
-    size_t getSizeDispatch();
+    size_t getSizeDispatch(bool relaxedOrderingSchedulerRequired);

    void dispatchPrefetchMitigation();
    size_t getSizePrefetchMitigation();
@@ -148,7 +148,7 @@ class DirectSubmissionHw {

    MOCKABLE_VIRTUAL void dispatchStaticRelaxedOrderingScheduler();

-    size_t getSizeEnd();
+    size_t getSizeEnd(bool relaxedOrderingSchedulerRequired);

    void dispatchPartitionRegisterConfiguration();
    size_t getSizePartitionRegisterConfigurationSection();
@@ -226,6 +226,6 @@ class DirectSubmissionHw {
    bool dcFlushRequired = false;
    bool relaxedOrderingEnabled = false;
    bool relaxedOrderingInitialized = false;
-    bool firstSubmissionAfterRingStart = true;
+    bool relaxedOrderingSchedulerRequired = false;
 };
 } // namespace NEO
--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -403,7 +403,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
    initDiagnostic(submitOnInit);
    if (ret && submitOnInit) {
        size_t startBufferSize = Dispatcher::getSizePreemption() +
-                                 getSizeSemaphoreSection(true);
+                                 getSizeSemaphoreSection(false);

        Dispatcher::dispatchPreemption(ringCommandStream);
        if (this->partitionedMode) {
@@ -431,7 +431,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
            dispatchDiagnosticModeSection();
            startBufferSize += getDiagnosticModeSection();
        }
-        dispatchSemaphoreSection(currentQueueWorkCount, true);
+        dispatchSemaphoreSection(currentQueueWorkCount);

        ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
        performDiagnosticMode();
@@ -446,7 +446,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
        return true;
    }

-    size_t startSize = getSizeSemaphoreSection(true);
+    size_t startSize = getSizeSemaphoreSection(false);
    if (!this->partitionConfigSet) {
        startSize += getSizePartitionRegisterConfigurationSection();
    }
@@ -457,7 +457,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
        startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
    }

-    size_t requiredSize = startSize + getSizeDispatch() + getSizeEnd();
+    size_t requiredSize = startSize + getSizeDispatch(false) + getSizeEnd(false);
    if (ringCommandStream.getAvailableSpace() < requiredSize) {
        switchRingBuffers();
    }
@@ -482,12 +482,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
    }

    currentQueueWorkCount++;
-    dispatchSemaphoreSection(currentQueueWorkCount, true);
+    dispatchSemaphoreSection(currentQueueWorkCount);

    ringStart = submit(gpuStartVa, startSize);

-    firstSubmissionAfterRingStart = true;
-
    return ringStart;
 }

@@ -497,7 +495,8 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
        return true;
    }

-    if (this->relaxedOrderingEnabled && !firstSubmissionAfterRingStart) {
+    bool relaxedOrderingSchedulerWasRequired = this->relaxedOrderingSchedulerRequired;
+    if (this->relaxedOrderingEnabled && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingQueueStall();
    }

@@ -515,7 +514,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
    EncodeNoop<GfxFamily>::emitNoop(ringCommandStream, bytesToPad);
    EncodeNoop<GfxFamily>::alignToCacheLine(ringCommandStream);

-    cpuCachelineFlush(flushPtr, getSizeEnd());
+    cpuCachelineFlush(flushPtr, getSizeEnd(relaxedOrderingSchedulerWasRequired));
    this->unblockGpu();
    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);

@@ -526,13 +525,13 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
 }

 template <typename GfxFamily, typename Dispatcher>
-inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value, bool firstSubmission) {
+inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value) {
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

    dispatchDisablePrefetcher(true);

-    if (this->relaxedOrderingEnabled && !firstSubmission) {
+    if (this->relaxedOrderingEnabled && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingSchedulerSection(value);
    } else {
        EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
@@ -550,9 +549,9 @@ inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(
 }

 template <typename GfxFamily, typename Dispatcher>
-inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection(bool firstSubmission) {
-    size_t semaphoreSize = (this->relaxedOrderingEnabled && !firstSubmission) ? RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize
-                                                                              : EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
+inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection(bool relaxedOrderingSchedulerRequired) {
+    size_t semaphoreSize = (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) ? RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize
+                                                                                              : EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
    semaphoreSize += getSizePrefetchMitigation();

    if (isDisablePrefetcherRequired) {
@@ -597,7 +596,7 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSwitchRingBuffer
 }

 template <typename GfxFamily, typename Dispatcher>
-inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
+inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd(bool relaxedOrderingSchedulerRequired) {
    size_t size = Dispatcher::getSizeStopCommandBuffer() +
                  Dispatcher::getSizeCacheFlush(*hwInfo) +
                  (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) +
@@ -605,15 +604,15 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
    if (disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }
-    if (this->relaxedOrderingEnabled) {
+    if (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) {
        size += getSizeDispatchRelaxedOrderingQueueStall();
    }
    return size;
 }

 template <typename GfxFamily, typename Dispatcher>
-inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
-    size_t size = getSizeSemaphoreSection(false);
+inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch(bool relaxedOrderingSchedulerRequired) {
+    size_t size = getSizeSemaphoreSection(relaxedOrderingSchedulerRequired);
    if (workloadMode == 0) {
        size += getSizeStartSection();
        if (this->relaxedOrderingEnabled) {
@@ -673,7 +672,7 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
        uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();

        if (this->relaxedOrderingEnabled) {
-            dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
+            dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer, batchBuffer.hasRelaxedOrderingDependencies);
        } else {
            setReturnAddress(returnCmd, returnGpuPointer);
        }
@@ -683,7 +682,7 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
    }
    // mode 2 does not dispatch any commands

-    if (this->relaxedOrderingEnabled) {
+    if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
        dispatchTaskStoreSection(batchBuffer.taskStartAddress);
    }

@@ -698,7 +697,7 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
                                         this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
    }

-    dispatchSemaphoreSection(currentQueueWorkCount + 1, false);
+    dispatchSemaphoreSection(currentQueueWorkCount + 1);
    return currentPosition;
 }

@@ -708,25 +707,31 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStal
                               EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());

    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
-    dispatchSemaphoreSection(currentQueueWorkCount, false);
+    dispatchSemaphoreSection(currentQueueWorkCount);

    // patch conditional bb_start with current GPU address
    EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(),
                                                                                      CS_GPR_R1, 0, CompareOperation::Equal, false);
+
+    relaxedOrderingSchedulerRequired = false;
 }

 template <typename GfxFamily, typename Dispatcher>
 size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
-    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
+    return getSizeSemaphoreSection(true) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
           EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();
 }

 template <typename GfxFamily, typename Dispatcher>
-void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) {
+void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr, bool hasRelaxedOrderingDependencies) {
    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R4, static_cast<uint32_t>(returnPtr & 0xFFFF'FFFFULL), true);
    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R4 + 4, static_cast<uint32_t>(returnPtr >> 32), true);

-    uint64_t returnPtrAfterTaskStoreSection = returnPtr + RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
+    uint64_t returnPtrAfterTaskStoreSection = returnPtr;
+
+    if (hasRelaxedOrderingDependencies) {
+        returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
+    }

    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R3, static_cast<uint32_t>(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true);
    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R3 + 4, static_cast<uint32_t>(returnPtrAfterTaskStoreSection >> 32), true);
@@ -846,24 +851,32 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe

    this->startRingBuffer();

-    size_t dispatchSize = getSizeDispatch();
+    bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
+
+    size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded);
    size_t cycleSize = getSizeSwitchRingBufferSection();
-    size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd();
+    size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(relaxedOrderingSchedulerWillBeNeeded);
    if (this->relaxedOrderingEnabled) {
-        if (batchBuffer.hasStallingCmds && !firstSubmissionAfterRingStart) {
+        requiredMinimalSize += RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
+
+        if (batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
            requiredMinimalSize += getSizeDispatchRelaxedOrderingQueueStall();
        }
-        requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>() + RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
+        if (batchBuffer.hasRelaxedOrderingDependencies) {
+            requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
+        }
    }

    if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
        switchRingBuffers();
    }

-    if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && !firstSubmissionAfterRingStart) {
+    if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingQueueStall();
    }

+    this->relaxedOrderingSchedulerRequired |= batchBuffer.hasRelaxedOrderingDependencies;
+
    handleNewResourcesSubmission();

    void *currentPosition = dispatchWorkloadSection(batchBuffer);
@@ -890,8 +903,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
    uint64_t flushValue = updateTagValue();
    flushStamp.setStamp(flushValue);

-    firstSubmissionAfterRingStart = false;
-
    return ringStart;
 }