RelaxedOrdering: Optimize GPU Queue stall by adding early return

Related-To: NEO-7458
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2026-01-03 14:55:24 +08:00 · 2022-11-24 15:00:51 +00:00
parent a104636b31
commit a969636b6a
6 changed files with 39 additions and 24 deletions
--- a/shared/source/command_stream/linear_stream.h
+++ b/shared/source/command_stream/linear_stream.h
@@ -37,6 +37,8 @@ class LinearStream {
    uint64_t getGpuBase() const;
    void setGpuBase(uint64_t gpuAddress);

+    uint64_t getCurrentGpuAddressPosition() const;
+
    void overrideMaxSize(size_t newMaxSize);
    void replaceBuffer(void *buffer, size_t bufferSize);
    GraphicsAllocation *getGraphicsAllocation() const;
@@ -112,4 +114,9 @@ inline GraphicsAllocation *LinearStream::getGraphicsAllocation() const {
 inline void LinearStream::replaceGraphicsAllocation(GraphicsAllocation *gfxAllocation) {
    graphicsAllocation = gfxAllocation;
 }
+
+inline uint64_t LinearStream::getCurrentGpuAddressPosition() const { // returns getGpuBase() advanced by getUsed()
+    return (getGpuBase() + getUsed());
+}
+
 } // namespace NEO
--- a/shared/source/direct_submission/direct_submission_hw.h
+++ b/shared/source/direct_submission/direct_submission_hw.h
@@ -148,8 +148,6 @@ class DirectSubmissionHw {

    size_t getSizeEnd();

-    uint64_t getCommandBufferPositionGpuAddress(void *position);
-
    void dispatchPartitionRegisterConfiguration();
    size_t getSizePartitionRegisterConfigurationSection();

--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -82,7 +82,7 @@ DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmis

 template <typename GfxFamily, typename Dispatcher>
 void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingSchedulerSection(uint32_t value) {
-    uint64_t schedulerStartAddress = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
+    uint64_t schedulerStartAddress = ringCommandStream.getCurrentGpuAddressPosition();
    uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();

    // 1. Init section
@@ -409,7 +409,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
    if (ringCommandStream.getAvailableSpace() < requiredSize) {
        switchRingBuffers();
    }
-    uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
+    uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();

    if (!this->partitionConfigSet) {
        dispatchPartitionRegisterConfiguration();
@@ -558,14 +558,6 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
    return size;
 }

-template <typename GfxFamily, typename Dispatcher>
-inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::getCommandBufferPositionGpuAddress(void *position) {
-    void *currentBase = ringCommandStream.getCpuBase();
-
-    size_t offset = ptrDiff(position, currentBase);
-    return ringCommandStream.getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(offset);
-}
-
 template <typename GfxFamily, typename Dispatcher>
 inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
    size_t size = getSizeSemaphoreSection(false);
@@ -624,8 +616,8 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
        }

        dispatchStartSection(commandStreamAddress);
-        void *returnPosition = ringCommandStream.getSpace(0);
-        uint64_t returnGpuPointer = getCommandBufferPositionGpuAddress(returnPosition);
+
+        uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();

        if (this->relaxedOrderingEnabled) {
            dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
@@ -659,13 +651,21 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu

 template <typename GfxFamily, typename Dispatcher>
 void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStall() {
+    LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart()), // reserve room in the ring first; the command itself is written into it at the end of this function
+                               EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());
+
    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
    dispatchSemaphoreSection(currentQueueWorkCount, false);
+
+    // fill the space reserved at the top: the jump target is only known now, as the ring position after the commands dispatched above
+    EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(),
+                                                                                      CS_GPR_R1, 0, CompareOperation::Equal, false); // NOTE(review): presumably the jump is taken when CS_GPR_R1 == 0 - confirm against the encoder
 }

 template <typename GfxFamily, typename Dispatcher>
 size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
-    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
+    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
+           EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart(); // matches the space dispatchRelaxedOrderingQueueStall() reserves for its conditional bb_start
 }

 template <typename GfxFamily, typename Dispatcher>
@@ -762,8 +762,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
        requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>() + RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
    }

-    getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
-
    if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
        switchRingBuffers();
    }
@@ -828,7 +826,7 @@ template <typename GfxFamily, typename Dispatcher>
 inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffers() {
    GraphicsAllocation *nextRingBuffer = switchRingBuffersAllocations();
    void *flushPtr = ringCommandStream.getSpace(0);
-    uint64_t currentBufferGpuVa = getCommandBufferPositionGpuAddress(flushPtr);
+    uint64_t currentBufferGpuVa = ringCommandStream.getCurrentGpuAddressPosition();

    if (ringStart) {
        dispatchSwitchRingBufferSection(nextRingBuffer->getGpuAddress());
--- a/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
+++ b/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -11,8 +11,7 @@ namespace NEO {

 template <typename GfxFamily, typename Dispatcher>
 inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchPrefetchMitigation() {
-    auto addressToJump = ptrOffset(ringCommandStream.getSpace(0u), getSizeStartSection());
-    dispatchStartSection(getCommandBufferPositionGpuAddress(addressToJump));
+    dispatchStartSection(ringCommandStream.getCurrentGpuAddressPosition() + getSizeStartSection()); // jump target lies getSizeStartSection() past the current ring position - presumably just behind the start section dispatched here
 }

 template <typename GfxFamily, typename Dispatcher>