From a969636b6a293877b797c1979ce11b4d664854fd Mon Sep 17 00:00:00 2001
From: "Dunajski, Bartosz"
Date: Thu, 24 Nov 2022 15:00:51 +0000
Subject: [PATCH] RelaxedOrdering: Optimize GPU Queue stall by adding early
 return

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz
---
 shared/source/command_stream/linear_stream.h  |  7 +++++
 .../direct_submission/direct_submission_hw.h  |  2 --
 .../direct_submission_hw.inl                  | 30 +++++++++----------
 ...efetch_mitigation_xe_hp_core_and_later.inl |  5 ++--
 .../common/mocks/mock_direct_submission_hw.h  |  1 -
 .../direct_submission_tests_2.cpp             | 18 +++++++++--
 6 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/shared/source/command_stream/linear_stream.h b/shared/source/command_stream/linear_stream.h
index 27a9466cbb..36efc0d973 100644
--- a/shared/source/command_stream/linear_stream.h
+++ b/shared/source/command_stream/linear_stream.h
@@ -37,6 +37,8 @@ class LinearStream {
     uint64_t getGpuBase() const;
     void setGpuBase(uint64_t gpuAddress);
 
+    uint64_t getCurrentGpuAddressPosition() const;
+
     void overrideMaxSize(size_t newMaxSize);
     void replaceBuffer(void *buffer, size_t bufferSize);
     GraphicsAllocation *getGraphicsAllocation() const;
@@ -112,4 +114,9 @@ inline GraphicsAllocation *LinearStream::getGraphicsAllocation() const {
 inline void LinearStream::replaceGraphicsAllocation(GraphicsAllocation *gfxAllocation) {
     graphicsAllocation = gfxAllocation;
 }
+
+inline uint64_t LinearStream::getCurrentGpuAddressPosition() const {
+    return (getGpuBase() + getUsed());
+}
+
 } // namespace NEO
diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h
index 11f649830a..b5280224dc 100644
--- a/shared/source/direct_submission/direct_submission_hw.h
+++ b/shared/source/direct_submission/direct_submission_hw.h
@@ -148,8 +148,6 @@ class DirectSubmissionHw {
 
     size_t getSizeEnd();
 
-    uint64_t getCommandBufferPositionGpuAddress(void *position);
-
     void dispatchPartitionRegisterConfiguration();
     size_t getSizePartitionRegisterConfigurationSection();
 
diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl
index 603fbca6e7..4f209157f9 100644
--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -82,7 +82,7 @@ DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmis
 
 template <typename GfxFamily, typename Dispatcher>
 void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingSchedulerSection(uint32_t value) {
-    uint64_t schedulerStartAddress = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
+    uint64_t schedulerStartAddress = ringCommandStream.getCurrentGpuAddressPosition();
     uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();
 
     // 1. Init section
@@ -409,7 +409,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
     if (ringCommandStream.getAvailableSpace() < requiredSize) {
         switchRingBuffers();
     }
-    uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
+    uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();
 
     if (!this->partitionConfigSet) {
         dispatchPartitionRegisterConfiguration();
@@ -558,14 +558,6 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
     return size;
 }
 
-template <typename GfxFamily, typename Dispatcher>
-inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::getCommandBufferPositionGpuAddress(void *position) {
-    void *currentBase = ringCommandStream.getCpuBase();
-
-    size_t offset = ptrDiff(position, currentBase);
-    return ringCommandStream.getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(offset);
-}
-
 template <typename GfxFamily, typename Dispatcher>
 inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
     size_t size = getSizeSemaphoreSection(false);
@@ -624,8 +616,8 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
     }
 
     dispatchStartSection(commandStreamAddress);
-    void *returnPosition = ringCommandStream.getSpace(0);
-    uint64_t returnGpuPointer = getCommandBufferPositionGpuAddress(returnPosition);
+
+    uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();
 
     if (this->relaxedOrderingEnabled) {
         dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
@@ -659,13 +651,21 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
 
 template <typename GfxFamily, typename Dispatcher>
 void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStall() {
+    LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart()),
+                               EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());
+
     LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
     dispatchSemaphoreSection(currentQueueWorkCount, false);
+
+    // patch conditional bb_start with current GPU address
+    EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(),
+                                                                                      CS_GPR_R1, 0, CompareOperation::Equal, false);
 }
 
 template <typename GfxFamily, typename Dispatcher>
 size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
-    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
+    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
+           EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();
 }
 
 template <typename GfxFamily, typename Dispatcher>
@@ -762,8 +762,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
         requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>() + RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
     }
 
-    getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
-
     if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
         switchRingBuffers();
     }
@@ -828,7 +826,7 @@ template <typename GfxFamily, typename Dispatcher>
 inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffers() {
     GraphicsAllocation *nextRingBuffer = switchRingBuffersAllocations();
     void *flushPtr = ringCommandStream.getSpace(0);
-    uint64_t currentBufferGpuVa = getCommandBufferPositionGpuAddress(flushPtr);
+    uint64_t currentBufferGpuVa = ringCommandStream.getCurrentGpuAddressPosition();
 
     if (ringStart) {
         dispatchSwitchRingBufferSection(nextRingBuffer->getGpuAddress());
diff --git a/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl b/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
index 60bf11a8ec..9b2125d9ae 100644
--- a/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
+++ b/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -11,8 +11,7 @@ namespace NEO {
 
 template <typename GfxFamily, typename Dispatcher>
 inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchPrefetchMitigation() {
-    auto addressToJump = ptrOffset(ringCommandStream.getSpace(0u), getSizeStartSection());
-    dispatchStartSection(getCommandBufferPositionGpuAddress(addressToJump));
+    dispatchStartSection(ringCommandStream.getCurrentGpuAddressPosition() + getSizeStartSection());
 }
 
 template <typename GfxFamily, typename Dispatcher>
diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h b/shared/test/common/mocks/mock_direct_submission_hw.h
index 31712758aa..08e15b9a85 100644
--- a/shared/test/common/mocks/mock_direct_submission_hw.h
+++ b/shared/test/common/mocks/mock_direct_submission_hw.h
@@ -36,7 +36,6 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
     using BaseClass::dispatchStartSection;
     using BaseClass::dispatchSwitchRingBufferSection;
     using BaseClass::dispatchWorkloadSection;
-    using BaseClass::getCommandBufferPositionGpuAddress;
     using BaseClass::getDiagnosticModeSection;
     using BaseClass::getSizeDisablePrefetcher;
     using BaseClass::getSizeDispatch;
diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp
index 6643dd95e7..e0dc8abfad 100644
--- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp
+++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp
@@ -1761,8 +1761,15 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithStallingCmdsWhenDispa
     batchBuffer.hasStallingCmds = true;
     directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);
 
+    auto startAddress = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset);
+    auto jumpOffset = directSubmission.getSizeSemaphoreSection(false) + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM) +
+                      EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart();
+    uint64_t expectedJumpAddress = directSubmission.ringCommandStream.getGpuBase() + offset + jumpOffset;
+
+    EXPECT_TRUE(verifyConditionalDataRegBbStart<FamilyType>(startAddress, expectedJumpAddress, CS_GPR_R1, 0, CompareOperation::Equal, false));
+
     HardwareParse hwParse;
-    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset);
+    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset + EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart());
     hwParse.findHardwareCommands<FamilyType>();
 
     bool success = false;
@@ -1856,8 +1863,15 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenStoppingRingThenProgramSched
 
     directSubmission.stopRingBuffer();
 
+    auto startAddress = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset);
+    auto jumpOffset = directSubmission.getSizeSemaphoreSection(false) + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM) +
+                      EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart();
+    uint64_t expectedJumpAddress = directSubmission.ringCommandStream.getGpuBase() + offset + jumpOffset;
+
+    EXPECT_TRUE(verifyConditionalDataRegBbStart<FamilyType>(startAddress, expectedJumpAddress, CS_GPR_R1, 0, CompareOperation::Equal, false));
+
    HardwareParse hwParse;
-    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset);
+    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset + EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart());
     hwParse.findHardwareCommands<FamilyType>();
 
     bool success = false;
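
Reviewer note (not part of the patch): the queue-stall change above reserves space for a
conditional BB_START up front, emits the LRI and semaphore wait, and only afterwards
patches the reserved slot with the ring stream's current GPU address, so that a satisfied
condition on CS_GPR_R1 jumps straight past the stall (the "early return"). Below is a
minimal, self-contained sketch of that reserve-then-patch pattern under simplified
assumptions; Stream, conditionalBbStartSize, and dispatchQueueStall are illustrative
placeholders, not NEO APIs, and the memcpy stands in for the real command encoding.

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Toy linear stream: a CPU buffer mirrored at a fixed GPU virtual address.
    struct Stream {
        std::vector<uint8_t> cpu = std::vector<uint8_t>(4096);
        uint64_t gpuBase = 0x1000;
        size_t used = 0;

        void *getSpace(size_t size) {        // reserve bytes, return CPU pointer
            void *ptr = cpu.data() + used;
            used += size;
            return ptr;
        }
        uint64_t currentGpuAddress() const { // GPU VA of the next free byte
            return gpuBase + used;
        }
    };

    constexpr size_t conditionalBbStartSize = 16; // placeholder command size

    void dispatchQueueStall(Stream &ring) {
        // 1. Reserve the conditional jump slot before emitting the stall.
        auto *bbStartSlot = static_cast<uint8_t *>(ring.getSpace(conditionalBbStartSize));

        // 2. Emit the stalling commands (LRI + semaphore wait in the real code).
        ring.getSpace(32); // stand-in for the LRI + semaphore section

        // 3. Patch the reserved slot with the address *after* the stall, so the
        //    conditional jump can skip the wait entirely when it is taken.
        uint64_t jumpTarget = ring.currentGpuAddress();
        std::memcpy(bbStartSlot, &jumpTarget, sizeof(jumpTarget)); // stand-in encode
    }

    int main() {
        Stream ring;
        dispatchQueueStall(ring);
        uint64_t patched = 0;
        std::memcpy(&patched, ring.cpu.data(), sizeof(patched));
        std::cout << std::hex << patched << "\n"; // 0x1000 + 16 + 32 = 0x1030
        return 0;
    }

The same ordering constraint explains the new unit-test expectations: the jump target is
the semaphore-section size plus the LRI and conditional BB_START sizes past the dispatch
offset, which is exactly what getSizeDispatchRelaxedOrderingQueueStall() now accounts for.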