performance: Optimize ULLS start on submit path

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
2026-01-08 14:02:58 +08:00 · 2023-12-27 06:50:44 +00:00
parent fd68e4f0cf
commit ea78831e28
6 changed files with 84 additions and 160 deletions
--- a/shared/source/direct_submission/direct_submission_hw.h
+++ b/shared/source/direct_submission/direct_submission_hw.h
@@ -82,8 +82,6 @@ class DirectSubmissionHw {

    MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking);

-    bool startRingBuffer();
-
    MOCKABLE_VIRTUAL bool dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp);
    uint32_t getDispatchErrorCode();

@@ -121,6 +119,7 @@ class DirectSubmissionHw {
    virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
    virtual void getTagAddressValue(TagData &tagData) = 0;
    void unblockGpu();
+    bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size);
    bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);

    void cpuCachelineFlush(void *ptr, size_t size);
@@ -135,6 +134,9 @@ class DirectSubmissionHw {
    void dispatchStartSection(uint64_t gpuStartAddress);
    size_t getSizeStartSection();

+    size_t getUllsStateSize();
+    void dispatchUllsState();
+
    void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
    size_t getSizeSwitchRingBufferSection();

--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -499,55 +499,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
    return ret;
 }

-template <typename GfxFamily, typename Dispatcher>
-bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
-    if (ringStart) {
-        return true;
-    }
-
-    size_t startSize = getSizeSemaphoreSection(false);
-    if (!this->partitionConfigSet) {
-        startSize += getSizePartitionRegisterConfigurationSection();
-    }
-    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
-        startSize += getSizeSystemMemoryFenceAddress();
-    }
-    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
-        startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
-    }
-
-    size_t requiredSize = startSize + getSizeDispatch(false, false, dispatchMonitorFenceRequired(true)) + getSizeEnd(false);
-    if (ringCommandStream.getAvailableSpace() < requiredSize) {
-        switchRingBuffers(nullptr);
-    }
-    uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();
-
-    if (!this->partitionConfigSet) {
-        dispatchPartitionRegisterConfiguration();
-        this->partitionConfigSet = true;
-    }
-
-    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
-        dispatchSystemMemoryFenceAddress();
-        this->systemMemoryFenceAddressSet = true;
-    }
-
-    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
-        preinitializeRelaxedOrderingSections();
-        dispatchStaticRelaxedOrderingScheduler();
-        initRelaxedOrderingRegisters();
-
-        this->relaxedOrderingInitialized = true;
-    }
-
-    currentQueueWorkCount++;
-    dispatchSemaphoreSection(currentQueueWorkCount);
-
-    ringStart = submit(gpuStartVa, startSize);
-
-    return ringStart;
-}
-
 template <typename GfxFamily, typename Dispatcher>
 bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer(bool blocking) {
    if (!ringStart) {
@@ -940,15 +891,46 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchB
    return ret;
 }

+// Returns the combined size of the one-time state sections that have not been
+// dispatched yet: the partition register configuration, the system memory fence
+// address and the relaxed ordering register initialization. A section adds
+// nothing once its corresponding "set"/"initialized" flag is true.
+template <typename GfxFamily, typename Dispatcher>
+size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getUllsStateSize() {
+    const bool partitionConfigPending = !this->partitionConfigSet;
+    size_t pendingStateSize = partitionConfigPending ? getSizePartitionRegisterConfigurationSection() : 0u;
+
+    const bool memFencePending = this->miMemFenceRequired && !this->systemMemoryFenceAddressSet;
+    if (memFencePending) {
+        pendingStateSize += getSizeSystemMemoryFenceAddress();
+    }
+
+    const bool relaxedOrderingPending = this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized;
+    if (relaxedOrderingPending) {
+        pendingStateSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
+    }
+
+    return pendingStateSize;
+}
+
+template <typename GfxFamily, typename Dispatcher>
+void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() { // Dispatches every one-time state section that is still pending and marks it as done.
+    if (!this->partitionConfigSet) { // partition register configuration has not been dispatched yet
+        dispatchPartitionRegisterConfiguration();
+        this->partitionConfigSet = true; // later calls skip this section
+    }
+    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) { // fence address is required and has not been dispatched yet
+        dispatchSystemMemoryFenceAddress();
+        this->systemMemoryFenceAddressSet = true; // later calls skip this section
+    }
+    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) { // relaxed ordering is enabled but not initialized yet
+        preinitializeRelaxedOrderingSections();
+        dispatchStaticRelaxedOrderingScheduler();
+        initRelaxedOrderingRegisters();
+
+        this->relaxedOrderingInitialized = true; // the three calls above run only once
+    }
+}
+
 template <typename GfxFamily, typename Dispatcher>
 bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
    if (batchBuffer.ringBufferRestartRequest) {
        this->stopRingBuffer(false);
    }

-    if (!this->startRingBuffer()) {
-        return false;
-    }
    lastSubmittedThrottle = batchBuffer.throttle;
    bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
    bool inputRequiredMonitorFence = false;
@@ -959,7 +941,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
    }
    bool dispatchMonitorFence = this->dispatchMonitorFenceRequired(inputRequiredMonitorFence);

-    size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
+    size_t dispatchSize = this->getUllsStateSize() + getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);

    if (this->copyCommandBufferIntoRing(batchBuffer)) {
        dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
@@ -978,8 +960,14 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
        }
    }

+    auto needStart = !this->ringStart;
+    this->ringStart = true;
+    auto startVA = ringCommandStream.getCurrentGpuAddressPosition();
+
    this->switchRingBuffersNeeded(requiredMinimalSize, batchBuffer.allocationsForResidency);

+    this->dispatchUllsState();
+
    if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingQueueStall();
    }
@@ -991,9 +979,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
    void *currentPosition = dispatchWorkloadSection(batchBuffer, dispatchMonitorFence);

    cpuCachelineFlush(currentPosition, dispatchSize);
-    handleResidency();

-    this->unblockGpu();
+    if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize)) {
+        return false;
+    }

    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
    currentQueueWorkCount++;
@@ -1008,6 +997,17 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
    return ringStart;
 }

+// Makes the prepared commands visible to the GPU.
+//   needStart  - when true, the work is handed over through submit().
+//   gpuAddress - GPU address forwarded to submit() (unused when needStart is false).
+//   size       - size forwarded to submit() (unused when needStart is false).
+// Returns the result of submit() when needStart is true; otherwise handles
+// residency, unblocks the GPU and returns true.
+template <typename GfxFamily, typename Dispatcher>
+bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size) {
+    if (!needStart) {
+        // No submit() on this path: only residency handling and unblocking.
+        handleResidency();
+        this->unblockGpu();
+        return true;
+    }
+
+    return this->submit(gpuAddress, size);
+}
+
 template <typename GfxFamily, typename Dispatcher>
 inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
--- a/shared/source/direct_submission/windows/wddm_direct_submission.inl
+++ b/shared/source/direct_submission/windows/wddm_direct_submission.inl
@@ -55,7 +55,9 @@ WddmDirectSubmission<GfxFamily, Dispatcher>::~WddmDirectSubmission() {

 template <typename GfxFamily, typename Dispatcher>
 inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
-    this->startRingBuffer();
+    auto needStart = !this->ringStart;
+    this->ringStart = true;
+    auto startVA = this->ringCommandStream.getCurrentGpuAddressPosition();

    size_t requiredMinimalSize = this->getSizeSemaphoreSection(false) +
                                 Dispatcher::getSizeMonitorFence(this->rootDeviceEnvironment) +
@@ -71,8 +73,7 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
    Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);

    this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
-    this->handleResidency();
-    this->unblockGpu();
+    this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize);
    this->currentQueueWorkCount++;

    this->updateTagValueImpl(this->currentRingBuffer);