feature: Introduce ULLS light

Add core implementation of ULLS without VM_BIND interface aka ULLS light. Related-To: NEO-13922 Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
2026-01-05 09:09:04 +08:00 · 2025-02-11 14:31:01 +00:00
parent 488ac4bb6a
commit bc2b49b958
23 changed files with 293 additions and 62 deletions
--- a/shared/source/direct_submission/direct_submission_hw.h
+++ b/shared/source/direct_submission/direct_submission_hw.h
@@ -112,8 +112,9 @@ class DirectSubmissionHw {
    MOCKABLE_VIRTUAL void deallocateResources();
    MOCKABLE_VIRTUAL bool makeResourcesResident(DirectSubmissionAllocations &allocations);
    virtual bool allocateOsResources() = 0;
-    virtual bool submit(uint64_t gpuAddress, size_t size) = 0;
+    virtual bool submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationsForResidency) = 0;
    virtual bool handleResidency() = 0;
+    virtual void handleRingRestartForUllsLightResidency(ResidencyContainer *allocationsForResidency){};
    void handleNewResourcesSubmission();
    bool isNewResourceHandleNeeded();
    size_t getSizeNewResourceHandler();
@@ -129,7 +130,7 @@ class DirectSubmissionHw {
    virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
    virtual void getTagAddressValue(TagData &tagData) = 0;
    void unblockGpu();
-    bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait);
+    bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait, ResidencyContainer *allocationsForResidency);
    bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);

    void cpuCachelineFlush(void *ptr, size_t size);
--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -502,7 +502,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit) {
        }
        dispatchSemaphoreSection(currentQueueWorkCount);

-        ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
+        ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize, nullptr);
        performDiagnosticMode();
        return ringStart;
    }
@@ -964,6 +964,8 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() {

 template <typename GfxFamily, typename Dispatcher>
 bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
+    this->handleRingRestartForUllsLightResidency(batchBuffer.allocationsForResidency);
+
    lastSubmittedThrottle = batchBuffer.throttle;
    bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
    bool inputRequiredMonitorFence = false;
@@ -1017,7 +1019,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
    cpuCachelineFlush(currentPosition, dispatchSize);

    auto requiresBlockingResidencyHandling = batchBuffer.pagingFenceSemInfo.requiresBlockingResidencyHandling;
-    if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, requiresBlockingResidencyHandling)) {
+    if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, requiresBlockingResidencyHandling, batchBuffer.allocationsForResidency)) {
        return false;
    }

@@ -1035,9 +1037,9 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
 }

 template <typename GfxFamily, typename Dispatcher>
-bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait) {
+bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait, ResidencyContainer *allocationsForResidency) {
    if (needStart) {
-        this->ringStart = this->submit(gpuAddress, size);
+        this->ringStart = this->submit(gpuAddress, size, allocationsForResidency);
        return this->ringStart;
    } else {
        if (needWait) {
--- a/shared/source/direct_submission/linux/drm_direct_submission.h
+++ b/shared/source/direct_submission/linux/drm_direct_submission.h
@@ -7,6 +7,8 @@

 #pragma once
 #include "shared/source/direct_submission/direct_submission_hw.h"
+#include "shared/source/os_interface/linux/drm_buffer_object.h"
+#include "shared/source/os_interface/linux/drm_wrappers.h"

 namespace NEO {

@@ -24,9 +26,10 @@ class DrmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {

  protected:
    bool allocateOsResources() override;
-    bool submit(uint64_t gpuAddress, size_t size) override;
+    bool submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationsForResidency) override;

    bool handleResidency() override;
+    void handleRingRestartForUllsLightResidency(ResidencyContainer *allocationsForResidency) override;
    void handleStopRingBuffer() override;

    void ensureRingCompletion() override;
@@ -43,5 +46,8 @@ class DrmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {
    volatile TagAddressType *tagAddress;
    TaskCountType completionFenceValue{};
    std::chrono::microseconds gpuHangCheckPeriod{CommonConstants::gpuHangCheckTimeInUS};
+
+    std::vector<BufferObject *> residency{};
+    std::vector<ExecObject> execObjectsStorage{};
 };
 } // namespace NEO
--- a/shared/source/direct_submission/linux/drm_direct_submission.inl
+++ b/shared/source/direct_submission/linux/drm_direct_submission.inl
@@ -12,6 +12,7 @@
 #include "shared/source/direct_submission/linux/drm_direct_submission.h"
 #include "shared/source/os_interface/linux/drm_allocation.h"
 #include "shared/source/os_interface/linux/drm_buffer_object.h"
+#include "shared/source/os_interface/linux/drm_memory_operations_handler.h"
 #include "shared/source/os_interface/linux/drm_neo.h"
 #include "shared/source/os_interface/linux/drm_wrappers.h"
 #include "shared/source/os_interface/linux/ioctl_helper.h"
@@ -27,6 +28,7 @@ namespace NEO {
 template <typename GfxFamily, typename Dispatcher>
 DrmDirectSubmission<GfxFamily, Dispatcher>::DrmDirectSubmission(const DirectSubmissionInputParams &inputParams)
    : DirectSubmissionHw<GfxFamily, Dispatcher>(inputParams) {
+    this->execObjectsStorage.resize(1u);

    this->completionFenceValue = inputParams.initialCompletionFenceValue;
    if (debugManager.flags.OverrideUserFenceStartValue.get() != -1) {
@@ -124,7 +126,7 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::allocateOsResources() {
 }

 template <typename GfxFamily, typename Dispatcher>
-bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size) {
+bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationsForResidency) {
    auto bb = static_cast<DrmAllocation *>(this->ringCommandStream.getGraphicsAllocation())->getBO();

    auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
@@ -132,9 +134,9 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
    auto execFlags = osContextLinux->getEngineFlag() | drm.getIoctlHelper()->getDrmParamValue(DrmParam::execNoReloc);
    auto &drmContextIds = osContextLinux->getDrmContextIds();

-    ExecObject execObject{};
-
-    this->handleResidency();
+    if (!allocationsForResidency) {
+        this->handleResidency();
+    }

    auto currentBase = this->ringCommandStream.getGraphicsAllocation()->getGpuAddress();
    auto offset = ptrDiff(gpuAddress, currentBase);
@@ -151,21 +153,33 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz

    for (auto drmIterator = 0u; drmIterator < osContextLinux->getDeviceBitfield().size(); drmIterator++) {
        if (osContextLinux->getDeviceBitfield().test(drmIterator)) {
-            uint32_t errorCode = bb->exec(static_cast<uint32_t>(size),
-                                          offset,
-                                          execFlags,
-                                          false,
-                                          &this->osContext,
-                                          drmIterator,
-                                          drmContextIds[drmContextId],
-                                          nullptr,
-                                          0,
-                                          &execObject,
-                                          completionFenceGpuAddress,
-                                          completionValue);
+            auto size = allocationsForResidency ? allocationsForResidency->size() : 0u;
+            for (uint32_t i = 0; i < size; i++) {
+                auto drmAlloc = static_cast<DrmAllocation *>((*allocationsForResidency)[i]);
+                drmAlloc->makeBOsResident(&this->osContext, drmIterator, &this->residency, false, false);
+            }
+
+            auto requiredSize = this->residency.size() + 1;
+            if (requiredSize > this->execObjectsStorage.size()) {
+                this->execObjectsStorage.resize(requiredSize);
+            }
+
+            auto errorCode = bb->exec(static_cast<uint32_t>(size),
+                                      offset,
+                                      execFlags,
+                                      false,
+                                      &this->osContext,
+                                      drmIterator,
+                                      drmContextIds[drmContextId],
+                                      this->residency.data(),
+                                      this->residency.size(),
+                                      this->execObjectsStorage.data(),
+                                      completionFenceGpuAddress,
+                                      completionValue);
            if (errorCode != 0) {
                this->dispatchErrorCode = errorCode;
                ret = false;
+                break;
            }
            drmContextId++;
            if (completionFenceGpuAddress) {
@@ -174,6 +188,8 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
        }
    }

+    this->residency.clear();
+
    if (this->isCompletionFenceSupported() && ret) {
        completionFenceValue++;
    }
@@ -188,6 +204,16 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::handleResidency() {
    return true;
 }

+template <typename GfxFamily, typename Dispatcher>
+void DrmDirectSubmission<GfxFamily, Dispatcher>::handleRingRestartForUllsLightResidency(ResidencyContainer *allocationsForResidency) {
+    if (allocationsForResidency) {
+        auto restartNeeded = static_cast<DrmMemoryOperationsHandler *>(this->memoryOperationHandler)->obtainAndResetNewResourcesSinceLastRingSubmit();
+        if (restartNeeded) {
+            this->stopRingBuffer(false);
+        }
+    }
+}
+
 template <typename GfxFamily, typename Dispatcher>
 void DrmDirectSubmission<GfxFamily, Dispatcher>::handleStopRingBuffer() {
    if (this->disableMonitorFence) {
@@ -209,6 +235,11 @@ void DrmDirectSubmission<GfxFamily, Dispatcher>::handleSwitchRingBuffers(Residen
            this->ringBuffers[this->previousRingBuffer].completionFence = this->currentTagData.tagValue;
        }
    }
+
+    if (allocationsForResidency) {
+        allocationsForResidency->clear();
+        static_cast<DrmMemoryOperationsHandler *>(this->memoryOperationHandler)->mergeWithResidencyContainer(&this->osContext, *allocationsForResidency);
+    }
 }

 template <typename GfxFamily, typename Dispatcher>
--- a/shared/source/direct_submission/windows/wddm_direct_submission.h
+++ b/shared/source/direct_submission/windows/wddm_direct_submission.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2024 Intel Corporation
+ * Copyright (C) 2020-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,7 +29,7 @@ class WddmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {

  protected:
    bool allocateOsResources() override;
-    bool submit(uint64_t gpuAddress, size_t size) override;
+    bool submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationForResidency) override;

    bool handleResidency() override;
    void handleCompletionFence(uint64_t completionValue, MonitoredFence &fence);
--- a/shared/source/direct_submission/windows/wddm_direct_submission.inl
+++ b/shared/source/direct_submission/windows/wddm_direct_submission.inl
@@ -73,7 +73,7 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
    Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->partitionedMode, this->dcFlushRequired);

    this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
-    this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, true);
+    this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, true, nullptr);
    this->currentQueueWorkCount++;

    this->updateTagValueImpl(this->currentRingBuffer);
@@ -92,7 +92,7 @@ bool WddmDirectSubmission<GfxFamily, Dispatcher>::allocateOsResources() {
 }

 template <typename GfxFamily, typename Dispatcher>
-bool WddmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size) {
+bool WddmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationForResidency) {
    perfLogResidencyVariadicLog(wddm->getResidencyLogger(), "ULLS Submit to GPU\n");
    COMMAND_BUFFER_HEADER *pHeader = reinterpret_cast<COMMAND_BUFFER_HEADER *>(commandBufferHeader.get());
    pHeader->RequiresCoherency = false;