refactor: split kernel residency into internal and argument containers

Related-To: NEO-11719
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
2026-01-03 06:49:52 +08:00 · 2024-07-23 12:14:04 +00:00
parent ae68df3832
commit 1c1e437d4b
16 changed files with 93 additions and 73 deletions
--- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl
@@ -254,8 +254,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
    appendSignalEventPostWalker(event, nullptr, nullptr, false, false, false);

    commandContainer.addToResidencyContainer(kernelImmutableData->getIsaGraphicsAllocation());
-    auto &residencyContainer = kernel->getResidencyContainer();
-    for (auto resource : residencyContainer) {
+    auto &argumentsResidencyContainer = kernel->getArgumentsResidencyContainer();
+    for (auto resource : argumentsResidencyContainer) {
+        commandContainer.addToResidencyContainer(resource);
+    }
+    auto &internalResidencyContainer = kernel->getInternalResidencyContainer();
+    for (auto resource : internalResidencyContainer) {
        commandContainer.addToResidencyContainer(resource);
    }

--- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
@@ -225,15 +225,20 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K

    bool isKernelUsingSystemAllocation = false;
    if (!launchParams.isBuiltInKernel) {
-        auto &kernelAllocations = kernel->getResidencyContainer();
-        for (auto &allocation : kernelAllocations) {
-            if (allocation == nullptr) {
-                continue;
+        auto verifyKernelUsingSystemAllocations = [&isKernelUsingSystemAllocation](const NEO::ResidencyContainer &kernelResidencyContainer) {
+            for (const auto &allocation : kernelResidencyContainer) {
+                if (allocation == nullptr) {
+                    continue;
+                }
+                if (allocation->getAllocationType() == NEO::AllocationType::bufferHostMemory) {
+                    isKernelUsingSystemAllocation = true;
+                }
            }
-            if (allocation->getAllocationType() == NEO::AllocationType::bufferHostMemory) {
-                isKernelUsingSystemAllocation = true;
-            }
-        }
+        };
+
+        verifyKernelUsingSystemAllocations(kernel->getArgumentsResidencyContainer());
+        verifyKernelUsingSystemAllocations(kernel->getInternalResidencyContainer());
+
    } else {
        isKernelUsingSystemAllocation = launchParams.isDestinationAllocationInSystemMemory;
    }
@@ -437,9 +442,13 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
    // Attach kernel residency to our CommandList residency
    {
        commandContainer.addToResidencyContainer(kernelImmutableData->getIsaGraphicsAllocation());
+        auto &internalResidencyContainer = kernel->getInternalResidencyContainer();
+        for (auto resource : internalResidencyContainer) {
+            commandContainer.addToResidencyContainer(resource);
+        }
        if (!launchParams.omitAddingKernelResidency) {
-            auto &residencyContainer = kernel->getResidencyContainer();
-            for (auto resource : residencyContainer) {
+            auto &argumentsResidencyContainer = kernel->getArgumentsResidencyContainer();
+            for (auto resource : argumentsResidencyContainer) {
                commandContainer.addToResidencyContainer(resource);
            }
        }
--- a/level_zero/core/source/kernel/kernel.h
+++ b/level_zero/core/source/kernel/kernel.h
@@ -148,7 +148,8 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {

    virtual const KernelImmutableData *getImmutableData() const = 0;

-    virtual const std::vector<NEO::GraphicsAllocation *> &getResidencyContainer() const = 0;
+    virtual const std::vector<NEO::GraphicsAllocation *> &getArgumentsResidencyContainer() const = 0;
+    virtual const std::vector<NEO::GraphicsAllocation *> &getInternalResidencyContainer() const = 0;

    virtual UnifiedMemoryControls getUnifiedMemoryControls() const = 0;
    virtual bool hasIndirectAllocationsAllowed() const = 0;
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -586,7 +586,7 @@ ze_result_t KernelImp::setArgImmediate(uint32_t argIndex, size_t argSize, const
 ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    if (argVal == nullptr) {
-        residencyContainer[argIndex] = nullptr;
+        argumentsResidencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

@@ -620,7 +620,7 @@ ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle
    } else {
        image->copyRedescribedSurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful);
    }
-    residencyContainer[argIndex] = image->getAllocation();
+    argumentsResidencyContainer[argIndex] = image->getAllocation();

    return ZE_RESULT_SUCCESS;
 }
@@ -656,7 +656,7 @@ ze_result_t KernelImp::setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal
        }
        this->setKernelArgUncached(argIndex, argIsUncacheable);
    }
-    residencyContainer[argIndex] = allocation;
+    argumentsResidencyContainer[argIndex] = allocation;

    return ZE_RESULT_SUCCESS;
 }
@@ -727,7 +727,7 @@ ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const voi
    }

    if (nullptr == argVal) {
-        residencyContainer[argIndex] = nullptr;
+        argumentsResidencyContainer[argIndex] = nullptr;
        const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
        uintptr_t nullBufferValue = 0;
        NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg, nullBufferValue);
@@ -774,7 +774,7 @@ ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const voi
        for (const auto &mappedAllocationData : allocData->virtualReservationData->mappedAllocations) {
            // Add additional allocations to the residency container if the virtual reservation spans multiple allocations.
            if (requestedAddress != mappedAllocationData.second->ptr) {
-                this->residencyContainer.push_back(mappedAllocationData.second->mappedAllocation->allocation);
+                this->argumentsResidencyContainer.push_back(mappedAllocationData.second->mappedAllocation->allocation);
            }
        }
    }
@@ -784,7 +784,7 @@ ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const voi

 ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (argVal == nullptr) {
-        residencyContainer[argIndex] = nullptr;
+        argumentsResidencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

@@ -824,10 +824,10 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void
        image->copySurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful, isMediaBlockImage);
    }

-    residencyContainer[argIndex] = image->getAllocation();
+    argumentsResidencyContainer[argIndex] = image->getAllocation();

    if (image->getImplicitArgsAllocation()) {
-        this->residencyContainer.push_back(image->getImplicitArgsAllocation());
+        this->argumentsResidencyContainer.push_back(image->getImplicitArgsAllocation());
    }

    auto imageInfo = image->getImageInfo();
@@ -1091,13 +1091,13 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
        }
    }

-    residencyContainer.resize(this->kernelArgHandlers.size(), nullptr);
+    argumentsResidencyContainer.resize(this->kernelArgHandlers.size(), nullptr);

    auto &kernelAttributes = kernelDescriptor.kernelAttributes;
    if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
        this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
        this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
-        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
+        this->internalResidencyContainer.push_back(this->privateMemoryGraphicsAllocation);
    }

    this->createPrintfBuffer();
@@ -1106,8 +1106,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {

    this->setAssertBuffer();

-    residencyContainer.insert(residencyContainer.end(), kernelImmData->getResidencyContainer().begin(),
-                              kernelImmData->getResidencyContainer().end());
+    internalResidencyContainer.insert(internalResidencyContainer.end(), kernelImmData->getResidencyContainer().begin(),
+                                      kernelImmData->getResidencyContainer().end());
    ModuleImp *moduleImp = reinterpret_cast<ModuleImp *>(this->module);
    const auto indirectDetectionVersion = moduleImp->getTranslationUnit()->programInfo.indirectDetectionVersion;

@@ -1138,7 +1138,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
        }

        for (auto rtStack : rtDispatchGlobalsInfo->rtStacks) {
-            this->residencyContainer.push_back(rtStack);
+            this->internalResidencyContainer.push_back(rtStack);
        }

        auto address = rtDispatchGlobalsInfo->rtDispatchGlobalsArray->getGpuAddressToPatch();
@@ -1151,7 +1151,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
            pImplicitArgs->rtGlobalBufferPtr = address;
        }

-        this->residencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
+        this->internalResidencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
    }
    this->midThreadPreemptionDisallowedForRayTracingKernels = productHelper.isMidThreadPreemptionDisallowedForRayTracingKernels();
    return ZE_RESULT_SUCCESS;
@@ -1160,7 +1160,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
 void KernelImp::createPrintfBuffer() {
    if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf || pImplicitArgs) {
        this->printfBuffer = PrintfHandler::createPrintfBuffer(this->module->getDevice());
-        this->residencyContainer.push_back(printfBuffer);
+        this->internalResidencyContainer.push_back(printfBuffer);
        if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf) {
            NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                              this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.printfSurfaceAddress,
@@ -1186,14 +1186,14 @@ bool KernelImp::usesRegionGroupBarrier() const {
 }

 void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
-    this->residencyContainer.push_back(gfxAllocation);
+    this->internalResidencyContainer.push_back(gfxAllocation);
    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress,
                      static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
 }

 void KernelImp::patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
-    this->residencyContainer.push_back(gfxAllocation);
+    this->internalResidencyContainer.push_back(gfxAllocation);

    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.regionGroupBarrierBuffer,
@@ -1335,7 +1335,7 @@ void KernelImp::setAssertBuffer() {
    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.assertBufferAddress,
                      static_cast<uintptr_t>(assertHandler->getAssertBuffer()->getGpuAddressToPatch()));
-    this->residencyContainer.push_back(assertHandler->getAssertBuffer());
+    this->internalResidencyContainer.push_back(assertHandler->getAssertBuffer());

    if (pImplicitArgs) {
        pImplicitArgs->assertBufferPtr = static_cast<uintptr_t>(assertHandler->getAssertBuffer()->getGpuAddressToPatch());
--- a/level_zero/core/source/kernel/kernel_imp.h
+++ b/level_zero/core/source/kernel/kernel_imp.h
@@ -75,8 +75,12 @@ struct KernelImp : Kernel {
    const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); }
    uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; }

-    const std::vector<NEO::GraphicsAllocation *> &getResidencyContainer() const override {
-        return residencyContainer;
+    const std::vector<NEO::GraphicsAllocation *> &getArgumentsResidencyContainer() const override {
+        return argumentsResidencyContainer;
+    }
+
+    const std::vector<NEO::GraphicsAllocation *> &getInternalResidencyContainer() const override {
+        return internalResidencyContainer;
    }

    ze_result_t setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal);
@@ -210,7 +214,8 @@ struct KernelImp : Kernel {
    typedef ze_result_t (KernelImp::*KernelArgHandler)(uint32_t argIndex, size_t argSize, const void *argVal);
    std::vector<KernelArgInfo> kernelArgInfos;
    std::vector<KernelImp::KernelArgHandler> kernelArgHandlers;
-    std::vector<NEO::GraphicsAllocation *> residencyContainer;
+    std::vector<NEO::GraphicsAllocation *> argumentsResidencyContainer;
+    std::vector<NEO::GraphicsAllocation *> internalResidencyContainer;

    std::mutex *devicePrintfKernelMutex = nullptr;
    NEO::GraphicsAllocation *printfBuffer = nullptr;