performance: use staging buffer as a pool

Related-To: NEO-11501

Currently the whole staging buffer is consumed even if the size of
the transfer is smaller than that buffer. This commit changes that,
so a single staging buffer might be utilized by several smaller
transfers, as long as they don't exceed its total size.
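
Illustrative sketch (standalone C++, not NEO code; the 2MB buffer and 512KB
transfers are only example sizes): with pooling, several small transfers are
carved out of a single staging buffer, and a new buffer is needed only once
the current one is full.

#include <cstddef>
#include <iostream>

// Toy pool: hands out offsets inside one staging buffer until it is full.
struct StagingPoolSketch {
    size_t capacity;
    size_t used = 0;
    bool reserve(size_t size, size_t &offset) {
        if (used + size > capacity) {
            return false; // does not fit, caller has to take a fresh buffer
        }
        offset = used;
        used += size;
        return true;
    }
};

int main() {
    constexpr size_t chunkSize = 2 * 1024 * 1024; // matches the manager's default (pageSize2M)
    StagingPoolSketch pool{chunkSize};
    size_t buffersNeeded = 1;
    const size_t transfers[] = {512 * 1024, 512 * 1024, 512 * 1024, 512 * 1024};
    for (size_t transferSize : transfers) {
        size_t offset = 0;
        if (!pool.reserve(transferSize, offset)) {
            pool = StagingPoolSketch{chunkSize}; // pool exhausted, start a new staging buffer
            pool.reserve(transferSize, offset);
            ++buffersNeeded;
        }
        std::cout << "transfer of " << transferSize << " bytes placed at offset " << offset << "\n";
    }
    // 1 buffer with pooling; previously each in-flight transfer consumed a whole buffer.
    std::cout << "staging buffers needed: " << buffersNeeded << "\n";
    return 0;
}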

Signed-off-by: Morek, Szymon <szymon.morek@intel.com>
Author:     Morek, Szymon <szymon.morek@intel.com>
Date:       2024-07-05 11:33:04 +00:00
Committed:  Compute-Runtime-Automation
Commit:     ed7fc9acc9
Parent:     85e708819a
3 changed files with 144 additions and 40 deletions


@@ -11,9 +11,18 @@
 #include "shared/source/debug_settings/debug_settings_manager.h"
 #include "shared/source/device/device.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
+#include "shared/source/utilities/heap_allocator.h"
 
 namespace NEO {
 
+StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
+    this->allocator = std::make_unique<HeapAllocator>(castToUint64(baseAddress), size, MemoryConstants::pageSize, 0u);
+}
+
+StagingBuffer::StagingBuffer(StagingBuffer &&other) : baseAddress(other.baseAddress) {
+    this->allocator.reset(other.allocator.release());
+}
+
 StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
     if (debugManager.flags.StagingBufferSize.get() != -1) {
         chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
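
The constructors added above hand the buffer's base address to a HeapAllocator
as a plain integer, which is why castToUint64 shows up here and addrToPtr shows
up at the copy site. A minimal standalone sketch of that address round trip;
toUint64/toPtr below are local stand-ins, not the NEO helpers.

#include <cstdint>

// Local stand-ins for NEO's castToUint64/addrToPtr helpers.
static uint64_t toUint64(void *ptr) { return reinterpret_cast<uint64_t>(ptr); }
static void *toPtr(uint64_t address) { return reinterpret_cast<void *>(address); }

int main() {
    static char stagingMemory[4096];                     // pretend this is the staging buffer
    uint64_t base = toUint64(stagingMemory);             // what the allocator is constructed with
    uint64_t chunk = base + 1024;                        // an address the allocator might hand out
    char *chunkPtr = static_cast<char *>(toPtr(chunk));  // converted back before the actual copy
    return chunkPtr == stagingMemory + 1024 ? 0 : 1;     // round trip preserves the address
}
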
@@ -22,27 +31,28 @@ StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, c
 
 StagingBufferManager::~StagingBufferManager() {
     for (auto &stagingBuffer : stagingBuffers) {
-        svmAllocsManager->freeSVMAlloc(stagingBuffer.first->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBuffer());
+        svmAllocsManager->freeSVMAlloc(stagingBuffer.getBaseAddress());
     }
 }
 
 /*
  * This method performs 4 steps for single chunk copy
- * 1. Get existing staging buffer, if can't - allocate new one,
+ * 1. Get existing chunk of staging buffer, if can't - allocate new one,
  * 2. Perform actual copy,
- * 3. Store used buffer back to the container (with current task count)
- * 4. Update tag to reuse previous buffers within same API call
+ * 3. Store used buffer to tracking container (with current task count)
+ * 4. Update tag if required to reuse this buffer in next chunk copies
  */
 int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
-    auto rootDeviceIndex = csr->getRootDeviceIndex();
-    auto taskCount = *csr->getTagAddress();
-    auto stagingBuffer = getExistingBuffer(taskCount, rootDeviceIndex);
-    if (stagingBuffer == nullptr) {
-        stagingBuffer = allocateStagingBuffer();
+    auto allocatedSize = size;
+    auto [allocator, chunkBuffer] = requestStagingBuffer(allocatedSize, csr);
+    auto ret = chunkCopyFunc(chunkDst, addrToPtr(chunkBuffer), chunkSrc, size);
+    {
+        auto lock = std::lock_guard<std::mutex>(mtx);
+        trackers.push_back({allocator, chunkBuffer, allocatedSize, csr->peekTaskCount()});
+    }
+    if (csr->isAnyDirectSubmissionEnabled()) {
+        csr->flushTagUpdate();
     }
-    auto ret = chunkCopyFunc(chunkDst, stagingBuffer, chunkSrc, size);
-    storeBuffer(stagingBuffer, csr->peekTaskCount());
-    csr->flushTagUpdate();
     return ret;
 }
 
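
A compact standalone model of the four steps listed in the comment above, using
stub types (the real CommandStreamReceiver, GPU chunk copy, and
StagingBufferTracker are not reproduced here); step 1 is assumed to have
produced stagingChunk already.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <vector>

struct CsrStub {                         // stand-in for CommandStreamReceiver
    uint64_t taskCount = 1;
    bool directSubmissionEnabled = false;
    uint64_t peekTaskCount() const { return taskCount; }
    void flushTagUpdate() { /* the real CSR would emit a tag update here */ }
};

struct ChunkTracker {                    // plays the role of StagingBufferTracker
    void *chunk;
    size_t size;
    uint64_t taskCountToWait;
};

std::mutex trackerMutex;
std::vector<ChunkTracker> trackers;

// 2. copy through the chunk, 3. remember it with the task count to wait for,
// 4. flush the tag only when direct submission is enabled (as in the diff).
int copyThroughChunk(void *dst, void *stagingChunk, const void *src, size_t size, CsrStub &csr) {
    std::memcpy(stagingChunk, src, size);            // the real path launches a GPU copy instead
    std::memcpy(dst, stagingChunk, size);
    {
        std::lock_guard<std::mutex> lock(trackerMutex);
        trackers.push_back({stagingChunk, size, csr.peekTaskCount()});
    }
    if (csr.directSubmissionEnabled) {
        csr.flushTagUpdate();
    }
    return 0;
}

int main() {
    CsrStub csr;
    char src[32] = "staged payload";
    char staging[32] = {};
    char dst[32] = {};
    return copyThroughChunk(dst, staging, src, sizeof(src), csr);
}
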
@@ -76,25 +86,47 @@ int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size
 }
 
 /*
- * This method will try to return existing staging buffer from the container.
- * It's checking only "oldest" allocation.
- * Returns nullptr if no staging buffer available.
+ * This method returns allocator and chunk from staging buffer.
+ * Creates new staging buffer if it failed to allocate chunk from existing buffers.
  */
-void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) {
+std::pair<HeapAllocator *, uint64_t> StagingBufferManager::requestStagingBuffer(size_t &size, CommandStreamReceiver *csr) {
     auto lock = std::lock_guard<std::mutex>(mtx);
-    if (stagingBuffers.empty()) {
-        return nullptr;
-    }
-    void *buffer = nullptr;
-    auto iterator = stagingBuffers.begin();
-    UNRECOVERABLE_IF(iterator == stagingBuffers.end());
-    if (taskCount > iterator->second) {
-        auto allocation = iterator->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
-        buffer = allocation->getUnderlyingBuffer();
-        stagingBuffers.erase(iterator);
+
+    auto [allocator, chunkBuffer] = getExistingBuffer(size);
+    if (chunkBuffer != 0) {
+        return {allocator, chunkBuffer};
     }
-    return buffer;
+
+    clearTrackedChunks(csr);
+
+    auto [retriedAllocator, retriedChunkBuffer] = getExistingBuffer(size);
+    if (retriedChunkBuffer != 0) {
+        return {retriedAllocator, retriedChunkBuffer};
+    }
+
+    StagingBuffer stagingBuffer{allocateStagingBuffer(), chunkSize};
+    allocator = stagingBuffer.getAllocator();
+    chunkBuffer = allocator->allocate(size);
+    stagingBuffers.push_back(std::move(stagingBuffer));
+    return {allocator, chunkBuffer};
 }
+
+/*
+ * This method will try to allocate chunk from existing staging buffer.
+ * Returns allocator and chunk from consumed staging buffer.
+ */
+std::pair<HeapAllocator *, uint64_t> StagingBufferManager::getExistingBuffer(size_t &size) {
+    uint64_t buffer = 0;
+    HeapAllocator *allocator = nullptr;
+    for (auto &stagingBuffer : stagingBuffers) {
+        allocator = stagingBuffer.getAllocator();
+        buffer = allocator->allocate(size);
+        if (buffer != 0) {
+            break;
+        }
+    }
+    return {allocator, buffer};
+}
 
 void *StagingBufferManager::allocateStagingBuffer() {
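
A standalone sketch of the lookup order implemented above: first fit across the
existing buffers, then (in the real code) reclaim finished chunks and retry, and
only then grow the pool. The bump allocator is a simplified stand-in for
HeapAllocator; the base addresses are fake and never dereferenced.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct BumpAllocatorSketch {              // simplified stand-in for HeapAllocator
    uint64_t base;
    size_t capacity;
    size_t used = 0;
    uint64_t allocate(size_t size) {      // 0 means "no space", just like the code above relies on
        if (used + size > capacity) {
            return 0;
        }
        uint64_t chunk = base + used;
        used += size;
        return chunk;
    }
};

constexpr size_t chunkSize = 2 * 1024 * 1024;
std::vector<BumpAllocatorSketch> stagingBuffers;

uint64_t requestChunk(size_t size) {
    for (auto &stagingBuffer : stagingBuffers) {      // 1. first fit across existing buffers
        if (auto chunk = stagingBuffer.allocate(size)) {
            return chunk;
        }
    }
    // 2. the real code calls clearTrackedChunks(csr) here and retries the loop,
    //    so chunks whose GPU work already completed can be reused.
    uint64_t fakeBase = (stagingBuffers.size() + 1) * 0x10000000ull;
    stagingBuffers.push_back({fakeBase, chunkSize});  // 3. otherwise grow the pool
    return stagingBuffers.back().allocate(size);
}

int main() {
    for (int i = 0; i < 5; ++i) {
        std::cout << "chunk at 0x" << std::hex << requestChunk(512 * 1024) << "\n";
    }
    std::cout << std::dec << "staging buffers in pool: " << stagingBuffers.size() << "\n"; // 2, not 5
    return 0;
}
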
@@ -103,12 +135,6 @@ void *StagingBufferManager::allocateStagingBuffer() {
     return hostPtr;
 }
 
-void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) {
-    auto lock = std::lock_guard<std::mutex>(mtx);
-    auto svmData = svmAllocsManager->getSVMAlloc(stagingBuffer);
-    stagingBuffers.push_back({svmData, taskCount});
-}
-
 bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
     auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
     if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
@@ -124,4 +150,15 @@ bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const vo
     return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
 }
 
+void StagingBufferManager::clearTrackedChunks(CommandStreamReceiver *csr) {
+    for (auto iterator = trackers.begin(); iterator != trackers.end();) {
+        if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) {
+            iterator->allocator->free(iterator->chunkAddress, iterator->size);
+            iterator = trackers.erase(iterator);
+        } else {
+            break;
+        }
+    }
+}
+
 } // namespace NEO
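
A standalone sketch of the reclamation rule above, assuming testTaskCountReady
means the tag has reached the tracker's taskCountToWait: trackers are appended
in submission order, so the scan frees leading completed entries and stops at
the first one still in flight (stand-in types; the allocator free call is kept
as a comment only).

#include <cstddef>
#include <cstdint>
#include <vector>

struct TrackedChunkSketch {               // plays the role of StagingBufferTracker
    uint64_t chunkAddress;
    size_t size;
    uint64_t taskCountToWait;
};

// completedTaskCount models what the CSR tag reports as finished.
void reclaimCompleted(std::vector<TrackedChunkSketch> &trackers, uint64_t completedTaskCount) {
    for (auto iterator = trackers.begin(); iterator != trackers.end();) {
        if (iterator->taskCountToWait > completedTaskCount) {
            break;                        // everything from here on may still be used by the GPU
        }
        // the real code returns the chunk to its heap allocator here:
        // iterator->allocator->free(iterator->chunkAddress, iterator->size);
        iterator = trackers.erase(iterator);
    }
}

int main() {
    std::vector<TrackedChunkSketch> trackers{{0x1000, 4096, 1}, {0x2000, 4096, 2}, {0x3000, 4096, 5}};
    reclaimCompleted(trackers, 2);        // frees the first two entries
    return trackers.size() == 1 ? 0 : 1;  // only the taskCountToWait == 5 entry remains
}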


@@ -12,19 +12,39 @@
 #include <functional>
 #include <map>
 #include <memory>
 #include <mutex>
 
 namespace NEO {
 class SVMAllocsManager;
 class CommandStreamReceiver;
 class Device;
 struct SvmAllocationData;
+class HeapAllocator;
 
 using ChunkCopyFunction = std::function<int32_t(void *, void *, const void *, size_t)>;
 
+class StagingBuffer {
+  public:
+    StagingBuffer(void *baseAddress, size_t size);
+    StagingBuffer(StagingBuffer &&other);
+    void *getBaseAddress() const {
+        return baseAddress;
+    }
+    HeapAllocator *getAllocator() const {
+        return allocator.get();
+    }
+
+  private:
+    void *baseAddress;
+    std::unique_ptr<HeapAllocator> allocator;
+};
+
 struct StagingBufferTracker {
-    void *stagingBuffer;
-    uint64_t taskCount;
+    HeapAllocator *allocator;
+    uint64_t chunkAddress;
+    size_t size;
+    uint64_t taskCountToWait;
 };
 
 class StagingBufferManager {
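
Because StagingBuffer owns its HeapAllocator through a std::unique_ptr it is
move-only, which is what the explicit move constructor above enables; a
standalone sketch of the resulting usage pattern with simplified stand-in types.

#include <memory>
#include <vector>

struct AllocatorStub {};                        // stand-in for HeapAllocator

class MoveOnlyStagingBuffer {
  public:
    explicit MoveOnlyStagingBuffer(void *base) : baseAddress(base), allocator(std::make_unique<AllocatorStub>()) {}
    MoveOnlyStagingBuffer(MoveOnlyStagingBuffer &&other) = default; // allocator ownership transfers

  private:
    void *baseAddress;
    std::unique_ptr<AllocatorStub> allocator;
};

int main() {
    std::vector<MoveOnlyStagingBuffer> stagingBuffers;
    MoveOnlyStagingBuffer stagingBuffer{nullptr};
    stagingBuffers.push_back(std::move(stagingBuffer)); // push_back(stagingBuffer) would not compile
    return 0;
}
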
@@ -36,15 +56,17 @@ class StagingBufferManager {
     int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
 
   private:
-    void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex);
+    std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size, CommandStreamReceiver *csr);
+    std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
     void *allocateStagingBuffer();
-    void storeBuffer(void *stagingBuffer, uint64_t taskCount);
+    void clearTrackedChunks(CommandStreamReceiver *csr);
     int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
 
     size_t chunkSize = MemoryConstants::pageSize2M;
-    std::vector<std::pair<SvmAllocationData *, uint64_t>> stagingBuffers;
     std::mutex mtx;
+    std::vector<StagingBuffer> stagingBuffers;
+    std::vector<StagingBufferTracker> trackers;
 
     SVMAllocsManager *svmAllocsManager;
     const RootDeviceIndicesContainer rootDeviceIndices;