From ed7fc9acc91ae79d72da741ea07db5728cdda38e Mon Sep 17 00:00:00 2001 From: "Morek, Szymon" Date: Fri, 5 Jul 2024 11:33:04 +0000 Subject: [PATCH] performance: use staging buffer as a pool Related-To: NEO-11501 Currently whole staging buffer is consumed even if size of the transfer is smaller than that buffer. This commit changes that, so single staging buffer might be utilized by several smaller transfers as long as they don't exceed total size Signed-off-by: Morek, Szymon --- .../utilities/staging_buffer_manager.cpp | 103 ++++++++++++------ .../source/utilities/staging_buffer_manager.h | 36 ++++-- .../staging_buffer_manager_tests.cpp | 45 ++++++++ 3 files changed, 144 insertions(+), 40 deletions(-) diff --git a/shared/source/utilities/staging_buffer_manager.cpp b/shared/source/utilities/staging_buffer_manager.cpp index 9cc3777b76..b921c3dd5a 100644 --- a/shared/source/utilities/staging_buffer_manager.cpp +++ b/shared/source/utilities/staging_buffer_manager.cpp @@ -11,9 +11,18 @@ #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/device/device.h" #include "shared/source/memory_manager/unified_memory_manager.h" +#include "shared/source/utilities/heap_allocator.h" namespace NEO { +StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) { + this->allocator = std::make_unique(castToUint64(baseAddress), size, MemoryConstants::pageSize, 0u); +} + +StagingBuffer::StagingBuffer(StagingBuffer &&other) : baseAddress(other.baseAddress) { + this->allocator.reset(other.allocator.release()); +} + StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) { if (debugManager.flags.StagingBufferSize.get() != -1) { chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte; @@ -22,27 +31,28 @@ StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, c StagingBufferManager::~StagingBufferManager() { for (auto &stagingBuffer : stagingBuffers) { - svmAllocsManager->freeSVMAlloc(stagingBuffer.first->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBuffer()); + svmAllocsManager->freeSVMAlloc(stagingBuffer.getBaseAddress()); } } /* * This method performs 4 steps for single chunk copy - * 1. Get existing staging buffer, if can't - allocate new one, + * 1. Get existing chunk of staging buffer, if can't - allocate new one, * 2. Perform actual copy, - * 3. Store used buffer back to the container (with current task count) - * 4. Update tag to reuse previous buffers within same API call + * 3. Store used buffer to tracking container (with current task count) + * 4. Update tag if required to reuse this buffer in next chunk copies */ int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) { - auto rootDeviceIndex = csr->getRootDeviceIndex(); - auto taskCount = *csr->getTagAddress(); - auto stagingBuffer = getExistingBuffer(taskCount, rootDeviceIndex); - if (stagingBuffer == nullptr) { - stagingBuffer = allocateStagingBuffer(); + auto allocatedSize = size; + auto [allocator, chunkBuffer] = requestStagingBuffer(allocatedSize, csr); + auto ret = chunkCopyFunc(chunkDst, addrToPtr(chunkBuffer), chunkSrc, size); + { + auto lock = std::lock_guard(mtx); + trackers.push_back({allocator, chunkBuffer, allocatedSize, csr->peekTaskCount()}); + } + if (csr->isAnyDirectSubmissionEnabled()) { + csr->flushTagUpdate(); } - auto ret = chunkCopyFunc(chunkDst, stagingBuffer, chunkSrc, size); - storeBuffer(stagingBuffer, csr->peekTaskCount()); - csr->flushTagUpdate(); return ret; } @@ -76,25 +86,47 @@ int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size } /* - * This method will try to return existing staging buffer from the container. - * It's checking only "oldest" allocation. - * Returns nullptr if no staging buffer available. + * This method returns allocator and chunk from staging buffer. + * Creates new staging buffer if it failed to allocate chunk from existing buffers. */ -void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) { +std::pair StagingBufferManager::requestStagingBuffer(size_t &size, CommandStreamReceiver *csr) { auto lock = std::lock_guard(mtx); - if (stagingBuffers.empty()) { - return nullptr; - } - void *buffer = nullptr; - auto iterator = stagingBuffers.begin(); - UNRECOVERABLE_IF(iterator == stagingBuffers.end()); - if (taskCount > iterator->second) { - auto allocation = iterator->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex); - buffer = allocation->getUnderlyingBuffer(); - stagingBuffers.erase(iterator); + auto [allocator, chunkBuffer] = getExistingBuffer(size); + if (chunkBuffer != 0) { + return {allocator, chunkBuffer}; } - return buffer; + + clearTrackedChunks(csr); + + auto [retriedAllocator, retriedChunkBuffer] = getExistingBuffer(size); + if (retriedChunkBuffer != 0) { + return {retriedAllocator, retriedChunkBuffer}; + } + + StagingBuffer stagingBuffer{allocateStagingBuffer(), chunkSize}; + allocator = stagingBuffer.getAllocator(); + chunkBuffer = allocator->allocate(size); + stagingBuffers.push_back(std::move(stagingBuffer)); + return {allocator, chunkBuffer}; +} + +/* + * This method will try to allocate chunk from existing staging buffer. + * Returns allocator and chunk from consumed staging buffer. + */ +std::pair StagingBufferManager::getExistingBuffer(size_t &size) { + uint64_t buffer = 0; + HeapAllocator *allocator = nullptr; + + for (auto &stagingBuffer : stagingBuffers) { + allocator = stagingBuffer.getAllocator(); + buffer = allocator->allocate(size); + if (buffer != 0) { + break; + } + } + return {allocator, buffer}; } void *StagingBufferManager::allocateStagingBuffer() { @@ -103,12 +135,6 @@ void *StagingBufferManager::allocateStagingBuffer() { return hostPtr; } -void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) { - auto lock = std::lock_guard(mtx); - auto svmData = svmAllocsManager->getSVMAlloc(stagingBuffer); - stagingBuffers.push_back({svmData, taskCount}); -} - bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const { auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled(); if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) { @@ -124,4 +150,15 @@ bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const vo return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize); } +void StagingBufferManager::clearTrackedChunks(CommandStreamReceiver *csr) { + for (auto iterator = trackers.begin(); iterator != trackers.end();) { + if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) { + iterator->allocator->free(iterator->chunkAddress, iterator->size); + iterator = trackers.erase(iterator); + } else { + break; + } + } +} + } // namespace NEO diff --git a/shared/source/utilities/staging_buffer_manager.h b/shared/source/utilities/staging_buffer_manager.h index 93d66d075e..5f9fc9729f 100644 --- a/shared/source/utilities/staging_buffer_manager.h +++ b/shared/source/utilities/staging_buffer_manager.h @@ -12,19 +12,39 @@ #include #include +#include #include namespace NEO { class SVMAllocsManager; class CommandStreamReceiver; class Device; -struct SvmAllocationData; +class HeapAllocator; using ChunkCopyFunction = std::function; +class StagingBuffer { + public: + StagingBuffer(void *baseAddress, size_t size); + StagingBuffer(StagingBuffer &&other); + + void *getBaseAddress() const { + return baseAddress; + } + HeapAllocator *getAllocator() const { + return allocator.get(); + } + + private: + void *baseAddress; + std::unique_ptr allocator; +}; + struct StagingBufferTracker { - void *stagingBuffer; - uint64_t taskCount; + HeapAllocator *allocator; + uint64_t chunkAddress; + size_t size; + uint64_t taskCountToWait; }; class StagingBufferManager { @@ -36,15 +56,17 @@ class StagingBufferManager { int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr); private: - void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex); + std::pair requestStagingBuffer(size_t &size, CommandStreamReceiver *csr); + std::pair getExistingBuffer(size_t &size); void *allocateStagingBuffer(); - void storeBuffer(void *stagingBuffer, uint64_t taskCount); + void clearTrackedChunks(CommandStreamReceiver *csr); + int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr); size_t chunkSize = MemoryConstants::pageSize2M; - - std::vector> stagingBuffers; std::mutex mtx; + std::vector stagingBuffers; + std::vector trackers; SVMAllocsManager *svmAllocsManager; const RootDeviceIndicesContainer rootDeviceIndices; diff --git a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp index ba8c5e31e7..4421c76be3 100644 --- a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp +++ b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp @@ -8,8 +8,11 @@ #include "shared/source/utilities/staging_buffer_manager.h" #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" +#include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_svm_manager.h" +#include "shared/test/common/test_macros/hw_test.h" #include "shared/test/common/test_macros/test.h" #include "shared/test/common/test_macros/test_checks_shared.h" @@ -56,6 +59,7 @@ class StagingBufferManagerFixture : public DeviceFixture { chunkCounter++; memcpy(stagingBuffer, chunkSrc, chunkSize); memcpy(chunkDst, stagingBuffer, chunkSize); + reinterpret_cast(csr)->taskCount++; return 0; }; auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs(); @@ -141,6 +145,18 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyThenDont copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8); } +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyButSmallTransfersThenReuseBuffer) { + constexpr size_t numOfChunkCopies = 1; + constexpr size_t totalCopySize = MemoryConstants::pageSize; + constexpr size_t availableTransfersWithinBuffer = stagingBufferSize / totalCopySize; + *csr->getTagAddress() = csr->peekTaskCount(); + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1); + for (auto i = 1u; i < availableTransfersWithinBuffer; i++) { + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 0); + } + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1); +} + TEST_F(StagingBufferManagerTest, givenStagingBufferWhenUpdatedTaskCountThenReuseBuffers) { constexpr size_t numOfChunkCopies = 8; constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies; @@ -227,4 +243,33 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenChangedBufferSizeThenPerf std::map deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}}; stagingBufferManager = std::make_unique(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields); copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1); +} + +HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabledThenFlushTagCalled) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies; + auto ultCsr = reinterpret_cast *>(csr); + ultCsr->directSubmissionAvailable = true; + ultCsr->callFlushTagUpdate = false; + + auto usmBuffer = allocateDeviceBuffer(totalCopySize); + auto nonUsmBuffer = new unsigned char[totalCopySize]; + + size_t flushTagsCalled = 0; + auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + if (ultCsr->flushTagUpdateCalled) { + flushTagsCalled++; + ultCsr->flushTagUpdateCalled = false; + } + reinterpret_cast(csr)->taskCount++; + return 0; + }; + stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr); + if (ultCsr->flushTagUpdateCalled) { + flushTagsCalled++; + } + + EXPECT_EQ(flushTagsCalled, numOfChunkCopies); + svmAllocsManager->freeSVMAlloc(usmBuffer); + delete[] nonUsmBuffer; } \ No newline at end of file