diff --git a/shared/source/utilities/staging_buffer_manager.cpp b/shared/source/utilities/staging_buffer_manager.cpp
index 9cc3777b76..b921c3dd5a 100644
--- a/shared/source/utilities/staging_buffer_manager.cpp
+++ b/shared/source/utilities/staging_buffer_manager.cpp
@@ -11,9 +11,18 @@
 #include "shared/source/debug_settings/debug_settings_manager.h"
 #include "shared/source/device/device.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
+#include "shared/source/utilities/heap_allocator.h"
 
 namespace NEO {
 
+StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
+    this->allocator = std::make_unique<HeapAllocator>(castToUint64(baseAddress), size, MemoryConstants::pageSize, 0u);
+}
+
+StagingBuffer::StagingBuffer(StagingBuffer &&other) : baseAddress(other.baseAddress) {
+    this->allocator.reset(other.allocator.release());
+}
+
 StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
     if (debugManager.flags.StagingBufferSize.get() != -1) {
         chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
@@ -22,27 +31,28 @@ StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, c
 
 StagingBufferManager::~StagingBufferManager() {
     for (auto &stagingBuffer : stagingBuffers) {
-        svmAllocsManager->freeSVMAlloc(stagingBuffer.first->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBuffer());
+        svmAllocsManager->freeSVMAlloc(stagingBuffer.getBaseAddress());
     }
 }
 
 /*
  * This method performs 4 steps for single chunk copy
- * 1. Get existing staging buffer, if can't - allocate new one,
+ * 1. Get existing chunk of staging buffer, if can't - allocate new one,
  * 2. Perform actual copy,
- * 3. Store used buffer back to the container (with current task count)
- * 4. Update tag to reuse previous buffers within same API call
+ * 3. Store used buffer to tracking container (with current task count)
+ * 4. Update tag if required to reuse this buffer in next chunk copies
 */
 int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
-    auto rootDeviceIndex = csr->getRootDeviceIndex();
-    auto taskCount = *csr->getTagAddress();
-    auto stagingBuffer = getExistingBuffer(taskCount, rootDeviceIndex);
-    if (stagingBuffer == nullptr) {
-        stagingBuffer = allocateStagingBuffer();
+    auto allocatedSize = size;
+    auto [allocator, chunkBuffer] = requestStagingBuffer(allocatedSize, csr);
+    auto ret = chunkCopyFunc(chunkDst, addrToPtr(chunkBuffer), chunkSrc, size);
+    {
+        auto lock = std::lock_guard(mtx);
+        trackers.push_back({allocator, chunkBuffer, allocatedSize, csr->peekTaskCount()});
+    }
+    if (csr->isAnyDirectSubmissionEnabled()) {
+        csr->flushTagUpdate();
     }
-    auto ret = chunkCopyFunc(chunkDst, stagingBuffer, chunkSrc, size);
-    storeBuffer(stagingBuffer, csr->peekTaskCount());
-    csr->flushTagUpdate();
     return ret;
 }
 
@@ -76,25 +86,47 @@ int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size
 }
 
 /*
- * This method will try to return existing staging buffer from the container.
- * It's checking only "oldest" allocation.
- * Returns nullptr if no staging buffer available.
+ * This method returns allocator and chunk from staging buffer.
+ * Creates new staging buffer if it failed to allocate chunk from existing buffers.
 */
-void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) {
+std::pair<HeapAllocator *, uint64_t> StagingBufferManager::requestStagingBuffer(size_t &size, CommandStreamReceiver *csr) {
     auto lock = std::lock_guard(mtx);
-    if (stagingBuffers.empty()) {
-        return nullptr;
-    }
-    void *buffer = nullptr;
-    auto iterator = stagingBuffers.begin();
-    UNRECOVERABLE_IF(iterator == stagingBuffers.end());
-    if (taskCount > iterator->second) {
-        auto allocation = iterator->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
-        buffer = allocation->getUnderlyingBuffer();
-        stagingBuffers.erase(iterator);
+    auto [allocator, chunkBuffer] = getExistingBuffer(size);
+    if (chunkBuffer != 0) {
+        return {allocator, chunkBuffer};
     }
-    return buffer;
+
+    clearTrackedChunks(csr);
+
+    auto [retriedAllocator, retriedChunkBuffer] = getExistingBuffer(size);
+    if (retriedChunkBuffer != 0) {
+        return {retriedAllocator, retriedChunkBuffer};
+    }
+
+    StagingBuffer stagingBuffer{allocateStagingBuffer(), chunkSize};
+    allocator = stagingBuffer.getAllocator();
+    chunkBuffer = allocator->allocate(size);
+    stagingBuffers.push_back(std::move(stagingBuffer));
+    return {allocator, chunkBuffer};
+}
+
+/*
+ * This method will try to allocate chunk from existing staging buffer.
+ * Returns allocator and chunk from consumed staging buffer.
+ */
+std::pair<HeapAllocator *, uint64_t> StagingBufferManager::getExistingBuffer(size_t &size) {
+    uint64_t buffer = 0;
+    HeapAllocator *allocator = nullptr;
+
+    for (auto &stagingBuffer : stagingBuffers) {
+        allocator = stagingBuffer.getAllocator();
+        buffer = allocator->allocate(size);
+        if (buffer != 0) {
+            break;
+        }
+    }
+    return {allocator, buffer};
 }
 
 void *StagingBufferManager::allocateStagingBuffer() {
@@ -103,12 +135,6 @@ void *StagingBufferManager::allocateStagingBuffer() {
     return hostPtr;
 }
 
-void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) {
-    auto lock = std::lock_guard(mtx);
-    auto svmData = svmAllocsManager->getSVMAlloc(stagingBuffer);
-    stagingBuffers.push_back({svmData, taskCount});
-}
-
 bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
     auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
    if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
@@ -124,4 +150,15 @@ bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const vo
     return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
 }
 
+void StagingBufferManager::clearTrackedChunks(CommandStreamReceiver *csr) {
+    for (auto iterator = trackers.begin(); iterator != trackers.end();) {
+        if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) {
+            iterator->allocator->free(iterator->chunkAddress, iterator->size);
+            iterator = trackers.erase(iterator);
+        } else {
+            break;
+        }
+    }
+}
+
 } // namespace NEO
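Note on the staging_buffer_manager.cpp changes above: the manager no longer hands out whole staging buffers keyed by a single task count. Each staging buffer now carries a HeapAllocator, performChunkCopy sub-allocates one chunk per transfer, every outstanding chunk is recorded in trackers together with the task count it must wait for, and clearTrackedChunks releases tracked chunks lazily once the CSR tag catches up. The standalone sketch below models that lifecycle; the names (StagingArea, ChunkTracker, requestChunk) are invented for illustration and the HeapAllocator is replaced by a trivial offset allocator, so treat it as a simplified picture of the control flow rather than NEO code.

// ---- illustrative sketch (not part of the patch) ----
#include <cstddef>
#include <cstdint>
#include <deque>
#include <optional>

// One fixed-size staging area carved into chunks by a trivial offset allocator.
struct ChunkTracker {
    size_t offset;
    size_t size;
    uint64_t taskCountToWait; // chunk stays busy until the tag reaches this value
};

class StagingArea {
  public:
    explicit StagingArea(size_t capacity) : capacity(capacity) {}

    // Models requestStagingBuffer(): try to sub-allocate, otherwise recycle
    // finished chunks and retry once. Returns no offset if still exhausted,
    // which is the point where the real manager allocates another
    // chunkSize-sized staging buffer (2 MB by default).
    std::optional<size_t> requestChunk(size_t size, uint64_t currentTag) {
        if (auto offset = allocate(size)) {
            return offset;
        }
        clearCompletedChunks(currentTag);
        return allocate(size);
    }

    // Models the tracker push in performChunkCopy(): remember which task
    // count must complete before this chunk may be reused.
    void trackChunk(size_t offset, size_t size, uint64_t taskCount) {
        trackers.push_back({offset, size, taskCount});
    }

  private:
    std::optional<size_t> allocate(size_t size) {
        if (used + size > capacity) {
            return std::nullopt;
        }
        size_t offset = used;
        used += size;
        return offset;
    }

    // Models clearTrackedChunks(): release chunks in submission order and
    // stop at the first one the GPU has not finished with yet.
    void clearCompletedChunks(uint64_t currentTag) {
        while (!trackers.empty() && trackers.front().taskCountToWait <= currentTag) {
            trackers.pop_front();
        }
        if (trackers.empty()) {
            used = 0; // everything in flight has drained, the whole area is reusable
        }
    }

    size_t capacity;
    size_t used = 0;
    std::deque<ChunkTracker> trackers;
};
// ---- end of sketch ----

With this flow, a copy split into many chunks can keep reusing the same staging buffer as long as earlier chunks retire before the area runs out, which is what the retry inside requestStagingBuffer() exploits.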
diff --git a/shared/source/utilities/staging_buffer_manager.h b/shared/source/utilities/staging_buffer_manager.h
index 93d66d075e..5f9fc9729f 100644
--- a/shared/source/utilities/staging_buffer_manager.h
+++ b/shared/source/utilities/staging_buffer_manager.h
@@ -12,19 +12,39 @@
 
 #include <functional>
 #include <map>
+#include <memory>
 #include <mutex>
 
 namespace NEO {
 class SVMAllocsManager;
 class CommandStreamReceiver;
 class Device;
-struct SvmAllocationData;
+class HeapAllocator;
 
 using ChunkCopyFunction = std::function<int32_t(void *, void *, const void *, size_t)>;
 
+class StagingBuffer {
+  public:
+    StagingBuffer(void *baseAddress, size_t size);
+    StagingBuffer(StagingBuffer &&other);
+
+    void *getBaseAddress() const {
+        return baseAddress;
+    }
+    HeapAllocator *getAllocator() const {
+        return allocator.get();
+    }
+
+  private:
+    void *baseAddress;
+    std::unique_ptr<HeapAllocator> allocator;
+};
+
 struct StagingBufferTracker {
-    void *stagingBuffer;
-    uint64_t taskCount;
+    HeapAllocator *allocator;
+    uint64_t chunkAddress;
+    size_t size;
+    uint64_t taskCountToWait;
 };
 
 class StagingBufferManager {
@@ -36,15 +56,17 @@ class StagingBufferManager {
     int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
 
   private:
-    void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex);
+    std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size, CommandStreamReceiver *csr);
+    std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
     void *allocateStagingBuffer();
-    void storeBuffer(void *stagingBuffer, uint64_t taskCount);
+    void clearTrackedChunks(CommandStreamReceiver *csr);
+
     int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
 
     size_t chunkSize = MemoryConstants::pageSize2M;
-
-    std::vector<std::pair<SvmAllocationData *, uint64_t>> stagingBuffers;
     std::mutex mtx;
+    std::vector<StagingBuffer> stagingBuffers;
+    std::vector<StagingBufferTracker> trackers;
 
     SVMAllocsManager *svmAllocsManager;
     const RootDeviceIndicesContainer rootDeviceIndices;
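Note on the staging_buffer_manager.h changes above: StagingBuffer is a move-only owner of its HeapAllocator (held through std::unique_ptr), which is why it declares a move constructor and why the manager keeps the buffers in a std::vector and inserts them with std::move. StagingBufferTracker now describes a sub-allocated chunk (allocator, chunk address, size) plus the task count to wait for, rather than a whole buffer. A minimal sketch of the same ownership pattern follows; DemoHeapAllocator and DemoStagingBuffer are illustrative stand-ins, not the real NEO types, and the allocator is a simple bump allocator under that assumption.

// ---- illustrative sketch (not part of the patch) ----
#include <cstdint>
#include <memory>
#include <vector>

// Stand-in for a heap allocator: hands out addresses within [base, base + size),
// returning 0 when the range is exhausted (matching the manager's "0 means failure" checks).
class DemoHeapAllocator {
  public:
    DemoHeapAllocator(uint64_t base, uint64_t size) : base(base), end(base + size), cursor(base) {}

    uint64_t allocate(uint64_t size) {
        if (cursor + size > end) {
            return 0;
        }
        auto chunk = cursor;
        cursor += size;
        return chunk;
    }

  private:
    uint64_t base;
    uint64_t end;
    uint64_t cursor;
};

// A move-only owner mirroring the shape of class StagingBuffer: the base address is
// copied, the allocator's ownership is transferred, and the type can therefore live
// in a std::vector and be handed around with std::move.
class DemoStagingBuffer {
  public:
    DemoStagingBuffer(void *baseAddress, uint64_t size)
        : baseAddress(baseAddress),
          allocator(std::make_unique<DemoHeapAllocator>(reinterpret_cast<uintptr_t>(baseAddress), size)) {}

    DemoStagingBuffer(DemoStagingBuffer &&other) noexcept
        : baseAddress(other.baseAddress), allocator(std::move(other.allocator)) {}

    void *getBaseAddress() const { return baseAddress; }
    DemoHeapAllocator *getAllocator() const { return allocator.get(); }

  private:
    void *baseAddress;
    std::unique_ptr<DemoHeapAllocator> allocator;
};

int main() {
    std::vector<DemoStagingBuffer> buffers;
    static unsigned char storage[1u << 16];

    DemoStagingBuffer buffer{storage, sizeof(storage)};
    auto chunk = buffer.getAllocator()->allocate(4096); // non-zero address on success
    buffers.push_back(std::move(buffer));                // ownership moves into the container
    return chunk != 0 ? 0 : 1;
}
// ---- end of sketch ----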
diff --git a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp
index ba8c5e31e7..4421c76be3 100644
--- a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp
+++ b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp
@@ -8,8 +8,11 @@
 #include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/test/common/fixtures/device_fixture.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/libult/ult_command_stream_receiver.h"
+#include "shared/test/common/mocks/mock_command_stream_receiver.h"
 #include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/mocks/mock_svm_manager.h"
+#include "shared/test/common/test_macros/hw_test.h"
 #include "shared/test/common/test_macros/test.h"
 #include "shared/test/common/test_macros/test_checks_shared.h"
 
@@ -56,6 +59,7 @@ class StagingBufferManagerFixture : public DeviceFixture {
             chunkCounter++;
             memcpy(stagingBuffer, chunkSrc, chunkSize);
             memcpy(chunkDst, stagingBuffer, chunkSize);
+            reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
             return 0;
         };
         auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
@@ -141,6 +145,18 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyThenDont
     copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8);
 }
 
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyButSmallTransfersThenReuseBuffer) {
+    constexpr size_t numOfChunkCopies = 1;
+    constexpr size_t totalCopySize = MemoryConstants::pageSize;
+    constexpr size_t availableTransfersWithinBuffer = stagingBufferSize / totalCopySize;
+    *csr->getTagAddress() = csr->peekTaskCount();
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1);
+    for (auto i = 1u; i < availableTransfersWithinBuffer; i++) {
+        copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 0);
+    }
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1);
+}
+
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenUpdatedTaskCountThenReuseBuffers) {
     constexpr size_t numOfChunkCopies = 8;
     constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
@@ -227,4 +243,33 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenChangedBufferSizeThenPerf
     std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
     stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);
     copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1);
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabledThenFlushTagCalled) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    ultCsr->directSubmissionAvailable = true;
+    ultCsr->callFlushTagUpdate = false;
+
+    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
+    auto nonUsmBuffer = new unsigned char[totalCopySize];
+
+    size_t flushTagsCalled = 0;
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+        if (ultCsr->flushTagUpdateCalled) {
+            flushTagsCalled++;
+            ultCsr->flushTagUpdateCalled = false;
+        }
+        reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
+        return 0;
+    };
+    stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
+    if (ultCsr->flushTagUpdateCalled) {
+        flushTagsCalled++;
+    }
+
+    EXPECT_EQ(flushTagsCalled, numOfChunkCopies);
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+    delete[] nonUsmBuffer;
 }
\ No newline at end of file
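Note on the test changes above: the fixture's chunk-copy lambda now bumps the CSR task count per chunk, so successive chunks are tracked with increasing task counts; the new HWTEST expects exactly one flushTagUpdate per chunk while direct submission is enabled; and the small-transfer test expects a single staging buffer to serve availableTransfersWithinBuffer page-sized copies before a second buffer is allocated. Those expectations follow from the chunk-splitting arithmetic in performCopy. The sketch below shows a simplified driver loop of that splitting; it is an assumption-level model, and unlike the real ChunkCopyFunction it does not pass the staging chunk pointer to the callback.

// ---- illustrative sketch (not part of the patch) ----
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>

// Simplified chunked-copy driver: the total transfer is cut into chunkSize
// pieces plus a remainder, and each piece goes through a user-supplied
// callback, which in the real manager is where the staging chunk is filled
// and the GPU copy is submitted.
using ChunkCopyFn = std::function<int32_t(void *dst, const void *src, size_t bytes)>;

inline int32_t copyInChunks(void *dst, const void *src, size_t totalSize, size_t chunkSize, const ChunkCopyFn &chunkCopy) {
    auto dstBytes = static_cast<unsigned char *>(dst);
    auto srcBytes = static_cast<const unsigned char *>(src);
    for (size_t copied = 0; copied < totalSize;) {
        size_t bytes = std::min(chunkSize, totalSize - copied);
        if (auto ret = chunkCopy(dstBytes + copied, srcBytes + copied, bytes)) {
            return ret; // stop on the first non-zero result
        }
        copied += bytes;
    }
    return 0;
}
// ---- end of sketch ----

With totalCopySize equal to numOfChunkCopies buffer-sized pieces, this loop invokes the callback numOfChunkCopies times, which is why the HWTEST expects numOfChunkCopies tag flushes under direct submission.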