performance: use staging buffer when writing to an image

Related-To: NEO-12968

Also, don't import usm/mapped allocations for image
operations

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-10-30 11:04:33 +00:00
committed by Compute-Runtime-Automation
parent 0f2f3c3764
commit cf58be4142
11 changed files with 274 additions and 32 deletions

View File

@@ -10,6 +10,7 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/utilities/heap_allocator.h"
@@ -46,10 +47,7 @@ int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunk
auto allocatedSize = size;
auto [allocator, chunkBuffer] = requestStagingBuffer(allocatedSize, csr);
auto ret = chunkCopyFunc(chunkDst, addrToPtr(chunkBuffer), chunkSrc, size);
{
auto lock = std::lock_guard<std::mutex>(mtx);
trackers.push_back({allocator, chunkBuffer, allocatedSize, csr->peekTaskCount()});
}
trackChunk({allocator, chunkBuffer, allocatedSize, csr->peekTaskCount()});
if (csr->isAnyDirectSubmissionEnabled()) {
csr->flushTagUpdate();
}
@@ -104,10 +102,14 @@ std::pair<HeapAllocator *, uint64_t> StagingBufferManager::requestStagingBuffer(
return {retriedAllocator, retriedChunkBuffer};
}
StagingBuffer stagingBuffer{allocateStagingBuffer(), chunkSize};
allocator = stagingBuffer.getAllocator();
chunkBuffer = allocator->allocate(size);
stagingBuffers.push_back(std::move(stagingBuffer));
auto stagingBufferSize = alignUp(std::max(chunkSize, size), MemoryConstants::pageSize2M);
auto usmHost = allocateStagingBuffer(stagingBufferSize);
if (usmHost != nullptr) {
StagingBuffer stagingBuffer{usmHost, stagingBufferSize};
allocator = stagingBuffer.getAllocator();
chunkBuffer = allocator->allocate(size);
stagingBuffers.push_back(std::move(stagingBuffer));
}
return {allocator, chunkBuffer};
}
@@ -129,13 +131,13 @@ std::pair<HeapAllocator *, uint64_t> StagingBufferManager::getExistingBuffer(siz
return {allocator, buffer};
}
// Requests a host unified memory allocation from svmAllocsManager (memory type
// hostUnifiedMemory, built from the manager's rootDeviceIndices and deviceBitfields)
// and returns the resulting pointer to the caller unchanged.
// NOTE(review): this rendered diff lists both the parameterless form, which requested
// chunkSize bytes, and the form taking an explicit `size`; presumably the latter is the
// added one - confirm against the raw patch.
void *StagingBufferManager::allocateStagingBuffer() {
void *StagingBufferManager::allocateStagingBuffer(size_t size) {
SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
auto hostPtr = svmAllocsManager->createHostUnifiedMemoryAllocation(chunkSize, unifiedMemoryProperties);
auto hostPtr = svmAllocsManager->createHostUnifiedMemoryAllocation(size, unifiedMemoryProperties);
return hostPtr;
}
bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
@@ -150,6 +152,15 @@ bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const vo
return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
}
// Tells whether an image write of `size` bytes may go through a staging buffer.
// Staging is considered only when the EnableCopyWithStagingBuffers debug flag has been
// set explicitly (any value other than -1) to a non-zero value; otherwise it stays off.
// On top of that, the transfer must be non-empty and no larger than
// 32 * MemoryConstants::megaByte. `device` is not consulted by this check.
bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, size_t size) const {
    const auto flagValue = debugManager.flags.EnableCopyWithStagingBuffers.get();
    if (flagValue == -1) {
        // Flag left at its "unset" value: staging for images remains disabled.
        return false;
    }

    const bool enabledByFlag = flagValue;
    if (!enabledByFlag) {
        return false;
    }

    const auto maxStagedImageSize = 32 * MemoryConstants::megaByte;
    return size > 0 && size <= maxStagedImageSize;
}
void StagingBufferManager::clearTrackedChunks(CommandStreamReceiver *csr) {
for (auto iterator = trackers.begin(); iterator != trackers.end();) {
if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) {
@@ -161,4 +172,9 @@ void StagingBufferManager::clearTrackedChunks(CommandStreamReceiver *csr) {
}
}
void StagingBufferManager::trackChunk(const StagingBufferTracker &tracker) {
auto lock = std::lock_guard<std::mutex>(mtx);
trackers.push_back(tracker);
}
} // namespace NEO

View File

@@ -44,10 +44,10 @@ class StagingBuffer {
};
// Bookkeeping record for one staging-buffer chunk.
// The manager fills it as {allocator, chunk address, allocated size, csr->peekTaskCount()}
// when a chunk is tracked, and later passes taskCountToWait to the CSR's
// testTaskCountReady() while walking the tracked chunks.
// NOTE(review): every member appears twice because this rendered diff shows both the bare
// declarations and the variants with default initializers; presumably the initialized
// ones are the added lines - confirm against the raw patch.
struct StagingBufferTracker {
HeapAllocator *allocator;
uint64_t chunkAddress;
size_t size;
uint64_t taskCountToWait;
HeapAllocator *allocator = nullptr;
uint64_t chunkAddress = 0;
size_t size = 0;
uint64_t taskCountToWait = 0;
};
class StagingBufferManager {
@@ -59,13 +59,16 @@ class StagingBufferManager {
StagingBufferManager &operator=(StagingBufferManager &&other) noexcept = delete;
StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
bool isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
bool isValidForStagingWriteImage(const Device &device, size_t size) const;
int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size, CommandStreamReceiver *csr);
void trackChunk(const StagingBufferTracker &tracker);
private:
std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size, CommandStreamReceiver *csr);
std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
void *allocateStagingBuffer();
void *allocateStagingBuffer(size_t size);
void clearTrackedChunks(CommandStreamReceiver *csr);
int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);

View File

@@ -273,3 +273,25 @@ HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabled
svmAllocsManager->freeSVMAlloc(usmBuffer);
delete[] nonUsmBuffer;
}
// Checks isValidForStagingWriteImage() against size limits and the
// EnableCopyWithStagingBuffers debug flag.
// NOTE(review): the first expectation can only pass if the flag already holds a non-zero
// value when the test body starts - presumably set by the fixture; confirm in its SetUp.
HWTEST_F(StagingBufferManagerTest, givenStagingBufferManagerWhenIsValidForStagingWriteImageCalledThenReturnCorrectValue) {
    const auto acceptedSize = MemoryConstants::pageSize2M;
    const auto oversizedSize = MemoryConstants::gigaByte;

    // Size boundaries with the flag as it was on entry: in range, empty, too large.
    EXPECT_TRUE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, acceptedSize));
    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, 0));
    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, oversizedSize));

    // Flag explicitly set to 0: rejected even for an otherwise valid size.
    debugManager.flags.EnableCopyWithStagingBuffers.set(0);
    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, acceptedSize));

    // Flag at -1 (the "unset" value): rejected as well.
    debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, acceptedSize));
}
// Sets the mock memory manager's isMockHostMemoryManager and forceFailureInPrimaryAllocation
// switches, then expects requestStagingBuffer() to report a zero chunk address.
// NOTE(review): presumably those two switches make the host USM allocation fail - confirm
// against MockMemoryManager.
HWTEST_F(StagingBufferManagerTest, givenFailedAllocationWhenRequestStagingBufferCalledThenReturnNullptr) {
    auto *mockMemoryManager = static_cast<MockMemoryManager *>(pDevice->getMemoryManager());
    mockMemoryManager->isMockHostMemoryManager = true;
    mockMemoryManager->forceFailureInPrimaryAllocation = true;

    // requestStagingBuffer takes the size by non-const reference, so it must be an lvalue.
    size_t requestedSize = MemoryConstants::pageSize2M;
    const auto allocatorAndChunk = stagingBufferManager->requestStagingBuffer(requestedSize, csr);

    // Second element of the pair is the chunk address.
    EXPECT_EQ(allocatorAndChunk.second, 0u);
}