Mirror of https://github.com/intel/compute-runtime.git (synced 2025-12-21 09:14:47 +08:00)

commit 6c4eb322b1 (parent f2725f217e), committed by Compute-Runtime-Automation

performance: introduce staging reads from image

Related-To: NEO-12968
Signed-off-by: Szymon Morek <szymon.morek@intel.com>
@@ -13,7 +13,6 @@
 #include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/utilities/heap_allocator.h"
 
 namespace NEO {
 
 StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
@@ -24,6 +23,14 @@ StagingBuffer::StagingBuffer(StagingBuffer &&other) : baseAddress(other.baseAddr
     this->allocator.reset(other.allocator.release());
 }
 
+bool StagingBufferTracker::isReady() const {
+    return csr->testTaskCountReady(csr->getTagAddress(), taskCountToWait);
+}
+
+void StagingBufferTracker::freeChunk() const {
+    allocator->free(chunkAddress, size);
+}
+
 StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
     if (debugManager.flags.StagingBufferSize.get() != -1) {
         chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
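The two helpers added above give StagingBufferTracker a self-contained lifecycle: poll isReady() until the GPU passes the recorded task count, then hand the chunk back with freeChunk(). A minimal standalone sketch of that lifecycle, with hypothetical stand-ins for the CSR and allocator (only the NEO sources define the real types):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for NEO's CommandStreamReceiver and HeapAllocator.
    struct FakeCsr {
        uint64_t completedTagValue = 0; // task count the GPU has signaled so far
        bool testTaskCountReady(uint64_t taskCount) const { return completedTagValue >= taskCount; }
    };
    struct FakeAllocator {
        void free(uint64_t chunkAddress, size_t size) {
            std::printf("freed %zu bytes at 0x%llx\n", size, static_cast<unsigned long long>(chunkAddress));
        }
    };

    struct Tracker {
        FakeAllocator *allocator = nullptr;
        uint64_t chunkAddress = 0;
        size_t size = 0;
        FakeCsr *csr = nullptr;
        uint64_t taskCountToWait = 0;
        bool isReady() const { return csr->testTaskCountReady(taskCountToWait); }
        void freeChunk() const { allocator->free(chunkAddress, size); }
    };

    int main() {
        FakeCsr csr;
        FakeAllocator allocator;
        Tracker tracker{&allocator, 0x1000, 2048, &csr, 3};
        csr.completedTagValue = 3; // the GPU reached the submission's task count
        if (tracker.isReady()) {
            tracker.freeChunk();   // safe: the GPU no longer touches the chunk
        }
        return 0;
    }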
@@ -37,22 +44,45 @@ StagingBufferManager::~StagingBufferManager() {
 }
 
 /*
- * This method performs 4 steps for single chunk transfer
- * 1. Get existing chunk of staging buffer, if can't - allocate new one,
- * 2. Perform actual transfer,
- * 3. Store used buffer to tracking container (with current task count)
- * 4. Update tag if required to reuse this buffer in next chunk copies
+ * This method performs a single chunk transfer. If the transfer is a read operation, it fetches the oldest staging
+ * buffer from the queue; otherwise it allocates or reuses a buffer from the pool.
+ * After the transfer is submitted to the GPU, the used buffer is stored either in the queue (for reads)
+ * or in the tracking container for further reuse.
 */
 template <class Func, class... Args>
-int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &func, Args... args) {
-    auto allocatedSize = size;
-    auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize);
-    auto ret = func(addrToPtr(stagingBuffer), size, args...);
-    trackChunk({allocator, stagingBuffer, allocatedSize, csr, csr->peekTaskCount()});
+StagingTransferStatus StagingBufferManager::performChunkTransfer(bool isRead, void *userPtr, size_t size, StagingQueue &currentStagingBuffers, CommandStreamReceiver *csr, Func &func, Args... args) {
+    StagingTransferStatus result{};
+    StagingBufferTracker tracker{};
+    if (currentStagingBuffers.size() > 1) {
+        if (fetchHead(currentStagingBuffers, tracker) == WaitStatus::gpuHang) {
+            result.waitStatus = WaitStatus::gpuHang;
+            return result;
+        }
+    } else {
+        auto allocatedSize = size;
+        auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize);
+        tracker = StagingBufferTracker{allocator, stagingBuffer, allocatedSize, csr};
+    }
+
+    auto stagingBuffer = addrToPtr(tracker.chunkAddress);
+    if (!isRead) {
+        memcpy(stagingBuffer, userPtr, size);
+    }
+
+    result.chunkCopyStatus = func(stagingBuffer, args...);
+
+    tracker.taskCountToWait = csr->peekTaskCount();
+    if (isRead) {
+        UserDstData dstData{userPtr, size};
+        currentStagingBuffers.push({dstData, tracker});
+    } else {
+        trackChunk(tracker);
+    }
+
     if (csr->isAnyDirectSubmissionEnabled()) {
         csr->flushTagUpdate();
     }
-    return ret;
+    return result;
 }
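For reads, the queue turns chunk transfers into a small pipeline: the first two chunks each get a fresh staging buffer, and from the third chunk on the manager waits for the oldest in-flight read, copies it back to the user, and recycles that buffer. A sketch of just the recycling policy, assuming a plain std::queue (illustrative only, not NEO API):

    #include <cstdint>
    #include <queue>

    struct Chunk {
        uint64_t address;
    };

    // Illustrative policy only: mirror the `currentStagingBuffers.size() > 1`
    // check by recycling the oldest in-flight chunk once two are outstanding.
    Chunk acquireChunk(std::queue<Chunk> &inFlight, uint64_t &nextAddress) {
        if (inFlight.size() > 1) {
            Chunk oldest = inFlight.front(); // real code waits here, then copies back to the user
            inFlight.pop();
            return oldest;                   // reuse its staging memory
        }
        nextAddress += 0x200000;             // otherwise carve a fresh 2MB chunk
        return Chunk{nextAddress};
    }

    int main() {
        std::queue<Chunk> inFlight;
        uint64_t nextAddress = 0;
        for (int i = 0; i < 5; ++i) {
            Chunk chunk = acquireChunk(inFlight, nextAddress);
            inFlight.push(chunk); // "submitted": the GPU now fills this chunk
        }
        return 0;
    }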
@@ -60,38 +90,40 @@ int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, s
 /*
  * Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
  * Caller provides actual function to transfer data for single chunk.
 */
-int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) {
+StagingTransferStatus StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) {
+    StagingQueue stagingQueue;
     auto copiesNum = size / chunkSize;
     auto remainder = size % chunkSize;
 
+    StagingTransferStatus result{};
     for (auto i = 0u; i < copiesNum; i++) {
         auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
         auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
-        auto ret = performChunkTransfer(csr, chunkSize, chunkCopyFunc, chunkDst, chunkSrc);
-        if (ret) {
-            return ret;
+        result = performChunkTransfer(false, const_cast<void *>(chunkSrc), chunkSize, stagingQueue, csr, chunkCopyFunc, chunkDst, chunkSize);
+        if (result.chunkCopyStatus != 0) {
+            return result;
         }
     }
 
     if (remainder != 0) {
         auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
         auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
-        auto ret = performChunkTransfer(csr, remainder, chunkCopyFunc, chunkDst, chunkSrc);
-        if (ret) {
-            return ret;
+        auto result = performChunkTransfer(false, const_cast<void *>(chunkSrc), remainder, stagingQueue, csr, chunkCopyFunc, chunkDst, remainder);
+        if (result.chunkCopyStatus != 0) {
+            return result;
         }
     }
-    return 0;
+    return result;
 }
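The chunk split in performCopy is plain integer arithmetic over the 2MB default chunk. A quick self-contained check (sizes are illustrative):

    #include <cassert>
    #include <cstddef>

    int main() {
        constexpr size_t chunkSize = 2 * 1024 * 1024; // default staging chunk (2MB)
        size_t size = 5 * chunkSize + 1234;           // hypothetical total copy size
        size_t copiesNum = size / chunkSize;          // full chunks handled by the loop
        size_t remainder = size % chunkSize;          // trailing partial chunk
        assert(copiesNum == 5);
        assert(remainder == 1234);
        return 0;
    }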
 /*
- * This method orchestrates write operation for images with given origin and region.
+ * This method orchestrates a transfer operation for images with given origin and region.
  * Transfer is split into chunks; each chunk represents a sub-region to transfer.
  * Each chunk contains a staging buffer which should be used instead of non-usm memory during transfers on GPU.
  * Several rows are packed into a single chunk unless the size of a single row exceeds the maximum chunk size (2MB).
- * Caller provides actual function to enqueue write operation for single chunk.
+ * Caller provides the actual function to enqueue a read/write operation for a single chunk.
 */
-int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr) {
+StagingTransferStatus StagingBufferManager::performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead) {
+    StagingQueue stagingQueue;
     size_t origin[3] = {};
     size_t region[3] = {};
     origin[0] = globalOrigin[0];

@@ -102,15 +134,16 @@ int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *g
     rowsPerChunk = std::min<size_t>(rowsPerChunk, globalRegion[1]);
     auto numOfChunks = globalRegion[1] / rowsPerChunk;
     auto remainder = globalRegion[1] % (rowsPerChunk * numOfChunks);
+    StagingTransferStatus result{};
 
     for (auto i = 0u; i < numOfChunks; i++) {
         origin[1] = globalOrigin[1] + i * rowsPerChunk;
         region[1] = rowsPerChunk;
         auto size = region[1] * rowPitch;
         auto chunkPtr = ptrOffset(ptr, i * rowsPerChunk * rowPitch);
-        auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region);
-        if (ret) {
-            return ret;
+        result = performChunkTransfer(isRead, const_cast<void *>(chunkPtr), size, stagingQueue, csr, chunkTransferImageFunc, origin, region);
+        if (result.chunkCopyStatus != 0 || result.waitStatus == WaitStatus::gpuHang) {
+            return result;
         }
     }
 
@@ -119,12 +152,50 @@ int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *g
         region[1] = remainder;
         auto size = region[1] * rowPitch;
         auto chunkPtr = ptrOffset(ptr, numOfChunks * rowsPerChunk * rowPitch);
-        auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region);
-        if (ret) {
-            return ret;
+        result = performChunkTransfer(isRead, const_cast<void *>(chunkPtr), size, stagingQueue, csr, chunkTransferImageFunc, origin, region);
+        if (result.chunkCopyStatus != 0 || result.waitStatus == WaitStatus::gpuHang) {
+            return result;
         }
     }
-    return 0;
+
+    result.waitStatus = drainAndReleaseStagingQueue(stagingQueue);
+    return result;
 }
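Row packing keeps each image chunk at or below the staging buffer size: rowsPerChunk is stagingBufferSize / rowPitch clamped to [1, regionHeight], with one remainder pass for leftover rows. A sketch of that arithmetic using the same numbers as the "WithRemainder" tests below (2MB staging buffer, 1MB row pitch, 7 rows):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    int main() {
        constexpr size_t stagingBufferSize = 2 * 1024 * 1024;            // 2MB chunk
        size_t rowPitch = 1024 * 1024;                                   // 1MB per row
        size_t regionHeight = 7;                                         // rows to transfer
        auto rowsPerChunk = std::max<size_t>(1, stagingBufferSize / rowPitch);
        rowsPerChunk = std::min(rowsPerChunk, regionHeight);             // 2 rows per chunk
        auto numOfChunks = regionHeight / rowsPerChunk;                  // 3 full chunks
        auto remainder = regionHeight % (rowsPerChunk * numOfChunks);    // 1 leftover row
        assert(rowsPerChunk == 2 && numOfChunks == 3 && remainder == 1); // 4 transfers total
        return 0;
    }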
+/*
+ * This method is used for read transfers. It waits for the oldest transfer to finish
+ * and copies the data associated with that transfer to the host allocation.
+ * The returned tracker contains a staging buffer ready for reuse.
+ */
+WaitStatus StagingBufferManager::fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const {
+    auto &head = stagingQueue.front();
+    auto status = head.second.csr->waitForTaskCount(head.second.taskCountToWait);
+    if (status == WaitStatus::gpuHang) {
+        return status;
+    }
+
+    auto &userData = head.first;
+    tracker = head.second;
+    auto stagingBuffer = addrToPtr(tracker.chunkAddress);
+    memcpy(userData.ptr, stagingBuffer, userData.size);
+    stagingQueue.pop();
+    return WaitStatus::ready;
+}
+
+/*
+ * Waits for all pending transfers to finish.
+ * Releases staging buffers back to the pool for reuse.
+ */
+WaitStatus StagingBufferManager::drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const {
+    StagingBufferTracker tracker{};
+    while (!stagingQueue.empty()) {
+        auto status = fetchHead(stagingQueue, tracker);
+        if (status == WaitStatus::gpuHang) {
+            return status;
+        }
+        tracker.freeChunk();
+    }
+    return WaitStatus::ready;
+}
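Taken together, a read flows as: submit a chunk and push (user destination, tracker) onto the queue; once two are in flight, complete the oldest via fetchHead; finally drain whatever remains. A compact host-side model of that flow, with synchronous stand-ins for the GPU work (illustrative, not NEO API):

    #include <cstring>
    #include <queue>
    #include <vector>

    struct Entry {
        std::vector<char> staging; // stands in for the staging chunk the GPU filled
        char *userDst;             // where the data must finally land
    };

    // Complete the oldest read: copy staging contents out to the user buffer.
    void fetchHead(std::queue<Entry> &queue) {
        Entry &head = queue.front(); // real code waits on the CSR task count here
        std::memcpy(head.userDst, head.staging.data(), head.staging.size());
        queue.pop();
    }

    int main() {
        char user[4 * 16] = {};
        std::queue<Entry> queue;
        for (int i = 0; i < 4; ++i) {
            queue.push(Entry{std::vector<char>(16, char('a' + i)), user + i * 16});
            if (queue.size() > 1) {
                fetchHead(queue);    // recycle once two reads are in flight
            }
        }
        while (!queue.empty()) {
            fetchHead(queue);        // analogue of drainAndReleaseStagingQueue
        }
        return 0;
    }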
@@ -196,7 +267,7 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co
     return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
 }
 
-bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const {
+bool StagingBufferManager::isValidForStagingTransferImage(const Device &device, const void *ptr, bool hasDependencies) const {
     auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
     if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
         stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();

@@ -207,9 +278,8 @@ bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, con
 
 void StagingBufferManager::clearTrackedChunks() {
     for (auto iterator = trackers.begin(); iterator != trackers.end();) {
-        auto csr = iterator->csr;
-        if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) {
-            iterator->allocator->free(iterator->chunkAddress, iterator->size);
+        if (iterator->isReady()) {
+            iterator->freeChunk();
             iterator = trackers.erase(iterator);
         } else {
             break;
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include "shared/source/command_stream/wait_status.h"
 #include "shared/source/helpers/constants.h"
 #include "shared/source/utilities/stackvec.h"

@@ -14,6 +15,7 @@
 #include <map>
 #include <memory>
 #include <mutex>
+#include <queue>
 
 namespace NEO {
 class SVMAllocsManager;

@@ -21,8 +23,8 @@ class CommandStreamReceiver;
 class Device;
 class HeapAllocator;
 
-using ChunkCopyFunction = std::function<int32_t(void *, size_t, void *, const void *)>;
-using ChunkWriteImageFunc = std::function<int32_t(void *, size_t, const void *, const size_t *, const size_t *)>;
+using ChunkCopyFunction = std::function<int32_t(void *, void *, size_t)>;
+using ChunkTransferImageFunc = std::function<int32_t(void *, const size_t *, const size_t *)>;
 
 class StagingBuffer {
 public:
@@ -50,8 +52,23 @@ struct StagingBufferTracker {
     size_t size = 0;
     CommandStreamReceiver *csr = nullptr;
     uint64_t taskCountToWait = 0;
+
+    bool isReady() const;
+    void freeChunk() const;
 };
 
+struct UserDstData {
+    void *ptr;
+    size_t size;
+};
+
+struct StagingTransferStatus {
+    int32_t chunkCopyStatus = 0; // status from L0/OCL chunk copy
+    WaitStatus waitStatus = WaitStatus::ready;
+};
+
+using StagingQueue = std::queue<std::pair<UserDstData, StagingBufferTracker>>;
+
 class StagingBufferManager {
 public:
     StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields);
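StagingTransferStatus carries two independent failure channels: the int32_t returned by the L0/OCL chunk enqueue and the WaitStatus from waiting on prior chunks, so callers have to check both. A hedged sketch of that checking idiom (the error constants are placeholders, not real API values):

    #include <cstdint>

    enum class WaitStatus { notReady, ready, gpuHang };

    struct StagingTransferStatus {
        int32_t chunkCopyStatus = 0;               // status from the chunk enqueue
        WaitStatus waitStatus = WaitStatus::ready; // status from waiting on prior chunks
    };

    // Placeholder error code for the sketch; real values are API-specific.
    constexpr int32_t errorDeviceLost = -7;

    int32_t toApiError(const StagingTransferStatus &status) {
        if (status.waitStatus == WaitStatus::gpuHang) {
            return errorDeviceLost;    // a hang while waiting trumps the copy status
        }
        return status.chunkCopyStatus; // otherwise propagate the enqueue result
    }

    int main() {
        StagingTransferStatus ok{};
        return toApiError(ok); // 0 on the happy path
    }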
@@ -62,10 +79,10 @@ class StagingBufferManager {
     StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
 
     bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
-    bool isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const;
+    bool isValidForStagingTransferImage(const Device &device, const void *ptr, bool hasDependencies) const;
 
-    int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
-    int32_t performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr);
+    StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
+    StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead);
 
     std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size);
     void trackChunk(const StagingBufferTracker &tracker);

@@ -76,7 +93,10 @@
     void clearTrackedChunks();
 
     template <class Func, class... Args>
-    int32_t performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &chunkCopyFunc, Args... args);
+    StagingTransferStatus performChunkTransfer(bool isRead, void *userPtr, size_t size, StagingQueue &currentStagingBuffers, CommandStreamReceiver *csr, Func &func, Args... args);
+
+    WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const;
+    WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const;
 
     size_t chunkSize = MemoryConstants::pageSize2M;
     std::mutex mtx;
@@ -323,6 +323,13 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
         return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle);
     }
 
+    WaitStatus waitForTaskCount(TaskCountType requiredTaskCount) override {
+        if (waitForTaskCountReturnValue.has_value()) {
+            return *waitForTaskCountReturnValue;
+        }
+        return BaseClass::waitForTaskCount(requiredTaskCount);
+    }
+
     void overrideCsrSizeReqFlags(CsrSizeRequestFlags &flags) { this->csrSizeRequestFlags = flags; }
     GraphicsAllocation *getPreemptionAllocation() const { return this->preemptionAllocation; }

@@ -585,6 +592,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
     uint32_t createAllocationForHostSurfaceCalled = 0;
     WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::ready;
     std::optional<WaitStatus> waitForTaskCountWithKmdNotifyFallbackReturnValue{};
+    std::optional<WaitStatus> waitForTaskCountReturnValue{};
    std::optional<SubmissionStatus> flushReturnValue{};
     CommandStreamReceiverType commandStreamReceiverType = CommandStreamReceiverType::hardware;
     std::atomic<uint32_t> downloadAllocationsCalledCount = 0;
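The ULT receiver override above lets a test force the next wait result (for example WaitStatus::gpuHang) via an std::optional member, falling back to the real implementation when unset. The same idiom, stripped of the NEO class hierarchy:

    #include <optional>

    enum class WaitStatus { notReady, ready, gpuHang };

    struct BaseCsr {
        virtual ~BaseCsr() = default;
        virtual WaitStatus waitForTaskCount(unsigned taskCount) { return WaitStatus::ready; }
    };

    // Test double: return the injected value when set, otherwise defer to the base.
    struct TestCsr : BaseCsr {
        std::optional<WaitStatus> waitForTaskCountReturnValue{};
        WaitStatus waitForTaskCount(unsigned taskCount) override {
            if (waitForTaskCountReturnValue.has_value()) {
                return *waitForTaskCountReturnValue;
            }
            return BaseCsr::waitForTaskCount(taskCount);
        }
    };

    int main() {
        TestCsr csr;
        csr.waitForTaskCountReturnValue = WaitStatus::gpuHang; // force a hang for this test
        return csr.waitForTaskCount(1) == WaitStatus::gpuHang ? 0 : 1;
    }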
@@ -55,10 +55,9 @@ class StagingBufferManagerFixture : public DeviceFixture {
         memset(usmBuffer, 0, copySize);
         memset(nonUsmBuffer, 0xFF, copySize);
 
-        ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+        ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
             chunkCounter++;
-            memcpy(stagingBuffer, chunkSrc, chunkSize);
-            memcpy(chunkDst, stagingBuffer, chunkSize);
+            memcpy(chunkDst, chunkSrc, chunkSize);
             reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
             return 0;
         };
@@ -66,7 +65,8 @@
         auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, copySize, chunkCopy, csr);
         auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, ret.chunkCopyStatus);
+        EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
         EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer, copySize));
         EXPECT_EQ(expectedChunks, chunkCounter);
         EXPECT_EQ(expectedAllocations, newUsmAllocations);

@@ -74,17 +74,23 @@
         delete[] nonUsmBuffer;
     }
 
-    void imageWriteThroughStagingBuffers(size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) {
-        auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+    void imageTransferThroughStagingBuffers(bool isRead, size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) {
+        auto hostPtr = new unsigned char[stagingBufferSize * expectedChunks];
+        auto imageData = new unsigned char[stagingBufferSize * expectedChunks];
+        if (isRead) {
+            memset(hostPtr, 0, stagingBufferSize * expectedChunks);
+            memset(imageData, 0xFF, stagingBufferSize * expectedChunks);
+        } else {
+            memset(hostPtr, 0xFF, stagingBufferSize * expectedChunks);
+            memset(imageData, 0, stagingBufferSize * expectedChunks);
+        }
         size_t chunkCounter = 0;
         size_t expectedOrigin = globalOrigin[1];
         auto expectedRowsPerChunk = std::min<size_t>(std::max<size_t>(1ul, stagingBufferSize / rowPitch), globalRegion[1]);
         auto numOfChunks = globalRegion[1] / expectedRowsPerChunk;
         auto remainder = globalRegion[1] % (expectedRowsPerChunk * numOfChunks);
-        ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+        ChunkTransferImageFunc chunkTransfer = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
             EXPECT_NE(nullptr, stagingBuffer);
-            EXPECT_NE(nullptr, chunkPtr);
             EXPECT_NE(nullptr, origin);
             EXPECT_NE(nullptr, region);

@@ -97,19 +103,33 @@
             } else {
                 EXPECT_EQ(expectedRowsPerChunk, region[1]);
             }
+            auto offset = origin[1] - globalOrigin[1];
+            if (isRead) {
+                memcpy(stagingBuffer, imageData + rowPitch * offset, rowPitch * region[1]);
+            } else {
+                memcpy(imageData + rowPitch * offset, stagingBuffer, rowPitch * region[1]);
+            }
             expectedOrigin += region[1];
             chunkCounter++;
             reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
             return 0;
         };
         auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
-        auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, rowPitch, chunkWrite, csr);
+        auto ret = stagingBufferManager->performImageTransfer(hostPtr, globalOrigin, globalRegion, rowPitch, chunkTransfer, csr, isRead);
         auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, memcmp(hostPtr, imageData, rowPitch * (numOfChunks * expectedRowsPerChunk + remainder)));
+        EXPECT_EQ(0, ret.chunkCopyStatus);
+        EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
         EXPECT_EQ(expectedChunks, chunkCounter);
-        EXPECT_EQ(1u, newUsmAllocations);
-        delete[] ptr;
+
+        auto expectedNewUsmAllocations = 1u;
+        if (isRead) {
+            expectedNewUsmAllocations = 2u;
+        }
+        EXPECT_EQ(expectedNewUsmAllocations, newUsmAllocations);
+        delete[] hostPtr;
+        delete[] imageData;
     }
 
     constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2;
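The allocation expectations in this helper follow from the queue gating: a write recycles a single pooled chunk, while a read keeps up to two chunks in flight before reusing the oldest, so writes expect one new USM allocation and reads two. A toy model of that reuse policy (assuming the same size() > 1 gate):

    #include <queue>

    // Count distinct staging allocations when at most two reads are in flight.
    int allocationsForReads(int chunks) {
        std::queue<int> inFlight;
        int allocations = 0;
        for (int i = 0; i < chunks; ++i) {
            if (inFlight.size() > 1) {
                inFlight.pop(); // reuse the oldest chunk instead of allocating
            } else {
                ++allocations;  // fresh chunk from the staging pool
            }
            inFlight.push(i);
        }
        return allocations;     // 2 for any transfer of two or more chunks
    }

    int main() {
        return allocationsForReads(8) == 2 ? 0 : 1;
    }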
@@ -178,16 +198,16 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForImageWrite
         {nonUsmBuffer, true, false},
     };
     for (auto i = 0; i < 4; i++) {
-        auto actualValid = stagingBufferManager->isValidForStagingWriteImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
+        auto actualValid = stagingBufferManager->isValidForStagingTransferImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
         EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
     }
 
     debugManager.flags.EnableCopyWithStagingBuffers.set(0);
-    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false));
+    EXPECT_FALSE(stagingBufferManager->isValidForStagingTransferImage(*pDevice, nonUsmBuffer, false));
 
     debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
     auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
-    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false));
+    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingTransferImage(*pDevice, nonUsmBuffer, false));
     svmAllocsManager->freeSVMAlloc(usmBuffer);
 }
@@ -256,17 +276,17 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyR
     memset(usmBuffer, 0, totalCopySize);
     memset(nonUsmBuffer, 0xFF, totalCopySize);
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         chunkCounter++;
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        memcpy(chunkDst, stagingBuffer, chunkSize);
+        memcpy(chunkDst, chunkSrc, chunkSize);
         return expectedErrorCode;
     };
     auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
     auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
     auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer, totalCopySize));
     EXPECT_EQ(1u, chunkCounter);
     EXPECT_EQ(1u, newUsmAllocations);
@@ -286,10 +306,9 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenRe
     memset(usmBuffer, 0, totalCopySize);
     memset(nonUsmBuffer, 0xFF, totalCopySize);
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         chunkCounter++;
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        memcpy(chunkDst, stagingBuffer, chunkSize);
+        memcpy(chunkDst, chunkSrc, chunkSize);
         if (chunkCounter <= numOfChunkCopies) {
             return 0;
         } else {

@@ -300,7 +319,8 @@
     auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
     auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(numOfChunkCopies + 1, chunkCounter);
     EXPECT_EQ(1u, newUsmAllocations);
     svmAllocsManager->freeSVMAlloc(usmBuffer);
@@ -331,7 +351,7 @@ HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabled
     auto nonUsmBuffer = new unsigned char[totalCopySize];
 
     size_t flushTagsCalled = 0;
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         if (ultCsr->flushTagUpdateCalled) {
             flushTagsCalled++;
             ultCsr->flushTagUpdateCalled = false;
@@ -362,28 +382,121 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteThenWhol
     size_t expectedChunks = 8;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, expectedChunks, 1};
-    imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithOriginThenWholeRegionCovered) {
     size_t expectedChunks = 8;
     const size_t globalOrigin[3] = {4, 4, 0};
     const size_t globalRegion[3] = {4, expectedChunks, 1};
-    imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithMultipleRowsPerChunkThenWholeRegionCovered) {
     size_t expectedChunks = 4;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, 8, 1};
-    imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithRemainderThenWholeRegionCovered) {
     size_t expectedChunks = 4;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, 7, 1};
-    imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
 }
 
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadThenWholeRegionCovered) {
+    size_t expectedChunks = 8;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, expectedChunks, 1};
+    imageTransferThroughStagingBuffers(true, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithOriginThenWholeRegionCovered) {
+    size_t expectedChunks = 8;
+    const size_t globalOrigin[3] = {4, 4, 0};
+    const size_t globalRegion[3] = {4, expectedChunks, 1};
+    imageTransferThroughStagingBuffers(true, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithMultipleRowsPerChunkThenWholeRegionCovered) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    imageTransferThroughStagingBuffers(true, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithRemainderThenWholeRegionCovered) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 7, 1};
+    imageTransferThroughStagingBuffers(true, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangDuringChunkReadFromImageThenReturnImmediatelyWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    size_t chunkCounter = 0;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        return 0;
+    };
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(2u, chunkCounter);
+    delete[] ptr;
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangAfterChunkReadFromImageThenReturnWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    size_t chunkCounter = 0;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        if (chunkCounter == expectedChunks) {
+            ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+        }
+        return 0;
+    };
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(4u, chunkCounter);
+    delete[] ptr;
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangDuringRemainderChunkReadFromImageThenReturnImmediatelyWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 7, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    size_t chunkCounter = 0;
+    size_t remainderCounter = 4;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        if (chunkCounter == remainderCounter - 1) {
+            ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+        }
+        return 0;
+    };
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(remainderCounter - 1, chunkCounter);
+    delete[] ptr;
+}
+
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThenEarlyReturnWithFailure) {
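A note on the chunk counts asserted in the three GPU-hang tests above: performChunkTransfer only waits (via fetchHead) once more than one read is in flight, so a hang injected up front surfaces on the third chunk, after two transfer callbacks have run (hence 2u). A hang injected after the last of four chunks surfaces only in the final drain, so all four callbacks run (hence 4u); and in the remainder variant, the hang armed after the third full chunk is hit by the remainder chunk's fetchHead before its callback runs (hence remainderCounter - 1).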
@@ -394,13 +507,13 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThen
     auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
 
     size_t chunkCounter = 0;
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
         ++chunkCounter;
         return expectedErrorCode;
     };
 
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr);
-    EXPECT_EQ(expectedErrorCode, ret);
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, false);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(1u, chunkCounter);
     delete[] ptr;
 }
@@ -414,16 +527,16 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteWith
 
     size_t chunkCounter = 0;
     size_t remainderCounter = 4;
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
         ++chunkCounter;
         if (chunkCounter == remainderCounter) {
             return expectedErrorCode;
         }
         return 0;
     };
 
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr);
-    EXPECT_EQ(expectedErrorCode, ret);
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, false);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(remainderCounter, chunkCounter);
     delete[] ptr;
 }
Block a user