performance: non-usm copy through staging buffers

Related-To: NEO-11501

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek 2024-06-24 17:01:44 +00:00 committed by Compute-Runtime-Automation
parent 659075ffe5
commit 29e3eb512c
21 changed files with 738 additions and 12 deletions

View File

@ -20,6 +20,7 @@
#include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/device_factory.h"
#include "shared/source/utilities/buffer_pool_allocator.inl" #include "shared/source/utilities/buffer_pool_allocator.inl"
#include "shared/source/utilities/heap_allocator.h" #include "shared/source/utilities/heap_allocator.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "opencl/source/accelerators/intel_motion_estimation.h" #include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/api/additional_extensions.h" #include "opencl/source/api/additional_extensions.h"
@ -4913,14 +4914,19 @@ cl_int CL_API_CALL clEnqueueSVMMemcpy(cl_command_queue commandQueue,
} }
if (size != 0) { if (size != 0) {
retVal = pCommandQueue->enqueueSVMMemcpy( auto stagingBufferManager = pCommandQueue->getContext().getStagingBufferManager();
blockingCopy, if (stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, numEventsInWaitList)) {
dstPtr, retVal = pCommandQueue->enqueueStagingBufferMemcpy(blockingCopy, dstPtr, srcPtr, size, event);
srcPtr, } else {
size, retVal = pCommandQueue->enqueueSVMMemcpy(
numEventsInWaitList, blockingCopy,
eventWaitList, dstPtr,
event); srcPtr,
size,
numEventsInWaitList,
eventWaitList,
event);
}
} else { } else {
retVal = pCommandQueue->enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event); retVal = pCommandQueue->enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
} }

View File

@ -29,6 +29,7 @@
#include "shared/source/os_interface/os_context.h" #include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/product_helper.h" #include "shared/source/os_interface/product_helper.h"
#include "shared/source/utilities/api_intercept.h" #include "shared/source/utilities/api_intercept.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/source/utilities/tag_allocator.h" #include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h" #include "opencl/source/built_ins/builtins_dispatch_builder.h"
@ -1504,4 +1505,56 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() {
} }
} }
/*
 * Copies `size` bytes from srcPtr to dstPtr through staging buffers.
 * The transfer is split into chunks by StagingBufferManager::performCopy(); for every chunk the
 * source data is first copied on the CPU into a staging buffer and then a non-blocking
 * enqueueSVMMemcpy() from the staging buffer to the destination chunk is enqueued.
 *
 * blockingCopy - when set, finish() is called after all chunks were enqueued.
 * event        - optional output event. For in-order queues it is returned by the last chunk
 *                enqueue; for out-of-order queues it is returned by a barrier enqueued after all chunks.
 *
 * Returns CL_SUCCESS or the first error reported by a chunk enqueue, the barrier or finish().
 */
cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
    csrSelectionArgs.direction = TransferDirection::hostToLocal;
    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);

    // Local event used only to collect timestamps that are later copied into the returned event.
    Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady};
    if (isProfilingEnabled()) {
        profilingEvent.setQueueTimeStamp();
    }

    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
        auto isFirstTransfer = (chunkDst == dstPtr);
        auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
        if (isFirstTransfer && isProfilingEnabled()) {
            profilingEvent.setSubmitTimeStamp();
        }
        memcpy(stagingBuffer, chunkSrc, chunkSize);
        if (isFirstTransfer && isProfilingEnabled()) {
            profilingEvent.setStartTimeStamp();
        }

        // Only the last chunk of an in-order queue hands the caller's event to the enqueue.
        cl_event *outEvent = nullptr;
        if (isLastTransfer && !this->isOOQEnabled()) {
            outEvent = event;
        }
        auto ret = this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent);
        return ret;
    };

    auto stagingBufferManager = this->context->getStagingBufferManager();
    auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr);
    if (ret != CL_SUCCESS) {
        return ret;
    }

    if (event != nullptr) {
        if (this->isOOQEnabled()) {
            ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
            if (ret != CL_SUCCESS) {
                // *event was not produced - do not dereference it and do not hide the error behind finish().
                return ret;
            }
        }
        if (isProfilingEnabled()) {
            auto pEvent = castToObjectOrAbort<Event>(*event);
            pEvent->copyTimestamps(profilingEvent);
            pEvent->setCPUProfilingPath(false);
        }
    }

    if (blockingCopy) {
        ret = this->finish();
    }
    return ret;
}
} // namespace NEO } // namespace NEO

View File

@ -388,6 +388,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; } bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; }
bool isBcs() const { return isCopyOnly; }; bool isBcs() const { return isCopyOnly; };
cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event);
protected: protected:
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest); cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);

View File

@ -23,6 +23,7 @@
#include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/utilities/buffer_pool_allocator.inl" #include "shared/source/utilities/buffer_pool_allocator.inl"
#include "shared/source/utilities/heap_allocator.h" #include "shared/source/utilities/heap_allocator.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/source/utilities/tag_allocator.h" #include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/cl_device/cl_device.h" #include "opencl/source/cl_device/cl_device.h"
@ -74,6 +75,7 @@ Context::~Context() {
} }
} }
if (svmAllocsManager) { if (svmAllocsManager) {
this->stagingBufferManager.reset();
svmAllocsManager->trimUSMDeviceAllocCache(); svmAllocsManager->trimUSMDeviceAllocCache();
delete svmAllocsManager; delete svmAllocsManager;
} }
@ -281,6 +283,7 @@ bool Context::createImpl(const cl_context_properties *properties,
this->svmAllocsManager = new SVMAllocsManager(this->memoryManager, this->svmAllocsManager = new SVMAllocsManager(this->memoryManager,
this->areMultiStorageAllocationsPreferred()); this->areMultiStorageAllocationsPreferred());
this->svmAllocsManager->initUsmAllocationsCaches(device->getDevice()); this->svmAllocsManager->initUsmAllocationsCaches(device->getDevice());
this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager, rootDeviceIndices, deviceBitfields);
} }
} }
@ -676,4 +679,8 @@ bool Context::checkIfContextIsNonZebin() const {
return this->nonZebinContext; return this->nonZebinContext;
} }
// Returns a non-owning pointer to the context's staging buffer manager (nullptr if none is held).
StagingBufferManager *Context::getStagingBufferManager() const {
    auto *manager = stagingBufferManager.get();
    return manager;
}
} // namespace NEO } // namespace NEO

View File

@ -40,6 +40,7 @@ class SVMAllocsManager;
class Program; class Program;
class Platform; class Platform;
class TagAllocatorBase; class TagAllocatorBase;
class StagingBufferManager;
template <> template <>
struct OpenCLObjectMapper<_cl_context> { struct OpenCLObjectMapper<_cl_context> {
@ -256,6 +257,8 @@ class Context : public BaseObject<_cl_context> {
void initializeUsmAllocationPools(); void initializeUsmAllocationPools();
void cleanupUsmAllocationPools(); void cleanupUsmAllocationPools();
StagingBufferManager *getStagingBufferManager() const;
protected: protected:
struct BuiltInKernel { struct BuiltInKernel {
const char *pSource = nullptr; const char *pSource = nullptr;
@ -300,6 +303,8 @@ class Context : public BaseObject<_cl_context> {
std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator; std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator;
std::mutex multiRootDeviceAllocatorMtx; std::mutex multiRootDeviceAllocatorMtx;
std::unique_ptr<StagingBufferManager> stagingBufferManager;
bool interopUserSync = false; bool interopUserSync = false;
bool resolvesRequiredInKernels = false; bool resolvesRequiredInKernels = false;
bool nonZebinContext = false; bool nonZebinContext = false;

View File

@ -397,6 +397,10 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
auto &device = this->cmdQueue->getDevice(); auto &device = this->cmdQueue->getDevice();
auto &gfxCoreHelper = device.getGfxCoreHelper(); auto &gfxCoreHelper = device.getGfxCoreHelper();
auto resolution = device.getDeviceInfo().profilingTimerResolution; auto resolution = device.getDeviceInfo().profilingTimerResolution;
if (timestampsCopied) {
// Adjust startTS since we calculate profiling based on other event timestamps
contextStartTS = startTimeStamp.gpuTimeStamp;
}
// Calculate startTimestamp only if it was not already set on CPU // Calculate startTimestamp only if it was not already set on CPU
if (startTimeStamp.cpuTimeInNs == 0) { if (startTimeStamp.cpuTimeInNs == 0) {

View File

@ -312,6 +312,14 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS); static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
void copyTimestamps(const Event &srcEvent) {
this->queueTimeStamp = srcEvent.queueTimeStamp;
this->submitTimeStamp = srcEvent.submitTimeStamp;
this->startTimeStamp = srcEvent.startTimeStamp;
this->endTimeStamp = srcEvent.endTimeStamp;
timestampsCopied = true;
}
protected: protected:
Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType, Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType,
TaskCountType taskLevel, TaskCountType taskCount); TaskCountType taskLevel, TaskCountType taskCount);
@ -383,6 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
bool profilingEnabled = false; bool profilingEnabled = false;
bool profilingCpuPath = false; bool profilingCpuPath = false;
bool dataCalculated = false; bool dataCalculated = false;
bool timestampsCopied = false;
ProfilingInfo queueTimeStamp{}; ProfilingInfo queueTimeStamp{};
ProfilingInfo submitTimeStamp{}; ProfilingInfo submitTimeStamp{};

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2018-2023 Intel Corporation * Copyright (C) 2018-2024 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@ -231,4 +231,29 @@ TEST_F(ClEnqueueSVMMemcpyTests, GivenDeviceNotSupportingSvmWhenEnqueuingSVMMemcp
EXPECT_EQ(CL_INVALID_OPERATION, retVal); EXPECT_EQ(CL_INVALID_OPERATION, retVal);
} }
TEST_F(ClEnqueueSVMMemcpyTests, givenCopyValidForStagingBuffersCopyThenTransferSuccesfull) {
    DebugManagerStateRestore restorer;
    // Enable copies through staging buffers via the debug flag.
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);

    if (pDevice->getDeviceInfo().svmCapabilities == 0) {
        return;
    }

    void *svmDestination = clSVMAlloc(pContext, CL_MEM_READ_WRITE, 256, 4);
    EXPECT_NE(nullptr, svmDestination);
    // Plain heap memory acts as the non-USM source of the copy.
    auto hostSource = new unsigned char[256];

    const auto status = clEnqueueSVMMemcpy(
        pCommandQueue,  // cl_command_queue command_queue
        CL_FALSE,       // cl_bool blocking_copy
        svmDestination, // void *dst_ptr
        hostSource,     // const void *src_ptr
        256,            // size_t size
        0,              // cl_uint num_events_in_wait_list
        nullptr,        // const cl_event *event_wait_list
        nullptr         // cl_event *event
    );
    EXPECT_EQ(CL_SUCCESS, status);

    clSVMFree(pContext, svmDestination);
    delete[] hostSource;
}
} // namespace ULT } // namespace ULT

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2018-2023 Intel Corporation * Copyright (C) 2018-2024 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@ -13,6 +13,8 @@
#include "shared/source/memory_manager/surface.h" #include "shared/source/memory_manager/surface.h"
#include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/device_factory.h"
#include "shared/source/utilities/hw_timestamps.h"
#include "shared/source/utilities/tag_allocator.h"
#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/libult/ult_command_stream_receiver.h"
@ -2369,3 +2371,190 @@ HWTEST_F(EnqueueSvmTest, givenCopyFromMappedPtrToMappedPtrWhenCallingSvmMemcpyTh
EXPECT_EQ(2u, csr.createAllocationForHostSurfaceCalled); EXPECT_EQ(2u, csr.createAllocationForHostSurfaceCalled);
} }
} }
// Fixture providing a device USM destination and a plain host source buffer, both of copySize bytes.
struct StagingBufferTest : public EnqueueSvmTest {
    void SetUp() override {
        REQUIRE_SVM_OR_SKIP(defaultHwInfo);
        EnqueueSvmTest::SetUp();
        SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 1, context->getRootDeviceIndices(), context->getDeviceBitfields());
        unifiedMemoryProperties.device = pDevice;
        svmManager = this->context->getSVMAllocsManager();
        dstPtr = svmManager->createUnifiedMemoryAllocation(copySize, unifiedMemoryProperties);
        srcPtr = new unsigned char[copySize];
    }

    void TearDown() override {
        // Mirrors the skip in SetUp: nothing was created when SVM is not supported.
        if (defaultHwInfo->capabilityTable.ftrSvm == false) {
            return;
        }
        svmManager = this->context->getSVMAllocsManager();
        svmManager->freeSVMAlloc(dstPtr);
        delete[] srcPtr;
        EnqueueSvmTest::TearDown();
    }

    static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2;
    static constexpr size_t copySize = stagingBufferSize * 4;
    static constexpr size_t expectedNumOfCopies = copySize / stagingBufferSize;

    // Default-initialized so the members hold defined values even when SetUp is skipped.
    SVMAllocsManager *svmManager = nullptr;
    void *dstPtr = nullptr;
    unsigned char *srcPtr = nullptr;
};
HWTEST_F(StagingBufferTest, givenInOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) {
    constexpr cl_command_type expectedLastCmd = CL_COMMAND_SVM_MEMCPY;

    cl_event event = nullptr;
    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
    auto initialUsmAllocs = svmManager->getNumAllocs();
    retVal = myCmdQ.enqueueStagingBufferMemcpy(
        false,    // cl_bool blocking_copy
        dstPtr,   // void *dst_ptr
        srcPtr,   // const void *src_ptr
        copySize, // size_t size
        &event    // cl_event *event
    );
    // The event is dereferenced and released below, so a failed enqueue must stop the test here.
    ASSERT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, event);
    auto pEvent = static_cast<Event *>(event);
    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;

    EXPECT_EQ(1u, numOfStagingBuffers);
    EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount);
    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
    // In-order queue: the returned event comes from the last chunk copy.
    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());

    clReleaseEvent(event);
}
HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) {
    constexpr cl_command_type expectedLastCmd = CL_COMMAND_BARRIER;

    cl_event event = nullptr;
    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
    myCmdQ.setOoqEnabled();
    auto initialUsmAllocs = svmManager->getNumAllocs();
    retVal = myCmdQ.enqueueStagingBufferMemcpy(
        false,    // cl_bool blocking_copy
        dstPtr,   // void *dst_ptr
        srcPtr,   // const void *src_ptr
        copySize, // size_t size
        &event    // cl_event *event
    );
    // The event is dereferenced and released below, so a failed enqueue must stop the test here.
    ASSERT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, event);
    auto pEvent = static_cast<Event *>(event);
    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;

    EXPECT_EQ(1u, numOfStagingBuffers);
    EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount);
    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
    // Out-of-order queue: the returned event comes from the barrier enqueued after all chunks.
    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());

    clReleaseEvent(event);
}
HWTEST_F(StagingBufferTest, givenEnqueueStagingBufferMemcpyWhenTaskCountNotReadyThenCopySucessfullAndBuffersNotReused) {
    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
    const auto allocsBeforeCopy = svmManager->getNumAllocs();

    auto &gpgpuCsr = pCmdQ->getGpgpuCommandStreamReceiver();
    // Pin the tag to the current task count before enqueueing the chunks.
    *gpgpuCsr.getTagAddress() = gpgpuCsr.peekTaskCount();

    retVal = myCmdQ.enqueueStagingBufferMemcpy(
        false,    // cl_bool blocking_copy
        dstPtr,   // void *dst_ptr
        srcPtr,   // const void *src_ptr
        copySize, // size_t size
        nullptr   // cl_event *event
    );
    const auto allocatedStagingBuffers = svmManager->getNumAllocs() - allocsBeforeCopy;

    EXPECT_EQ(CL_SUCCESS, retVal);
    // One staging buffer per chunk is expected, i.e. none of them was reused.
    EXPECT_EQ(expectedNumOfCopies, allocatedStagingBuffers);

    *gpgpuCsr.getTagAddress() = gpgpuCsr.peekTaskCount();
}
HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferMemcpyBlockingThenCopySucessfullAndFinishCalled) {
    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
    const auto allocsBeforeCopy = svmManager->getNumAllocs();

    // Issues one blocking staging copy of the whole fixture buffer.
    auto enqueueBlockingCopy = [&]() {
        return myCmdQ.enqueueStagingBufferMemcpy(
            true,     // cl_bool blocking_copy
            dstPtr,   // void *dst_ptr
            srcPtr,   // const void *src_ptr
            copySize, // size_t size
            nullptr   // cl_event *event
        );
    };

    retVal = enqueueBlockingCopy();
    auto stagingBuffersInUse = svmManager->getNumAllocs() - allocsBeforeCopy;
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_EQ(1u, stagingBuffersInUse);
    EXPECT_EQ(1u, myCmdQ.finishCalledCount);

    // A second blocking copy is expected to reuse the same staging buffer and call finish again.
    retVal = enqueueBlockingCopy();
    stagingBuffersInUse = svmManager->getNumAllocs() - allocsBeforeCopy;
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_EQ(1u, stagingBuffersInUse);
    EXPECT_EQ(2u, myCmdQ.finishCalledCount);
}
HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferWithInvalidBufferThenReturnFailure) {
    // Only the destination has to be invalid; the fixture's srcPtr already provides copySize bytes
    // of host memory, so no additional allocation (and no shadowing of fixture members) is needed.
    void *invalidDstPtr = nullptr;
    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
    retVal = myCmdQ.enqueueStagingBufferMemcpy(
        false,         // cl_bool blocking_copy
        invalidDstPtr, // void *dst_ptr
        srcPtr,        // const void *src_ptr
        copySize,      // size_t size
        nullptr        // cl_event *event
    );
    EXPECT_EQ(CL_INVALID_VALUE, retVal);
}
HWTEST_F(StagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingBufferMemcpyThenTimestampsSetCorrectly) {
    cl_event event = nullptr;
    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
    myCmdQ.setProfilingEnabled();
    retVal = myCmdQ.enqueueStagingBufferMemcpy(
        false,    // cl_bool blocking_copy
        dstPtr,   // void *dst_ptr
        srcPtr,   // const void *src_ptr
        copySize, // size_t size
        &event    // cl_event *event
    );
    // The event is dereferenced and released below, so a failed enqueue must stop the test here.
    ASSERT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, event);
    auto pEvent = static_cast<Event *>(event);

    // A small adjustment to give end timestamp a valid value instead of mocked value
    TimeStampData tsData{};
    pClDevice->getDevice().getOSTime()->getGpuCpuTime(&tsData);
    if (pEvent->getTimestampPacketNodes()) {
        auto node = pEvent->getTimestampPacketNodes()->peekNodes()[0];
        auto contextEnd = ptrOffset(node->getCpuBase(), node->getGlobalEndOffset());
        *reinterpret_cast<typename FamilyType::TimestampPacketType *>(contextEnd) = static_cast<typename FamilyType::TimestampPacketType>(tsData.gpuTimeStamp);
    } else {
        HwTimeStamps *timeStamps = static_cast<TagNode<HwTimeStamps> *>(pEvent->getHwTimeStampNode())->tagForCpuAccess;
        timeStamps->contextEndTS = tsData.gpuTimeStamp;
        timeStamps->globalEndTS = tsData.gpuTimeStamp;
    }

    EXPECT_FALSE(pEvent->isCPUProfilingPath());
    EXPECT_TRUE(pEvent->isProfilingEnabled());

    // Zero-initialized so the ordering checks never read indeterminate values.
    uint64_t queue = 0;
    uint64_t submit = 0;
    uint64_t start = 0;
    uint64_t end = 0;
    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queue, 0);
    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0);
    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, 0);
    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, 0);

    // Timestamps have to be monotonic: queued <= submit <= start <= end.
    EXPECT_GE(submit, queue);
    EXPECT_GE(start, submit);
    EXPECT_GE(end, start);

    clReleaseEvent(event);
}

View File

@ -460,6 +460,17 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
return BaseClass::enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event); return BaseClass::enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
} }
// Counts the call and forwards it unchanged to the base class implementation.
cl_int enqueueSVMMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size,
                        cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override {
    ++enqueueSVMMemcpyCalledCount;
    const auto status = BaseClass::enqueueSVMMemcpy(blockingCopy, dstPtr, srcPtr, size, numEventsInWaitList, eventWaitList, event);
    return status;
}
// Counts the call and forwards it to the base class implementation.
cl_int finish() override {
    ++finishCalledCount;
    const auto status = BaseClass::finish();
    return status;
}
unsigned int lastCommandType; unsigned int lastCommandType;
std::vector<Kernel *> lastEnqueuedKernels; std::vector<Kernel *> lastEnqueuedKernels;
MultiDispatchInfo storedMultiDispatchInfo; MultiDispatchInfo storedMultiDispatchInfo;
@ -490,7 +501,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
std::optional<WaitStatus> waitUntilCompleteReturnValue{}; std::optional<WaitStatus> waitUntilCompleteReturnValue{};
int waitForAllEnginesCalledCount{0}; int waitForAllEnginesCalledCount{0};
int enqueueMarkerWithWaitListCalledCount{0}; int enqueueMarkerWithWaitListCalledCount{0};
size_t enqueueSVMMemcpyCalledCount{0};
size_t finishCalledCount{0};
LinearStream *peekCommandStream() { LinearStream *peekCommandStream() {
return this->commandStream; return this->commandStream;
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2018-2023 Intel Corporation * Copyright (C) 2018-2024 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@ -14,6 +14,7 @@
#include "shared/source/memory_manager/deferred_deleter.h" #include "shared/source/memory_manager/deferred_deleter.h"
#include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/os_context.h" #include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h" #include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/mocks/mock_svm_manager.h" #include "shared/test/common/mocks/mock_svm_manager.h"
@ -123,6 +124,7 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp
} }
deviceBitfields.insert({rootDeviceIndex, deviceBitfield}); deviceBitfields.insert({rootDeviceIndex, deviceBitfield});
} }
stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager, rootDeviceIndices, deviceBitfields);
cl_int retVal; cl_int retVal;
if (!noSpecialQueue) { if (!noSpecialQueue) {

View File

@ -387,6 +387,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, SkipDcFlushOnBarrierWithoutEvents, -1, "-1: defa
DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB") DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB") DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers") DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers")
DECLARE_DEBUG_VARIABLE(int32_t, EnableCopyWithStagingBuffers, -1, "Enable copy with non-usm memory through staging buffers. -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, StagingBufferSize, -1, "Size of single staging buffer. -1: default (2MB), >0: size in KB")
/*DIRECT SUBMISSION FLAGS*/ /*DIRECT SUBMISSION FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD") DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")

View File

@ -237,6 +237,7 @@ class ProductHelper {
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
virtual bool isAvailableExtendedScratch() const = 0; virtual bool isAvailableExtendedScratch() const = 0;
virtual std::optional<bool> isCoherentAllocation(uint64_t patIndex) const = 0; virtual std::optional<bool> isCoherentAllocation(uint64_t patIndex) const = 0;
virtual bool isStagingBuffersEnabled() const = 0;
virtual ~ProductHelper() = default; virtual ~ProductHelper() = default;

View File

@ -897,4 +897,10 @@ template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::isAvailableExtendedScratch() const { bool ProductHelperHw<gfxProduct>::isAvailableExtendedScratch() const {
return false; return false;
} }
// Generic implementation: copies through staging buffers are reported as disabled.
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::isStagingBuffersEnabled() const {
return false;
}
} // namespace NEO } // namespace NEO

View File

@ -182,6 +182,7 @@ class ProductHelperHw : public ProductHelper {
size_t getMaxFillPaternSizeForCopyEngine() const override; size_t getMaxFillPaternSizeForCopyEngine() const override;
bool isAvailableExtendedScratch() const override; bool isAvailableExtendedScratch() const override;
std::optional<bool> isCoherentAllocation(uint64_t patIndex) const override; std::optional<bool> isCoherentAllocation(uint64_t patIndex) const override;
bool isStagingBuffersEnabled() const override;
~ProductHelperHw() override = default; ~ProductHelperHw() override = default;

View File

@ -49,6 +49,8 @@ set(NEO_CORE_UTILITIES
${CMAKE_CURRENT_SOURCE_DIR}/wait_util.h ${CMAKE_CURRENT_SOURCE_DIR}/wait_util.h
${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.h ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.h
${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.cpp
${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.h
) )
set(NEO_CORE_UTILITIES_WINDOWS set(NEO_CORE_UTILITIES_WINDOWS

View File

@ -0,0 +1,123 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
namespace NEO {
/*
 * Stores the SVM manager and device description used for later staging buffer allocations.
 * A positive StagingBufferSize debug flag overrides the default chunk size (value given in KB).
 * Non-positive values keep the default, so chunkSize can never become zero
 * (performCopy divides the transfer into chunkSize pieces).
 */
StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
    const auto sizeInKb = debugManager.flags.StagingBufferSize.get();
    if (sizeInKb > 0) {
        chunkSize = static_cast<size_t>(sizeInKb) * MemoryConstants::kiloByte;
    }
}
// Releases every staging buffer that is still tracked by the manager.
StagingBufferManager::~StagingBufferManager() {
    for (const auto &trackedBuffer : stagingBuffers) {
        auto *defaultAllocation = trackedBuffer.first->gpuAllocations.getDefaultGraphicsAllocation();
        svmAllocsManager->freeSVMAlloc(defaultAllocation->getUnderlyingBuffer());
    }
}
/*
 * Copies a single chunk in 4 steps:
 * 1. Take a reusable staging buffer from the container or, if none is available, allocate a new one,
 * 2. Run the caller-provided copy function with that staging buffer,
 * 3. Put the buffer back into the container together with the current task count,
 * 4. Flush a tag update so earlier buffers can be reused within the same API call.
 * Returns the value reported by the copy function.
 */
int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
    const auto deviceIndex = csr->getRootDeviceIndex();
    const auto tagValue = *csr->getTagAddress();

    void *buffer = getExistingBuffer(tagValue, deviceIndex);
    if (!buffer) {
        // NOTE(review): the result of allocateStagingBuffer() is not checked here - confirm allocation failure handling.
        buffer = allocateStagingBuffer();
    }

    const auto copyStatus = chunkCopyFunc(chunkDst, buffer, chunkSrc, size);
    // The buffer is stored regardless of the copy status so it stays owned by the manager.
    storeBuffer(buffer, csr->peekTaskCount());
    csr->flushTagUpdate();
    return copyStatus;
}
/*
* This method copies data between non-USM and USM allocations by splitting transfers into chunks.
* Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
* Caller provides actual function to transfer data for single chunk.
*/
int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
auto copiesNum = size / chunkSize;
auto remainder = size % chunkSize;
for (auto i = 0u; i < copiesNum; i++) {
auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
auto ret = performChunkCopy(chunkDst, chunkSrc, chunkSize, chunkCopyFunc, csr);
if (ret) {
return ret;
}
}
if (remainder != 0) {
auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
auto ret = performChunkCopy(chunkDst, chunkSrc, remainder, chunkCopyFunc, csr);
if (ret) {
return ret;
}
}
return 0;
}
/*
 * Tries to hand out an already allocated staging buffer.
 * Only the first ("oldest") entry of the container is inspected: it is removed and returned
 * when the given task count is greater than the task count stored with it.
 * Returns nullptr if the container is empty or the oldest entry cannot be reused yet.
 */
void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) {
    std::lock_guard<std::mutex> guard(mtx);
    if (stagingBuffers.empty()) {
        return nullptr;
    }

    const auto oldest = stagingBuffers.begin();
    if (!(taskCount > oldest->second)) {
        return nullptr;
    }

    auto graphicsAllocation = oldest->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
    void *reusableBuffer = graphicsAllocation->getUnderlyingBuffer();
    stagingBuffers.erase(oldest);
    return reusableBuffer;
}
void *StagingBufferManager::allocateStagingBuffer() {
SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
auto hostPtr = svmAllocsManager->createHostUnifiedMemoryAllocation(chunkSize, unifiedMemoryProperties);
return hostPtr;
}
// Appends the staging buffer's SVM allocation data together with the given task count to the container.
void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) {
    std::lock_guard<std::mutex> guard(mtx);
    auto *allocationData = svmAllocsManager->getSVMAlloc(stagingBuffer);
    stagingBuffers.emplace_back(allocationData, taskCount);
}
/*
 * Decides whether a copy can go through staging buffers. All of the following must hold:
 * - staging copies are enabled (product default, overridable with the EnableCopyWithStagingBuffers debug flag),
 * - the copy has no dependencies,
 * - dstPtr is known to the SVM manager while srcPtr is not.
 * The cheap checks come first so the SVM lookups are skipped when the answer is already known.
 */
bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const {
    bool stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
    const auto debugOverride = debugManager.flags.EnableCopyWithStagingBuffers.get();
    if (debugOverride != -1) {
        stagingCopyEnabled = (debugOverride != 0);
    }
    if (!stagingCopyEnabled || hasDependencies) {
        return false;
    }

    if (svmAllocsManager->getSVMAlloc(dstPtr) == nullptr) {
        return false;
    }
    return svmAllocsManager->getSVMAlloc(srcPtr) == nullptr;
}
} // namespace NEO

View File

@ -0,0 +1,54 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/constants.h"
#include "shared/source/utilities/stackvec.h"
#include <functional>
#include <map>
#include <mutex>
namespace NEO {
class SVMAllocsManager;
class CommandStreamReceiver;
class Device;
struct SvmAllocationData;
// Callback copying a single chunk; invoked as (chunkDst, stagingBuffer, chunkSrc, chunkSize), returns 0 on success.
using ChunkCopyFunction = std::function<int32_t(void *, void *, const void *, size_t)>;
// Pairs a staging buffer pointer with a task count.
// NOTE(review): not referenced by StagingBufferManager in this header - confirm whether it is still needed.
struct StagingBufferTracker {
void *stagingBuffer;
uint64_t taskCount;
};
// Owns host unified memory staging buffers and drives chunked copies that go through them.
class StagingBufferManager {
public:
StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields);
// Frees all staging buffers that are still tracked.
~StagingBufferManager();
// Returns true when staging copies are enabled, the copy has no dependencies,
// dstPtr is an allocation known to the SVM manager and srcPtr is not.
bool isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const;
// Splits the copy into chunks of at most chunkSize bytes and calls chunkCopyFunc for each of them.
// Returns 0 on success or the first non-zero value returned by chunkCopyFunc.
int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
private:
// Returns the oldest tracked buffer if taskCount is greater than its stored task count, nullptr otherwise.
void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex);
// Allocates a new host unified memory buffer of chunkSize bytes.
void *allocateStagingBuffer();
// Appends the buffer to the tracked container together with the given task count.
void storeBuffer(void *stagingBuffer, uint64_t taskCount);
// Copies one chunk: obtains a staging buffer, calls chunkCopyFunc, stores the buffer and flushes a tag update.
int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
// Size of a single staging buffer / chunk; can be overridden with the StagingBufferSize debug flag (in KB).
size_t chunkSize = MemoryConstants::pageSize2M;
// Tracked buffers as (SVM allocation data, task count at store time); new entries are appended at the back.
std::vector<std::pair<SvmAllocationData *, uint64_t>> stagingBuffers;
// Taken by getExistingBuffer() and storeBuffer() while they access stagingBuffers.
std::mutex mtx;
SVMAllocsManager *svmAllocsManager;
const RootDeviceIndicesContainer rootDeviceIndices;
const std::map<uint32_t, DeviceBitfield> deviceBitfields;
};
} // namespace NEO

View File

@ -608,4 +608,6 @@ EnableReusingGpuTimestamps = -1
ForceCopyOperationOffloadForComputeCmdList = -1 ForceCopyOperationOffloadForComputeCmdList = -1
SecondaryContextEngineTypeMask = -1 SecondaryContextEngineTypeMask = -1
DisableSupportForL0Debugger=0 DisableSupportForL0Debugger=0
EnableCopyWithStagingBuffers = -1
StagingBufferSize = -1
# Please don't edit below this line # Please don't edit below this line

View File

@ -28,6 +28,7 @@ target_sources(neo_shared_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/wait_util_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/wait_util_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager_tests.cpp
) )
add_subdirectories() add_subdirectories()

View File

@ -0,0 +1,220 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_svm_manager.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "gtest/gtest.h"
using namespace NEO;
// Fixture that builds a StagingBufferManager on top of a mock SVM allocations manager
// and the device provided by DeviceFixture.
class StagingBufferManagerFixture : public DeviceFixture {
  public:
    void setUp() {
        DeviceFixture::setUp();
        REQUIRE_SVM_OR_SKIP(&hardwareInfo);
        this->svmAllocsManager = std::make_unique<MockSVMAllocsManager>(pDevice->getMemoryManager(), false);
        // Force the staging path on so the tests do not depend on the product default.
        debugManager.flags.EnableCopyWithStagingBuffers.set(1);
        RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
        std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
        this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);
        this->csr = pDevice->commandStreamReceivers[0].get();
    }

    void tearDown() {
        // The manager is released before the SVM allocations manager it points to.
        stagingBufferManager.reset();
        svmAllocsManager.reset();
        DeviceFixture::tearDown();
    }

    // Returns an allocation of `size` bytes whose properties describe device unified memory
    // but which is created through createHostUnifiedMemoryAllocation.
    // NOTE(review): presumably done so the tests can memset/memcmp it from the CPU — confirm.
    void *allocateDeviceBuffer(size_t size) {
        RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
        std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
        SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
        unifiedMemoryProperties.device = pDevice;
        return svmAllocsManager->createHostUnifiedMemoryAllocation(size, unifiedMemoryProperties);
    }

    // Copies `copySize` bytes from a plain host buffer into a USM buffer via performCopy and
    // verifies the status, the copied data, the number of chunk callbacks (`expectedChunks`)
    // and the number of USM allocations created during the copy (`expectedAllocations`).
    void copyThroughStagingBuffers(size_t copySize, size_t expectedChunks, size_t expectedAllocations) {
        auto usmBuffer = allocateDeviceBuffer(copySize);
        ASSERT_NE(nullptr, usmBuffer);
        // Owned by unique_ptr so the host buffer is released on every exit path.
        auto nonUsmBuffer = std::make_unique<unsigned char[]>(copySize);
        size_t chunkCounter = 0;
        memset(usmBuffer, 0, copySize);
        memset(nonUsmBuffer.get(), 0xFF, copySize);

        // Stands in for the real chunk copy: bounce the data through the staging buffer on the CPU.
        auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
            chunkCounter++;
            memcpy(stagingBuffer, chunkSrc, chunkSize);
            memcpy(chunkDst, stagingBuffer, chunkSize);
            return 0;
        };

        auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
        auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer.get(), copySize, chunkCopy, csr);
        auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;

        EXPECT_EQ(0, ret);
        EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer.get(), copySize));
        EXPECT_EQ(expectedChunks, chunkCounter);
        EXPECT_EQ(expectedAllocations, newUsmAllocations);
        svmAllocsManager->freeSVMAlloc(usmBuffer);
    }

    // Chunk size the tests assume when the StagingBufferSize debug flag is left untouched.
    constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2;
    DebugManagerStateRestore restorer;
    std::unique_ptr<MockSVMAllocsManager> svmAllocsManager;
    std::unique_ptr<StagingBufferManager> stagingBufferManager;
    // Non-owning; set in setUp and left null when setUp bails out early.
    CommandStreamReceiver *csr = nullptr;
};
using StagingBufferManagerTest = Test<StagingBufferManagerFixture>;
// Exercises isValidForCopy for each USM/non-USM source and destination combination, with and
// without dependencies, and then for the 0 and -1 settings of EnableCopyWithStagingBuffers.
TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForCopyThenReturnTrue) {
    constexpr size_t bufferSize = 1024;
    auto usmBuffer = allocateDeviceBuffer(bufferSize);
    unsigned char nonUsmBuffer[bufferSize];

    struct CopyCase {
        void *dstPtr;
        void *srcPtr;
        bool hasDependencies;
        bool expectValid;
    };
    const CopyCase copyCases[] = {
        {usmBuffer, nonUsmBuffer, false, true},     // nonUsm -> usm without dependencies
        {usmBuffer, nonUsmBuffer, true, false},     // nonUsm -> usm with dependencies
        {nonUsmBuffer, nonUsmBuffer, false, false}, // nonUsm -> nonUsm without dependencies
        {usmBuffer, usmBuffer, false, false},       // usm -> usm without dependencies
        {nonUsmBuffer, usmBuffer, false, false}     // usm -> nonUsm without dependencies
    };
    for (const auto &copyCase : copyCases) {
        const auto actualValid = stagingBufferManager->isValidForCopy(*pDevice, copyCase.dstPtr, copyCase.srcPtr, copyCase.hasDependencies);
        EXPECT_EQ(actualValid, copyCase.expectValid);
    }

    // Flag set to 0: the staging path is rejected even for an otherwise valid copy.
    debugManager.flags.EnableCopyWithStagingBuffers.set(0);
    EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false));

    // Flag set to -1: the result follows the product helper.
    debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
    auto enabledByProduct = pDevice->getProductHelper().isStagingBuffersEnabled();
    EXPECT_EQ(enabledByProduct, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false));

    svmAllocsManager->freeSVMAlloc(usmBuffer);
}
// Eight full chunks plus a 1024-byte tail: expects nine chunk copies and one new USM allocation.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyThenCopyData) {
    constexpr size_t fullChunks = 8;
    constexpr size_t tailSize = 1024;
    constexpr size_t copySize = stagingBufferSize * fullChunks + tailSize;
    constexpr size_t expectedChunkCopies = fullChunks + 1;
    copyThroughStagingBuffers(copySize, expectedChunkCopies, 1);
}
// Copy size is an exact multiple of the chunk size: expects exactly eight chunk copies
// (no extra call for a remainder) and one new USM allocation.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyWithoutRemainderThenNoRemainderCalled) {
    constexpr size_t fullChunks = 8;
    constexpr size_t copySize = stagingBufferSize * fullChunks;
    copyThroughStagingBuffers(copySize, fullChunks, 1);
}
// With the tag set equal to the current task count, every chunk is expected to get its own
// staging buffer: eight chunk copies, eight new USM allocations.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyThenDontReuseBuffers) {
    constexpr size_t fullChunks = 8;
    constexpr size_t copySize = stagingBufferSize * fullChunks;
    const auto currentTaskCount = csr->peekTaskCount();
    *csr->getTagAddress() = currentTaskCount;
    copyThroughStagingBuffers(copySize, fullChunks, 8);
}
// First copy runs with the tag equal to the task count and is expected to allocate eight
// staging buffers; after the tag is moved past the task count, a second copy is expected
// to reuse them and allocate nothing new.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenUpdatedTaskCountThenReuseBuffers) {
    constexpr size_t fullChunks = 8;
    constexpr size_t copySize = stagingBufferSize * fullChunks;

    const auto initialTaskCount = csr->peekTaskCount();
    *csr->getTagAddress() = initialTaskCount;
    copyThroughStagingBuffers(copySize, fullChunks, 8);

    const auto advancedTag = csr->peekTaskCount() + fullChunks;
    *csr->getTagAddress() = advancedTag;
    copyThroughStagingBuffers(copySize, fullChunks, 0);

    // Only the staging buffers from the first copy remain registered.
    EXPECT_EQ(fullChunks, svmAllocsManager->svmAllocs.getNumAllocs());
}
// The chunk callback fails on its first invocation: performCopy must return that error after
// a single chunk, leaving the destination only partially written, with one staging buffer allocated.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyReturnWithFailure) {
    constexpr size_t numOfChunkCopies = 8;
    constexpr size_t remainder = 1024;
    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
    constexpr int expectedErrorCode = 1;

    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
    ASSERT_NE(nullptr, usmBuffer);
    // Owned by unique_ptr so the host buffer is released on every exit path.
    auto nonUsmBuffer = std::make_unique<unsigned char[]>(totalCopySize);
    size_t chunkCounter = 0;
    memset(usmBuffer, 0, totalCopySize);
    memset(nonUsmBuffer.get(), 0xFF, totalCopySize);

    // Performs the copy of the chunk but always reports failure.
    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
        chunkCounter++;
        memcpy(stagingBuffer, chunkSrc, chunkSize);
        memcpy(chunkDst, stagingBuffer, chunkSize);
        return expectedErrorCode;
    };

    auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer.get(), totalCopySize, chunkCopy, csr);
    auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;

    EXPECT_EQ(expectedErrorCode, ret);
    EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer.get(), totalCopySize));
    EXPECT_EQ(1u, chunkCounter);
    EXPECT_EQ(1u, newUsmAllocations);
    svmAllocsManager->freeSVMAlloc(usmBuffer);
}
// The chunk callback succeeds for the eight full chunks and fails on the ninth call (the
// remainder): performCopy must return that error after nine calls, with one staging buffer allocated.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenReturnWithFailure) {
    constexpr size_t numOfChunkCopies = 8;
    constexpr size_t remainder = 1024;
    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
    constexpr int expectedErrorCode = 1;

    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
    ASSERT_NE(nullptr, usmBuffer);
    // Owned by unique_ptr so the host buffer is released on every exit path.
    auto nonUsmBuffer = std::make_unique<unsigned char[]>(totalCopySize);
    size_t chunkCounter = 0;
    memset(usmBuffer, 0, totalCopySize);
    memset(nonUsmBuffer.get(), 0xFF, totalCopySize);

    // Reports success for the first numOfChunkCopies calls and failure afterwards.
    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
        chunkCounter++;
        memcpy(stagingBuffer, chunkSrc, chunkSize);
        memcpy(chunkDst, stagingBuffer, chunkSize);
        return chunkCounter <= numOfChunkCopies ? 0 : expectedErrorCode;
    };

    auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer.get(), totalCopySize, chunkCopy, csr);
    auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;

    EXPECT_EQ(expectedErrorCode, ret);
    EXPECT_EQ(numOfChunkCopies + 1, chunkCounter);
    EXPECT_EQ(1u, newUsmAllocations);
    svmAllocsManager->freeSVMAlloc(usmBuffer);
}
// Rebuilds the manager after setting the StagingBufferSize debug flag to 512 and checks that a
// copy of eight 512 KB chunks plus a 1024-byte tail is split into nine chunk copies.
TEST_F(StagingBufferManagerTest, givenStagingBufferWhenChangedBufferSizeThenPerformCopyWithCorrectNumberOfChunks) {
    constexpr size_t chunkSizeInKb = 512;
    constexpr size_t fullChunks = 8;
    constexpr size_t tailSize = 1024;
    constexpr size_t copySize = MemoryConstants::kiloByte * chunkSizeInKb * fullChunks + tailSize;

    debugManager.flags.StagingBufferSize.set(chunkSizeInKb); // 512KB

    // The flag has to be set before the manager under test is constructed, so replace the fixture's instance.
    RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
    std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
    stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);

    copyThroughStagingBuffers(copySize, fullChunks + 1, 1);
}