performance: non-usm copy through staging buffers

Related-To: NEO-11501
Signed-off-by: Szymon Morek <szymon.morek@intel.com>
commit 29e3eb512c (parent 659075ffe5)
@@ -20,6 +20,7 @@
 #include "shared/source/os_interface/device_factory.h"
 #include "shared/source/utilities/buffer_pool_allocator.inl"
 #include "shared/source/utilities/heap_allocator.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 
 #include "opencl/source/accelerators/intel_motion_estimation.h"
 #include "opencl/source/api/additional_extensions.h"
@@ -4913,14 +4914,19 @@ cl_int CL_API_CALL clEnqueueSVMMemcpy(cl_command_queue commandQueue,
     }
 
     if (size != 0) {
-        retVal = pCommandQueue->enqueueSVMMemcpy(
-            blockingCopy,
-            dstPtr,
-            srcPtr,
-            size,
-            numEventsInWaitList,
-            eventWaitList,
-            event);
+        auto stagingBufferManager = pCommandQueue->getContext().getStagingBufferManager();
+        if (stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, numEventsInWaitList)) {
+            retVal = pCommandQueue->enqueueStagingBufferMemcpy(blockingCopy, dstPtr, srcPtr, size, event);
+        } else {
+            retVal = pCommandQueue->enqueueSVMMemcpy(
+                blockingCopy,
+                dstPtr,
+                srcPtr,
+                size,
+                numEventsInWaitList,
+                eventWaitList,
+                event);
+        }
     } else {
         retVal = pCommandQueue->enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
     }
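
From the application's point of view the change is transparent: clEnqueueSVMMemcpy keeps its signature, and the driver takes the staging route only when the source is ordinary host memory, the destination is a USM/SVM allocation, and there are no wait-list dependencies. A minimal host-side sketch of a copy that can take the new path; the platform and queue setup here is generic OpenCL boilerplate, not part of this commit:

#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
    cl_platform_id platform;
    cl_device_id device;
    clGetPlatformIDs(1, &platform, nullptr);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);

    cl_int err = CL_SUCCESS;
    cl_context ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
    cl_command_queue queue = clCreateCommandQueueWithProperties(ctx, device, nullptr, &err);

    const size_t size = 16 * 1024 * 1024; // large enough to be split into 2MB chunks
    void *svmDst = clSVMAlloc(ctx, CL_MEM_READ_WRITE, size, 0);
    unsigned char *hostSrc = static_cast<unsigned char *>(malloc(size)); // non-USM source
    memset(hostSrc, 0xAB, size);

    // No wait list and a non-SVM source: the driver may route this copy
    // through the staging-buffer path introduced by this commit.
    err = clEnqueueSVMMemcpy(queue, CL_TRUE, svmDst, hostSrc, size, 0, nullptr, nullptr);
    printf("clEnqueueSVMMemcpy returned %d\n", err);

    free(hostSrc);
    clSVMFree(ctx, svmDst);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}
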
@@ -29,6 +29,7 @@
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/os_interface/product_helper.h"
 #include "shared/source/utilities/api_intercept.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/source/utilities/tag_allocator.h"
 
 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
@@ -1504,4 +1505,56 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() {
     }
 }
+
+cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
+    csrSelectionArgs.direction = TransferDirection::hostToLocal;
+    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
+
+    Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady};
+    if (isProfilingEnabled()) {
+        profilingEvent.setQueueTimeStamp();
+    }
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
+        auto isFirstTransfer = (chunkDst == dstPtr);
+        auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
+        if (isFirstTransfer && isProfilingEnabled()) {
+            profilingEvent.setSubmitTimeStamp();
+        }
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        if (isFirstTransfer && isProfilingEnabled()) {
+            profilingEvent.setStartTimeStamp();
+        }
+
+        cl_event *outEvent = nullptr;
+        if (isLastTransfer && !this->isOOQEnabled()) {
+            outEvent = event;
+        }
+        auto ret = this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent);
+        return ret;
+    };
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr);
+    if (ret != CL_SUCCESS) {
+        return ret;
+    }
+
+    if (event != nullptr) {
+        if (this->isOOQEnabled()) {
+            ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
+        }
+        if (isProfilingEnabled()) {
+            auto pEvent = castToObjectOrAbort<Event>(*event);
+            pEvent->copyTimestamps(profilingEvent);
+            pEvent->setCPUProfilingPath(false);
+        }
+    }
+
+    if (blockingCopy) {
+        ret = this->finish();
+    }
+    return ret;
+}
 
 } // namespace NEO
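
Two details of the lambda are easy to miss: only the first chunk records the submit/start timestamps (so the profiled start covers the first CPU memcpy into the staging buffer), and only the last chunk on an in-order queue receives the caller's cl_event. Both are detected with plain pointer arithmetic. A self-contained sketch of those two checks; ptrOffset is redefined here for illustration, NEO ships its own helper:

#include <cassert>
#include <cstddef>
#include <cstdint>

static void *ptrOffset(void *base, size_t offset) { // illustrative stand-in for NEO's helper
    return reinterpret_cast<uint8_t *>(base) + offset;
}

int main() {
    unsigned char dst[10] = {};
    const size_t size = sizeof(dst);
    const size_t chunk = 4; // chunks cover [0,4), [4,8), [8,10)
    for (size_t offset = 0; offset < size; offset += chunk) {
        const size_t chunkSize = (size - offset < chunk) ? (size - offset) : chunk;
        void *chunkDst = ptrOffset(dst, offset);
        const bool isFirstTransfer = (chunkDst == dst);
        const bool isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dst, size);
        assert(isFirstTransfer == (offset == 0));
        assert(isLastTransfer == (offset + chunkSize == size));
    }
    return 0;
}
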
@@ -388,6 +388,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; }
     bool isBcs() const { return isCopyOnly; };
 
+    cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event);
+
   protected:
     void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
     cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
@@ -23,6 +23,7 @@
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/utilities/buffer_pool_allocator.inl"
 #include "shared/source/utilities/heap_allocator.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/source/utilities/tag_allocator.h"
 
 #include "opencl/source/cl_device/cl_device.h"
@@ -74,6 +75,7 @@ Context::~Context() {
         }
     }
     if (svmAllocsManager) {
+        this->stagingBufferManager.reset();
        svmAllocsManager->trimUSMDeviceAllocCache();
         delete svmAllocsManager;
     }
@@ -281,6 +283,7 @@ bool Context::createImpl(const cl_context_properties *properties,
             this->svmAllocsManager = new SVMAllocsManager(this->memoryManager,
                                                           this->areMultiStorageAllocationsPreferred());
             this->svmAllocsManager->initUsmAllocationsCaches(device->getDevice());
+            this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager, rootDeviceIndices, deviceBitfields);
         }
     }
 
@@ -676,4 +679,8 @@ bool Context::checkIfContextIsNonZebin() const {
     return this->nonZebinContext;
 }
+
+StagingBufferManager *Context::getStagingBufferManager() const {
+    return this->stagingBufferManager.get();
+}
 
 } // namespace NEO
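
The destructor change above is an ordering constraint, not a feature: StagingBufferManager frees its pooled buffers through SVMAllocsManager, so it has to be torn down first. A toy model of the dependency; the type names here are illustrative, not NEO classes:

#include <cstdio>
#include <memory>

struct Allocator {                     // stands in for SVMAllocsManager
    void free(const void *p) { std::printf("freeing %p\n", p); }
};

struct Pool {                          // stands in for StagingBufferManager
    Allocator *allocator;
    ~Pool() { allocator->free(this); } // destructor calls back into the allocator
};

int main() {
    auto allocator = std::make_unique<Allocator>();
    std::unique_ptr<Pool> pool(new Pool{allocator.get()});
    pool.reset();      // must happen while the allocator is still alive
    allocator.reset(); // safe: nothing references the allocator anymore
    return 0;
}
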
@@ -40,6 +40,7 @@ class SVMAllocsManager;
 class Program;
 class Platform;
 class TagAllocatorBase;
+class StagingBufferManager;
 
 template <>
 struct OpenCLObjectMapper<_cl_context> {
@@ -256,6 +257,8 @@ class Context : public BaseObject<_cl_context> {
     void initializeUsmAllocationPools();
     void cleanupUsmAllocationPools();
 
+    StagingBufferManager *getStagingBufferManager() const;
+
   protected:
     struct BuiltInKernel {
         const char *pSource = nullptr;
@@ -300,6 +303,8 @@ class Context : public BaseObject<_cl_context> {
     std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator;
     std::mutex multiRootDeviceAllocatorMtx;
 
+    std::unique_ptr<StagingBufferManager> stagingBufferManager;
+
     bool interopUserSync = false;
     bool resolvesRequiredInKernels = false;
     bool nonZebinContext = false;
@@ -397,6 +397,10 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
     auto &device = this->cmdQueue->getDevice();
     auto &gfxCoreHelper = device.getGfxCoreHelper();
     auto resolution = device.getDeviceInfo().profilingTimerResolution;
+    if (timestampsCopied) {
+        // Adjust startTS since we calculate profiling based on other event timestamps
+        contextStartTS = startTimeStamp.gpuTimeStamp;
+    }
 
     // Calculate startTimestamp only if it was not already set on CPU
     if (startTimeStamp.cpuTimeInNs == 0) {
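
With timestampsCopied set (see the copyTimestamps helper added to event.h below), the GPU start timestamp that arrived with this event's own packet belongs only to the last chunk or barrier, so the start copied from the internal profiling event is used instead. A toy model of the adjustment's effect on the reported duration:

#include <cstdint>
#include <cstdio>

int main() {
    // GPU timestamps delivered with the event's own packet (last chunk / barrier):
    uint64_t contextStartTS = 900;
    uint64_t contextEndTS = 1000;
    // Start copied from the internal profiling event: the staged copy began earlier.
    const uint64_t copiedStartTS = 400;
    const bool timestampsCopied = true;

    if (timestampsCopied) {
        contextStartTS = copiedStartTS; // the adjustment introduced above
    }
    std::printf("duration: %llu ticks\n",
                static_cast<unsigned long long>(contextEndTS - contextStartTS));
    return 0;
}
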
@@ -312,6 +312,14 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
 
     static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
 
+    void copyTimestamps(const Event &srcEvent) {
+        this->queueTimeStamp = srcEvent.queueTimeStamp;
+        this->submitTimeStamp = srcEvent.submitTimeStamp;
+        this->startTimeStamp = srcEvent.startTimeStamp;
+        this->endTimeStamp = srcEvent.endTimeStamp;
+        timestampsCopied = true;
+    }
+
   protected:
     Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType,
           TaskCountType taskLevel, TaskCountType taskCount);
@@ -383,6 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     bool profilingEnabled = false;
     bool profilingCpuPath = false;
     bool dataCalculated = false;
+    bool timestampsCopied = false;
 
     ProfilingInfo queueTimeStamp{};
     ProfilingInfo submitTimeStamp{};
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -231,4 +231,29 @@ TEST_F(ClEnqueueSVMMemcpyTests, GivenDeviceNotSupportingSvmWhenEnqueuingSVMMemcp
     EXPECT_EQ(CL_INVALID_OPERATION, retVal);
 }
 
+TEST_F(ClEnqueueSVMMemcpyTests, givenCopyValidForStagingBuffersCopyThenTransferSuccesfull) {
+    DebugManagerStateRestore restorer;
+    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
+    const ClDeviceInfo &devInfo = pDevice->getDeviceInfo();
+    if (devInfo.svmCapabilities != 0) {
+        void *pDstSvm = clSVMAlloc(pContext, CL_MEM_READ_WRITE, 256, 4);
+        EXPECT_NE(nullptr, pDstSvm);
+        auto pSrc = new unsigned char[256];
+        auto retVal = clEnqueueSVMMemcpy(
+            pCommandQueue, // cl_command_queue command_queue
+            CL_FALSE,      // cl_bool blocking_copy
+            pDstSvm,       // void *dst_ptr
+            pSrc,          // const void *src_ptr
+            256,           // size_t size
+            0,             // cl_uint num_events_in_wait_list
+            nullptr,       // const cl_event *event_wait_list
+            nullptr        // cl_event *event
+        );
+        EXPECT_EQ(CL_SUCCESS, retVal);
+
+        clSVMFree(pContext, pDstSvm);
+        delete[] pSrc;
+    }
+}
+
 } // namespace ULT
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -13,6 +13,8 @@
 #include "shared/source/memory_manager/surface.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
+#include "shared/source/os_interface/device_factory.h"
+#include "shared/source/utilities/hw_timestamps.h"
 #include "shared/source/utilities/tag_allocator.h"
 #include "shared/test/common/cmd_parse/hw_parse.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/libult/ult_command_stream_receiver.h"
@@ -2369,3 +2371,190 @@ HWTEST_F(EnqueueSvmTest, givenCopyFromMappedPtrToMappedPtrWhenCallingSvmMemcpyTh
     EXPECT_EQ(2u, csr.createAllocationForHostSurfaceCalled);
 }
 }
+
+struct StagingBufferTest : public EnqueueSvmTest {
+    void SetUp() override {
+        REQUIRE_SVM_OR_SKIP(defaultHwInfo);
+        EnqueueSvmTest::SetUp();
+        SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 1, context->getRootDeviceIndices(), context->getDeviceBitfields());
+        unifiedMemoryProperties.device = pDevice;
+        svmManager = this->context->getSVMAllocsManager();
+
+        dstPtr = svmManager->createUnifiedMemoryAllocation(copySize, unifiedMemoryProperties);
+        srcPtr = new unsigned char[copySize];
+    }
+
+    void TearDown() override {
+        if (defaultHwInfo->capabilityTable.ftrSvm == false) {
+            return;
+        }
+        svmManager = this->context->getSVMAllocsManager();
+        svmManager->freeSVMAlloc(dstPtr);
+        delete[] srcPtr;
+        EnqueueSvmTest::TearDown();
+    }
+
+    static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2;
+    static constexpr size_t copySize = stagingBufferSize * 4;
+    static constexpr size_t expectedNumOfCopies = copySize / stagingBufferSize;
+
+    SVMAllocsManager *svmManager;
+    void *dstPtr;
+    unsigned char *srcPtr;
+};
+
+HWTEST_F(StagingBufferTest, givenInOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_SVM_MEMCPY;
+
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        &event    // cl_event *event
+    );
+    auto pEvent = (Event *)event;
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount);
+    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
+    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_BARRIER;
+
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    myCmdQ.setOoqEnabled();
+
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        &event    // cl_event *event
+    );
+
+    auto pEvent = (Event *)event;
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount);
+    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
+    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(StagingBufferTest, givenEnqueueStagingBufferMemcpyWhenTaskCountNotReadyThenCopySucessfullAndBuffersNotReused) {
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
+    *csr.getTagAddress() = csr.peekTaskCount();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(expectedNumOfCopies, numOfStagingBuffers);
+    *csr.getTagAddress() = csr.peekTaskCount();
+}
+
+HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferMemcpyBlockingThenCopySucessfullAndFinishCalled) {
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        true,     // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(1u, myCmdQ.finishCalledCount);
+
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        true,     // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(2u, myCmdQ.finishCalledCount);
+}
+
+HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferWithInvalidBufferThenReturnFailure) {
+    auto dstPtr = nullptr;
+    auto srcPtr = new unsigned char[copySize];
+
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    EXPECT_EQ(CL_INVALID_VALUE, retVal);
+
+    delete[] srcPtr;
+}
+
+HWTEST_F(StagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingBufferMemcpyThenTimestampsSetCorrectly) {
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    myCmdQ.setProfilingEnabled();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        &event    // cl_event *event
+    );
+    auto pEvent = (Event *)event;
+
+    // A small adjustment to give end timestamp a valid value instead of mocked value
+    TimeStampData tsData{};
+    pClDevice->getDevice().getOSTime()->getGpuCpuTime(&tsData);
+    if (pEvent->getTimestampPacketNodes()) {
+        auto node = pEvent->getTimestampPacketNodes()->peekNodes()[0];
+        auto contextEnd = ptrOffset(node->getCpuBase(), node->getGlobalEndOffset());
+        *reinterpret_cast<typename FamilyType::TimestampPacketType *>(contextEnd) = static_cast<typename FamilyType::TimestampPacketType>(tsData.gpuTimeStamp);
+    } else {
+        HwTimeStamps *timeStamps = static_cast<TagNode<HwTimeStamps> *>(pEvent->getHwTimeStampNode())->tagForCpuAccess;
+        timeStamps->contextEndTS = tsData.gpuTimeStamp;
+        timeStamps->globalEndTS = tsData.gpuTimeStamp;
+    }
+
+    EXPECT_FALSE(pEvent->isCPUProfilingPath());
+    EXPECT_TRUE(pEvent->isProfilingEnabled());
+    uint64_t queue, submit, start, end;
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queue, 0);
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0);
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, 0);
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, 0);
+    EXPECT_GE(queue, 0ull);
+    EXPECT_GE(submit, queue);
+    EXPECT_GE(start, submit);
+    EXPECT_GE(end, start);
+    clReleaseEvent(event);
+}
@@ -460,6 +460,17 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
         return BaseClass::enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
     }
 
+    cl_int enqueueSVMMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size,
+                            cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override {
+        enqueueSVMMemcpyCalledCount++;
+        return BaseClass::enqueueSVMMemcpy(blockingCopy, dstPtr, srcPtr, size, numEventsInWaitList, eventWaitList, event);
+    }
+
+    cl_int finish() override {
+        finishCalledCount++;
+        return BaseClass::finish();
+    }
+
     unsigned int lastCommandType;
     std::vector<Kernel *> lastEnqueuedKernels;
     MultiDispatchInfo storedMultiDispatchInfo;
@@ -490,7 +501,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     std::optional<WaitStatus> waitUntilCompleteReturnValue{};
     int waitForAllEnginesCalledCount{0};
     int enqueueMarkerWithWaitListCalledCount{0};
-
+    size_t enqueueSVMMemcpyCalledCount{0};
+    size_t finishCalledCount{0};
     LinearStream *peekCommandStream() {
         return this->commandStream;
     }
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,6 +14,7 @@
 #include "shared/source/memory_manager/deferred_deleter.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/os_context.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/test/common/helpers/engine_descriptor_helper.h"
 #include "shared/test/common/mocks/mock_svm_manager.h"
 
@@ -123,6 +124,7 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp
         }
         deviceBitfields.insert({rootDeviceIndex, deviceBitfield});
     }
+    stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager, rootDeviceIndices, deviceBitfields);
 
     cl_int retVal;
     if (!noSpecialQueue) {
@@ -387,6 +387,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, SkipDcFlushOnBarrierWithoutEvents, -1, "-1: defa
 DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
 DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers")
+DECLARE_DEBUG_VARIABLE(int32_t, EnableCopyWithStagingBuffers, -1, "Enable copy with non-usm memory through staging buffers. -1: default, 0: disabled, 1: enabled")
+DECLARE_DEBUG_VARIABLE(int32_t, StagingBufferSize, -1, "Size of single staging buffer. -1: default (2MB), >0: size in KB")
 
 /*DIRECT SUBMISSION FLAGS*/
 DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")
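
Both new variables follow NEO's tri-state flag convention: -1 defers to the built-in default (the product's isStagingBuffersEnabled() answer for the first flag, 2MB for the second), while any other value overrides it, as the StagingBufferManager constructor and isValidForCopy show. A compact, self-contained model of that precedence:

#include <cstdio>

// -1: follow the product default; 0/1: force off/on (NEO debug-flag convention).
static bool resolve(int flagValue, bool productDefault) {
    return flagValue != -1 ? flagValue != 0 : productDefault;
}

int main() {
    std::printf("%d %d %d\n",
                resolve(-1, true),  // 1: product default wins
                resolve(0, true),   // 0: flag forces off
                resolve(1, false)); // 1: flag forces on
    return 0;
}
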
@@ -237,6 +237,7 @@ class ProductHelper {
     virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
     virtual bool isAvailableExtendedScratch() const = 0;
     virtual std::optional<bool> isCoherentAllocation(uint64_t patIndex) const = 0;
+    virtual bool isStagingBuffersEnabled() const = 0;
 
     virtual ~ProductHelper() = default;
 
@@ -897,4 +897,10 @@ template <PRODUCT_FAMILY gfxProduct>
 bool ProductHelperHw<gfxProduct>::isAvailableExtendedScratch() const {
     return false;
 }
+
+template <PRODUCT_FAMILY gfxProduct>
+bool ProductHelperHw<gfxProduct>::isStagingBuffersEnabled() const {
+    return false;
+}
+
 } // namespace NEO
@@ -182,6 +182,7 @@ class ProductHelperHw : public ProductHelper {
     size_t getMaxFillPaternSizeForCopyEngine() const override;
     bool isAvailableExtendedScratch() const override;
     std::optional<bool> isCoherentAllocation(uint64_t patIndex) const override;
+    bool isStagingBuffersEnabled() const override;
 
     ~ProductHelperHw() override = default;
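
The base template returns false, so staging copies stay off by default and a product opts in by specializing the member function. A self-contained model of that pattern; the product id here is a placeholder, not a real PRODUCT_FAMILY value:

#include <cstdio>
#include <memory>

struct ProductHelper {
    virtual bool isStagingBuffersEnabled() const = 0;
    virtual ~ProductHelper() = default;
};

template <int gfxProduct>
struct ProductHelperHw : ProductHelper {
    bool isStagingBuffersEnabled() const override { return false; } // default: off
};

constexpr int exampleFamily = 42; // placeholder product id

template <>
bool ProductHelperHw<exampleFamily>::isStagingBuffersEnabled() const {
    return true; // this product opts in
}

int main() {
    std::unique_ptr<ProductHelper> helper = std::make_unique<ProductHelperHw<exampleFamily>>();
    std::printf("staging buffers enabled: %d\n", helper->isStagingBuffersEnabled());
    return 0;
}
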
@@ -49,6 +49,8 @@ set(NEO_CORE_UTILITIES
     ${CMAKE_CURRENT_SOURCE_DIR}/wait_util.h
     ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.h
 )
 
 set(NEO_CORE_UTILITIES_WINDOWS
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/staging_buffer_manager.h"
+
+#include "shared/source/command_stream/command_stream_receiver.h"
+#include "shared/source/debug_settings/debug_settings_manager.h"
+#include "shared/source/device/device.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
+
+namespace NEO {
+
+StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
+    if (debugManager.flags.StagingBufferSize.get() != -1) {
+        chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
+    }
+}
+
+StagingBufferManager::~StagingBufferManager() {
+    for (auto &stagingBuffer : stagingBuffers) {
+        svmAllocsManager->freeSVMAlloc(stagingBuffer.first->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBuffer());
+    }
+}
+
+/*
+ * This method performs 4 steps for single chunk copy
+ * 1. Get existing staging buffer, if can't - allocate new one,
+ * 2. Perform actual copy,
+ * 3. Store used buffer back to the container (with current task count)
+ * 4. Update tag to reuse previous buffers within same API call
+ */
+int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
+    auto rootDeviceIndex = csr->getRootDeviceIndex();
+    auto taskCount = *csr->getTagAddress();
+    auto stagingBuffer = getExistingBuffer(taskCount, rootDeviceIndex);
+    if (stagingBuffer == nullptr) {
+        stagingBuffer = allocateStagingBuffer();
+    }
+    auto ret = chunkCopyFunc(chunkDst, stagingBuffer, chunkSrc, size);
+    storeBuffer(stagingBuffer, csr->peekTaskCount());
+    csr->flushTagUpdate();
+    return ret;
+}
+
+/*
+ * This method copies data between non-USM and USM allocations by splitting transfers into chunks.
+ * Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
+ * Caller provides actual function to transfer data for single chunk.
+ */
+int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
+    auto copiesNum = size / chunkSize;
+    auto remainder = size % chunkSize;
+
+    for (auto i = 0u; i < copiesNum; i++) {
+        auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
+        auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
+        auto ret = performChunkCopy(chunkDst, chunkSrc, chunkSize, chunkCopyFunc, csr);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    if (remainder != 0) {
+        auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
+        auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
+        auto ret = performChunkCopy(chunkDst, chunkSrc, remainder, chunkCopyFunc, csr);
+        if (ret) {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * This method will try to return existing staging buffer from the container.
+ * It's checking only "oldest" allocation.
+ * Returns nullptr if no staging buffer available.
+ */
+void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) {
+    auto lock = std::lock_guard<std::mutex>(mtx);
+    if (stagingBuffers.empty()) {
+        return nullptr;
+    }
+    void *buffer = nullptr;
+    auto iterator = stagingBuffers.begin();
+    UNRECOVERABLE_IF(iterator == stagingBuffers.end());
+
+    if (taskCount > iterator->second) {
+        auto allocation = iterator->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+        buffer = allocation->getUnderlyingBuffer();
+        stagingBuffers.erase(iterator);
+    }
+    return buffer;
+}
+
+void *StagingBufferManager::allocateStagingBuffer() {
+    SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
+    auto hostPtr = svmAllocsManager->createHostUnifiedMemoryAllocation(chunkSize, unifiedMemoryProperties);
+    return hostPtr;
+}
+
+void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) {
+    auto lock = std::lock_guard<std::mutex>(mtx);
+    auto svmData = svmAllocsManager->getSVMAlloc(stagingBuffer);
+    stagingBuffers.push_back({svmData, taskCount});
+}
+
+bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const {
+    auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
+    if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
+        stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
+    }
+    auto usmDstData = svmAllocsManager->getSVMAlloc(dstPtr);
+    auto usmSrcData = svmAllocsManager->getSVMAlloc(srcPtr);
+    bool hostToUsmDeviceCopy = usmSrcData == nullptr && usmDstData != nullptr;
+    return stagingCopyEnabled && hostToUsmDeviceCopy && !hasDependencies;
+}
+
+} // namespace NEO
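
performCopy splits a transfer into size / chunkSize full chunks plus an optional remainder, and getExistingBuffer only reuses the oldest stored buffer once the GPU's completed task count has moved past the value recorded when it was stored. A self-contained model of both rules; plain ints stand in for SvmAllocationData and the CSR tag:

#include <cstdio>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
    const size_t chunkSize = 2u * 1024 * 1024;       // default StagingBufferSize (2MB)
    const size_t totalSize = 9u * 1024 * 1024;       // example transfer
    const size_t fullChunks = totalSize / chunkSize; // 4
    const size_t remainder = totalSize % chunkSize;  // 1MB tail
    std::printf("%zu full chunks + %zu byte remainder\n", fullChunks, remainder);

    // Reuse policy: a stored buffer may be reused once the completed task
    // count has moved past the task count recorded at store time.
    std::vector<std::pair<int, uint64_t>> stored;    // {bufferId, taskCountAtStore}
    uint64_t completedTaskCount = 3;
    stored.push_back({0, 3});                        // stored at task count 3
    bool reusable = !stored.empty() && completedTaskCount > stored.front().second;
    std::printf("reusable now: %d\n", reusable);     // 0: GPU has not passed it yet
    completedTaskCount = 4;
    reusable = !stored.empty() && completedTaskCount > stored.front().second;
    std::printf("reusable after progress: %d\n", reusable); // 1
    return 0;
}
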
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+
+#include "shared/source/helpers/constants.h"
+#include "shared/source/utilities/stackvec.h"
+
+#include <functional>
+#include <map>
+#include <mutex>
+
+namespace NEO {
+class SVMAllocsManager;
+class CommandStreamReceiver;
+class Device;
+struct SvmAllocationData;
+
+using ChunkCopyFunction = std::function<int32_t(void *, void *, const void *, size_t)>;
+
+struct StagingBufferTracker {
+    void *stagingBuffer;
+    uint64_t taskCount;
+};
+
+class StagingBufferManager {
+  public:
+    StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields);
+    ~StagingBufferManager();
+
+    bool isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const;
+    int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
+
+  private:
+    void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex);
+    void *allocateStagingBuffer();
+    void storeBuffer(void *stagingBuffer, uint64_t taskCount);
+    int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
+
+    size_t chunkSize = MemoryConstants::pageSize2M;
+
+    std::vector<std::pair<SvmAllocationData *, uint64_t>> stagingBuffers;
+    std::mutex mtx;
+
+    SVMAllocsManager *svmAllocsManager;
+    const RootDeviceIndicesContainer rootDeviceIndices;
+    const std::map<uint32_t, DeviceBitfield> deviceBitfields;
+};
+
+} // namespace NEO
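
The whole public contract is isValidForCopy plus performCopy; the caller supplies the per-chunk transfer as a ChunkCopyFunction, exactly as CommandQueue::enqueueStagingBufferMemcpy does with its lambda. A self-contained mock of that contract under the same callback signature; here a single heap block stands in for the pooled USM staging buffers:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <vector>

using ChunkCopyFunction = std::function<int32_t(void *, void *, const void *, size_t)>;

// Minimal stand-in for StagingBufferManager::performCopy: same split logic,
// one reused staging block instead of the pooled USM host allocations.
static int32_t performCopy(void *dst, const void *src, size_t size, size_t chunkSize,
                           const ChunkCopyFunction &chunkCopy) {
    std::vector<unsigned char> staging(chunkSize);
    for (size_t off = 0; off < size;) {
        size_t n = (size - off < chunkSize) ? size - off : chunkSize;
        int32_t ret = chunkCopy(static_cast<unsigned char *>(dst) + off, staging.data(),
                                static_cast<const unsigned char *>(src) + off, n);
        if (ret) {
            return ret; // early exit on first failing chunk, as in the real manager
        }
        off += n;
    }
    return 0;
}

int main() {
    std::vector<unsigned char> src(5000, 0xFF), dst(5000, 0x00);
    size_t chunks = 0;
    auto chunkCopy = [&](void *cDst, void *stagingBuffer, const void *cSrc, size_t n) -> int32_t {
        ++chunks;
        memcpy(stagingBuffer, cSrc, n); // host -> staging
        memcpy(cDst, stagingBuffer, n); // staging -> "device"
        return 0;
    };
    int32_t ret = performCopy(dst.data(), src.data(), src.size(), 2048, chunkCopy);
    std::printf("ret=%d chunks=%zu equal=%d\n", ret, chunks,
                memcmp(src.data(), dst.data(), src.size()) == 0);
    return 0;
}
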
@@ -608,4 +608,6 @@ EnableReusingGpuTimestamps = -1
 ForceCopyOperationOffloadForComputeCmdList = -1
 SecondaryContextEngineTypeMask = -1
 DisableSupportForL0Debugger=0
+EnableCopyWithStagingBuffers = -1
+StagingBufferSize = -1
 # Please don't edit below this line
@@ -28,6 +28,7 @@ target_sources(neo_shared_tests PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/wait_util_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator_tests.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager_tests.cpp
 )
 
 add_subdirectories()
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/staging_buffer_manager.h"
+#include "shared/test/common/fixtures/device_fixture.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/mocks/mock_device.h"
+#include "shared/test/common/mocks/mock_svm_manager.h"
+#include "shared/test/common/test_macros/test.h"
+#include "shared/test/common/test_macros/test_checks_shared.h"
+
+#include "gtest/gtest.h"
+
+using namespace NEO;
+
+class StagingBufferManagerFixture : public DeviceFixture {
+  public:
+    void setUp() {
+        DeviceFixture::setUp();
+        REQUIRE_SVM_OR_SKIP(&hardwareInfo);
+        this->svmAllocsManager = std::make_unique<MockSVMAllocsManager>(pDevice->getMemoryManager(), false);
+        debugManager.flags.EnableCopyWithStagingBuffers.set(1);
+        RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
+        std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
+        this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);
+        this->csr = pDevice->commandStreamReceivers[0].get();
+    }
+
+    void tearDown() {
+        stagingBufferManager.reset();
+        svmAllocsManager.reset();
+        DeviceFixture::tearDown();
+    }
+
+    void *allocateDeviceBuffer(size_t size) {
+        RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
+        std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
+        SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
+        unifiedMemoryProperties.device = pDevice;
+        return svmAllocsManager->createHostUnifiedMemoryAllocation(size, unifiedMemoryProperties);
+    }
+
+    void copyThroughStagingBuffers(size_t copySize, size_t expectedChunks, size_t expectedAllocations) {
+        auto usmBuffer = allocateDeviceBuffer(copySize);
+        auto nonUsmBuffer = new unsigned char[copySize];
+
+        size_t chunkCounter = 0;
+        memset(usmBuffer, 0, copySize);
+        memset(nonUsmBuffer, 0xFF, copySize);
+
+        auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+            chunkCounter++;
+            memcpy(stagingBuffer, chunkSrc, chunkSize);
+            memcpy(chunkDst, stagingBuffer, chunkSize);
+            return 0;
+        };
+        auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
+        auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, copySize, chunkCopy, csr);
+        auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
+
+        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer, copySize));
+        EXPECT_EQ(expectedChunks, chunkCounter);
+        EXPECT_EQ(expectedAllocations, newUsmAllocations);
+        svmAllocsManager->freeSVMAlloc(usmBuffer);
+        delete[] nonUsmBuffer;
+    }
+
+    constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2;
+    DebugManagerStateRestore restorer;
+    std::unique_ptr<MockSVMAllocsManager> svmAllocsManager;
+    std::unique_ptr<StagingBufferManager> stagingBufferManager;
+    CommandStreamReceiver *csr;
+};
+
+using StagingBufferManagerTest = Test<StagingBufferManagerFixture>;
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForCopyThenReturnTrue) {
+    constexpr size_t bufferSize = 1024;
+    auto usmBuffer = allocateDeviceBuffer(bufferSize);
+    unsigned char nonUsmBuffer[bufferSize];
+
+    struct {
+        void *dstPtr;
+        void *srcPtr;
+        bool hasDependencies;
+        bool expectValid;
+    } copyParamsStruct[5]{
+        {usmBuffer, nonUsmBuffer, false, true},     // nonUsm -> usm without dependencies
+        {usmBuffer, nonUsmBuffer, true, false},     // nonUsm -> usm with dependencies
+        {nonUsmBuffer, nonUsmBuffer, false, false}, // nonUsm -> nonUsm without dependencies
+        {usmBuffer, usmBuffer, false, false},       // usm -> usm without dependencies
+        {nonUsmBuffer, usmBuffer, false, false}     // usm -> nonUsm without dependencies
+    };
+    for (auto i = 0; i < 5; i++) {
+        auto actualValid = stagingBufferManager->isValidForCopy(*pDevice, copyParamsStruct[i].dstPtr, copyParamsStruct[i].srcPtr, copyParamsStruct[i].hasDependencies);
+        EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
+    }
+
+    debugManager.flags.EnableCopyWithStagingBuffers.set(0);
+    EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false));
+
+    debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
+    auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
+    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false));
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyThenCopyData) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyWithoutRemainderThenNoRemainderCalled) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyThenDontReuseBuffers) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+
+    *csr->getTagAddress() = csr->peekTaskCount();
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenUpdatedTaskCountThenReuseBuffers) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+
+    *csr->getTagAddress() = csr->peekTaskCount();
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8);
+
+    *csr->getTagAddress() = csr->peekTaskCount() + numOfChunkCopies;
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 0);
+    EXPECT_EQ(numOfChunkCopies, svmAllocsManager->svmAllocs.getNumAllocs());
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyReturnWithFailure) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
+    constexpr int expectedErrorCode = 1;
+    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
+    auto nonUsmBuffer = new unsigned char[totalCopySize];
+
+    size_t chunkCounter = 0;
+    memset(usmBuffer, 0, totalCopySize);
+    memset(nonUsmBuffer, 0xFF, totalCopySize);
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+        chunkCounter++;
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        memcpy(chunkDst, stagingBuffer, chunkSize);
+        return expectedErrorCode;
+    };
+    auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
+    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
+    auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
+
+    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer, totalCopySize));
+    EXPECT_EQ(1u, chunkCounter);
+    EXPECT_EQ(1u, newUsmAllocations);
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+    delete[] nonUsmBuffer;
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenReturnWithFailure) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
+    constexpr int expectedErrorCode = 1;
+    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
+    auto nonUsmBuffer = new unsigned char[totalCopySize];
+
+    size_t chunkCounter = 0;
+    memset(usmBuffer, 0, totalCopySize);
+    memset(nonUsmBuffer, 0xFF, totalCopySize);
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+        chunkCounter++;
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        memcpy(chunkDst, stagingBuffer, chunkSize);
+        if (chunkCounter <= numOfChunkCopies) {
+            return 0;
+        } else {
+            return expectedErrorCode;
+        }
+    };
+    auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
+    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
+    auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
+
+    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(numOfChunkCopies + 1, chunkCounter);
+    EXPECT_EQ(1u, newUsmAllocations);
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+    delete[] nonUsmBuffer;
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenChangedBufferSizeThenPerformCopyWithCorrectNumberOfChunks) {
+    constexpr size_t stagingBufferSize = 512;
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = MemoryConstants::kiloByte * stagingBufferSize * numOfChunkCopies + remainder;
+    debugManager.flags.StagingBufferSize.set(stagingBufferSize); // 512KB
+
+    RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
+    std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
+    stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1);
+}