diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 9b7566d0ed..6510a68a3a 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -20,6 +20,7 @@ #include "shared/source/os_interface/device_factory.h" #include "shared/source/utilities/buffer_pool_allocator.inl" #include "shared/source/utilities/heap_allocator.h" +#include "shared/source/utilities/staging_buffer_manager.h" #include "opencl/source/accelerators/intel_motion_estimation.h" #include "opencl/source/api/additional_extensions.h" @@ -4913,14 +4914,19 @@ cl_int CL_API_CALL clEnqueueSVMMemcpy(cl_command_queue commandQueue, } if (size != 0) { - retVal = pCommandQueue->enqueueSVMMemcpy( - blockingCopy, - dstPtr, - srcPtr, - size, - numEventsInWaitList, - eventWaitList, - event); + auto stagingBufferManager = pCommandQueue->getContext().getStagingBufferManager(); + if (stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, numEventsInWaitList)) { + retVal = pCommandQueue->enqueueStagingBufferMemcpy(blockingCopy, dstPtr, srcPtr, size, event); + } else { + retVal = pCommandQueue->enqueueSVMMemcpy( + blockingCopy, + dstPtr, + srcPtr, + size, + numEventsInWaitList, + eventWaitList, + event); + } } else { retVal = pCommandQueue->enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event); } diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index af65709aaf..9cbc6fb61c 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -29,6 +29,7 @@ #include "shared/source/os_interface/os_context.h" #include "shared/source/os_interface/product_helper.h" #include "shared/source/utilities/api_intercept.h" +#include "shared/source/utilities/staging_buffer_manager.h" #include "shared/source/utilities/tag_allocator.h" #include "opencl/source/built_ins/builtins_dispatch_builder.h" @@ -1504,4 +1505,56 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() { } } +cl_int 
CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
+    // Copies `size` bytes from srcPtr to dstPtr in chunks. For every chunk the source bytes are
+    // first memcpy'd on the CPU into a staging buffer supplied by the context's
+    // StagingBufferManager, then a non-blocking enqueueSVMMemcpy moves the staging buffer to
+    // the destination.
+    //
+    // blockingCopy - when set, finish() is called before returning and its status is returned.
+    // event        - optional output event; produced by the last chunk on an in-order queue or
+    //                by a trailing barrier on an out-of-order queue.
+    // Returns the first non-success status encountered, otherwise the status of the last call.
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
+    csrSelectionArgs.direction = TransferDirection::hostToLocal;
+    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
+
+    // Local event used only as a container for the queue/submit/start timestamps, which are
+    // copied into the user's event once the whole transfer has been enqueued.
+    Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady};
+    if (isProfilingEnabled()) {
+        profilingEvent.setQueueTimeStamp();
+    }
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
+        auto isFirstTransfer = (chunkDst == dstPtr);
+        auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
+        // Submit and start stamps are taken around the CPU copy of the first chunk only.
+        if (isFirstTransfer && isProfilingEnabled()) {
+            profilingEvent.setSubmitTimeStamp();
+        }
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        if (isFirstTransfer && isProfilingEnabled()) {
+            profilingEvent.setStartTimeStamp();
+        }
+
+        // On an in-order queue the last chunk's enqueue provides the user's event;
+        // out-of-order queues get it from the barrier enqueued after all chunks.
+        cl_event *outEvent = nullptr;
+        if (isLastTransfer && !this->isOOQEnabled()) {
+            outEvent = event;
+        }
+        auto ret = this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent);
+        return ret;
+    };
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr);
+    if (ret != CL_SUCCESS) {
+        return ret;
+    }
+
+    if (event != nullptr) {
+        if (this->isOOQEnabled()) {
+            ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
+            // Do not touch *event when the barrier failed: it may not have been written,
+            // and castToObjectOrAbort below would abort on an invalid handle.
+            if (ret != CL_SUCCESS) {
+                return ret;
+            }
+        }
+        if (isProfilingEnabled()) {
+            auto pEvent = castToObjectOrAbort<Event>(*event);
+            pEvent->copyTimestamps(profilingEvent);
+            pEvent->setCPUProfilingPath(false);
+        }
+    }
+
+    if (blockingCopy) {
+        ret = this->finish();
+    }
+    return ret;
+}
+
 } // namespace NEO
diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h
index 452edf9d3b..b4e52390a7 100644
--- a/opencl/source/command_queue/command_queue.h
+++ 
b/opencl/source/command_queue/command_queue.h @@ -388,6 +388,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; } bool isBcs() const { return isCopyOnly; }; + cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event); + protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest); diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp index 0aad7b4e84..00acbf9ffc 100644 --- a/opencl/source/context/context.cpp +++ b/opencl/source/context/context.cpp @@ -23,6 +23,7 @@ #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/utilities/buffer_pool_allocator.inl" #include "shared/source/utilities/heap_allocator.h" +#include "shared/source/utilities/staging_buffer_manager.h" #include "shared/source/utilities/tag_allocator.h" #include "opencl/source/cl_device/cl_device.h" @@ -74,6 +75,7 @@ Context::~Context() { } } if (svmAllocsManager) { + this->stagingBufferManager.reset(); svmAllocsManager->trimUSMDeviceAllocCache(); delete svmAllocsManager; } @@ -281,6 +283,7 @@ bool Context::createImpl(const cl_context_properties *properties, this->svmAllocsManager = new SVMAllocsManager(this->memoryManager, this->areMultiStorageAllocationsPreferred()); this->svmAllocsManager->initUsmAllocationsCaches(device->getDevice()); + this->stagingBufferManager = std::make_unique(svmAllocsManager, rootDeviceIndices, deviceBitfields); } } @@ -676,4 +679,8 @@ bool Context::checkIfContextIsNonZebin() const { return this->nonZebinContext; } +StagingBufferManager *Context::getStagingBufferManager() const { + return this->stagingBufferManager.get(); +} + } // namespace NEO diff --git a/opencl/source/context/context.h 
b/opencl/source/context/context.h index ab13993e72..b5f6814812 100644 --- a/opencl/source/context/context.h +++ b/opencl/source/context/context.h @@ -40,6 +40,7 @@ class SVMAllocsManager; class Program; class Platform; class TagAllocatorBase; +class StagingBufferManager; template <> struct OpenCLObjectMapper<_cl_context> { @@ -256,6 +257,8 @@ class Context : public BaseObject<_cl_context> { void initializeUsmAllocationPools(); void cleanupUsmAllocationPools(); + StagingBufferManager *getStagingBufferManager() const; + protected: struct BuiltInKernel { const char *pSource = nullptr; @@ -300,6 +303,8 @@ class Context : public BaseObject<_cl_context> { std::unique_ptr multiRootDeviceTimestampPacketAllocator; std::mutex multiRootDeviceAllocatorMtx; + std::unique_ptr stagingBufferManager; + bool interopUserSync = false; bool resolvesRequiredInKernels = false; bool nonZebinContext = false; diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 7509f9c2d9..236a3150a0 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -397,6 +397,10 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con auto &device = this->cmdQueue->getDevice(); auto &gfxCoreHelper = device.getGfxCoreHelper(); auto resolution = device.getDeviceInfo().profilingTimerResolution; + if (timestampsCopied) { + // Adjust startTS since we calculate profiling based on other event timestamps + contextStartTS = startTimeStamp.gpuTimeStamp; + } // Calculate startTimestamp only if it was not already set on CPU if (startTimeStamp.cpuTimeInNs == 0) { diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 385125d320..2731f7a727 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -312,6 +312,14 @@ class Event : public BaseObject<_cl_event>, public IDNode { static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS); + 
void copyTimestamps(const Event &srcEvent) { + this->queueTimeStamp = srcEvent.queueTimeStamp; + this->submitTimeStamp = srcEvent.submitTimeStamp; + this->startTimeStamp = srcEvent.startTimeStamp; + this->endTimeStamp = srcEvent.endTimeStamp; + timestampsCopied = true; + } + protected: Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType, TaskCountType taskLevel, TaskCountType taskCount); @@ -383,6 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { bool profilingEnabled = false; bool profilingCpuPath = false; bool dataCalculated = false; + bool timestampsCopied = false; ProfilingInfo queueTimeStamp{}; ProfilingInfo submitTimeStamp{}; diff --git a/opencl/test/unit_test/api/cl_enqueue_svm_memcpy_tests.inl b/opencl/test/unit_test/api/cl_enqueue_svm_memcpy_tests.inl index ab1b912089..012201f143 100644 --- a/opencl/test/unit_test/api/cl_enqueue_svm_memcpy_tests.inl +++ b/opencl/test/unit_test/api/cl_enqueue_svm_memcpy_tests.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -231,4 +231,29 @@ TEST_F(ClEnqueueSVMMemcpyTests, GivenDeviceNotSupportingSvmWhenEnqueuingSVMMemcp EXPECT_EQ(CL_INVALID_OPERATION, retVal); } +TEST_F(ClEnqueueSVMMemcpyTests, givenCopyValidForStagingBuffersCopyThenTransferSuccesfull) { + DebugManagerStateRestore restorer; + debugManager.flags.EnableCopyWithStagingBuffers.set(1); + const ClDeviceInfo &devInfo = pDevice->getDeviceInfo(); + if (devInfo.svmCapabilities != 0) { + void *pDstSvm = clSVMAlloc(pContext, CL_MEM_READ_WRITE, 256, 4); + EXPECT_NE(nullptr, pDstSvm); + auto pSrc = new unsigned char[256]; + auto retVal = clEnqueueSVMMemcpy( + pCommandQueue, // cl_command_queue command_queue + CL_FALSE, // cl_bool blocking_copy + pDstSvm, // void *dst_ptr + pSrc, // const void *src_ptr + 256, // size_t size + 0, // cl_uint num_events_in_wait_list + nullptr, // const cl_event *event_wait_list + nullptr // cl_event 
*event + ); + EXPECT_EQ(CL_SUCCESS, retVal); + + clSVMFree(pContext, pDstSvm); + delete[] pSrc; + } +} + } // namespace ULT diff --git a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp index 1e0383a9ac..a4c554e6b8 100644 --- a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,6 +13,8 @@ #include "shared/source/memory_manager/surface.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/device_factory.h" +#include "shared/source/utilities/hw_timestamps.h" +#include "shared/source/utilities/tag_allocator.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" @@ -2369,3 +2371,190 @@ HWTEST_F(EnqueueSvmTest, givenCopyFromMappedPtrToMappedPtrWhenCallingSvmMemcpyTh EXPECT_EQ(2u, csr.createAllocationForHostSurfaceCalled); } } + +struct StagingBufferTest : public EnqueueSvmTest { + void SetUp() override { + REQUIRE_SVM_OR_SKIP(defaultHwInfo); + EnqueueSvmTest::SetUp(); + SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 1, context->getRootDeviceIndices(), context->getDeviceBitfields()); + unifiedMemoryProperties.device = pDevice; + svmManager = this->context->getSVMAllocsManager(); + + dstPtr = svmManager->createUnifiedMemoryAllocation(copySize, unifiedMemoryProperties); + srcPtr = new unsigned char[copySize]; + } + + void TearDown() override { + if (defaultHwInfo->capabilityTable.ftrSvm == false) { + return; + } + svmManager = this->context->getSVMAllocsManager(); + svmManager->freeSVMAlloc(dstPtr); + delete[] srcPtr; + 
EnqueueSvmTest::TearDown(); + } + + static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2; + static constexpr size_t copySize = stagingBufferSize * 4; + static constexpr size_t expectedNumOfCopies = copySize / stagingBufferSize; + + SVMAllocsManager *svmManager; + void *dstPtr; + unsigned char *srcPtr; +}; + +HWTEST_F(StagingBufferTest, givenInOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) { + constexpr cl_command_type expectedLastCmd = CL_COMMAND_SVM_MEMCPY; + + cl_event event; + MockCommandQueueHw myCmdQ(context, pClDevice, 0); + auto initialUsmAllocs = svmManager->getNumAllocs(); + retVal = myCmdQ.enqueueStagingBufferMemcpy( + false, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + &event // cl_event *event + ); + auto pEvent = (Event *)event; + auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs; + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(1u, numOfStagingBuffers); + EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount); + EXPECT_EQ(0u, myCmdQ.finishCalledCount); + EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType); + EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) { + constexpr cl_command_type expectedLastCmd = CL_COMMAND_BARRIER; + + cl_event event; + MockCommandQueueHw myCmdQ(context, pClDevice, 0); + myCmdQ.setOoqEnabled(); + + auto initialUsmAllocs = svmManager->getNumAllocs(); + retVal = myCmdQ.enqueueStagingBufferMemcpy( + false, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + &event // cl_event *event + ); + + auto pEvent = (Event *)event; + auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs; + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(1u, numOfStagingBuffers); + 
EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount); + EXPECT_EQ(0u, myCmdQ.finishCalledCount); + EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType); + EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(StagingBufferTest, givenEnqueueStagingBufferMemcpyWhenTaskCountNotReadyThenCopySucessfullAndBuffersNotReused) { + MockCommandQueueHw myCmdQ(context, pClDevice, 0); + auto initialUsmAllocs = svmManager->getNumAllocs(); + auto &csr = pCmdQ->getGpgpuCommandStreamReceiver(); + *csr.getTagAddress() = csr.peekTaskCount(); + retVal = myCmdQ.enqueueStagingBufferMemcpy( + false, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + nullptr // cl_event *event + ); + auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs; + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedNumOfCopies, numOfStagingBuffers); + *csr.getTagAddress() = csr.peekTaskCount(); +} + +HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferMemcpyBlockingThenCopySucessfullAndFinishCalled) { + MockCommandQueueHw myCmdQ(context, pClDevice, 0); + auto initialUsmAllocs = svmManager->getNumAllocs(); + retVal = myCmdQ.enqueueStagingBufferMemcpy( + true, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + nullptr // cl_event *event + ); + auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs; + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(1u, numOfStagingBuffers); + EXPECT_EQ(1u, myCmdQ.finishCalledCount); + + retVal = myCmdQ.enqueueStagingBufferMemcpy( + true, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + nullptr // cl_event *event + ); + numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs; + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(1u, numOfStagingBuffers); + EXPECT_EQ(2u, myCmdQ.finishCalledCount); +} 
+ +HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferWithInvalidBufferThenReturnFailure) { + auto dstPtr = nullptr; + auto srcPtr = new unsigned char[copySize]; + + MockCommandQueueHw myCmdQ(context, pClDevice, 0); + retVal = myCmdQ.enqueueStagingBufferMemcpy( + false, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + nullptr // cl_event *event + ); + EXPECT_EQ(CL_INVALID_VALUE, retVal); + + delete[] srcPtr; +} + +HWTEST_F(StagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingBufferMemcpyThenTimestampsSetCorrectly) { + cl_event event; + MockCommandQueueHw myCmdQ(context, pClDevice, 0); + myCmdQ.setProfilingEnabled(); + retVal = myCmdQ.enqueueStagingBufferMemcpy( + false, // cl_bool blocking_copy + dstPtr, // void *dst_ptr + srcPtr, // const void *src_ptr + copySize, // size_t size + &event // cl_event *event + ); + auto pEvent = (Event *)event; + + // A small adjustment to give end timestamp a valid value instead of mocked value + TimeStampData tsData{}; + pClDevice->getDevice().getOSTime()->getGpuCpuTime(&tsData); + if (pEvent->getTimestampPacketNodes()) { + auto node = pEvent->getTimestampPacketNodes()->peekNodes()[0]; + auto contextEnd = ptrOffset(node->getCpuBase(), node->getGlobalEndOffset()); + *reinterpret_cast(contextEnd) = static_cast(tsData.gpuTimeStamp); + } else { + HwTimeStamps *timeStamps = static_cast *>(pEvent->getHwTimeStampNode())->tagForCpuAccess; + timeStamps->contextEndTS = tsData.gpuTimeStamp; + timeStamps->globalEndTS = tsData.gpuTimeStamp; + } + + EXPECT_FALSE(pEvent->isCPUProfilingPath()); + EXPECT_TRUE(pEvent->isProfilingEnabled()); + uint64_t queue, submit, start, end; + pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queue, 0); + pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0); + pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, 0); + 
pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, 0); + EXPECT_GE(queue, 0ull); + EXPECT_GE(submit, queue); + EXPECT_GE(start, submit); + EXPECT_GE(end, start); + clReleaseEvent(event); +} \ No newline at end of file diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index a141fd8dcb..bad784d404 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -460,6 +460,17 @@ class MockCommandQueueHw : public CommandQueueHw { return BaseClass::enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event); } + cl_int enqueueSVMMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override { + enqueueSVMMemcpyCalledCount++; + return BaseClass::enqueueSVMMemcpy(blockingCopy, dstPtr, srcPtr, size, numEventsInWaitList, eventWaitList, event); + } + + cl_int finish() override { + finishCalledCount++; + return BaseClass::finish(); + } + unsigned int lastCommandType; std::vector lastEnqueuedKernels; MultiDispatchInfo storedMultiDispatchInfo; @@ -490,7 +501,8 @@ class MockCommandQueueHw : public CommandQueueHw { std::optional waitUntilCompleteReturnValue{}; int waitForAllEnginesCalledCount{0}; int enqueueMarkerWithWaitListCalledCount{0}; - + size_t enqueueSVMMemcpyCalledCount{0}; + size_t finishCalledCount{0}; LinearStream *peekCommandStream() { return this->commandStream; } diff --git a/opencl/test/unit_test/mocks/mock_context.cpp b/opencl/test/unit_test/mocks/mock_context.cpp index c66bd78b0c..51149d765b 100644 --- a/opencl/test/unit_test/mocks/mock_context.cpp +++ b/opencl/test/unit_test/mocks/mock_context.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -14,6 +14,7 @@ #include 
"shared/source/memory_manager/deferred_deleter.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/os_context.h" +#include "shared/source/utilities/staging_buffer_manager.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" #include "shared/test/common/mocks/mock_svm_manager.h" @@ -123,6 +124,7 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp } deviceBitfields.insert({rootDeviceIndex, deviceBitfield}); } + stagingBufferManager = std::make_unique(svmAllocsManager, rootDeviceIndices, deviceBitfields); cl_int retVal; if (!noSpecialQueue) { diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index d81bf02291..bf6cd620f6 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -387,6 +387,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, SkipDcFlushOnBarrierWithoutEvents, -1, "-1: defa DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB") DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB") DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers") +DECLARE_DEBUG_VARIABLE(int32_t, EnableCopyWithStagingBuffers, -1, "Enable copy with non-usm memory through staging buffers. -1: default, 0: disabled, 1: enabled") +DECLARE_DEBUG_VARIABLE(int32_t, StagingBufferSize, -1, "Size of single staging buffer. -1: default (2MB), >0: size in KB") /*DIRECT SUBMISSION FLAGS*/ DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. 
Enables direct submission of command buffers bypassing KMD") diff --git a/shared/source/os_interface/product_helper.h b/shared/source/os_interface/product_helper.h index c1c69858fa..97e1becab2 100644 --- a/shared/source/os_interface/product_helper.h +++ b/shared/source/os_interface/product_helper.h @@ -237,6 +237,7 @@ class ProductHelper { virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0; virtual bool isAvailableExtendedScratch() const = 0; virtual std::optional isCoherentAllocation(uint64_t patIndex) const = 0; + virtual bool isStagingBuffersEnabled() const = 0; virtual ~ProductHelper() = default; diff --git a/shared/source/os_interface/product_helper.inl b/shared/source/os_interface/product_helper.inl index 70a2894ecb..4f590b375b 100644 --- a/shared/source/os_interface/product_helper.inl +++ b/shared/source/os_interface/product_helper.inl @@ -897,4 +897,10 @@ template bool ProductHelperHw::isAvailableExtendedScratch() const { return false; } + +template +bool ProductHelperHw::isStagingBuffersEnabled() const { + return false; +} + } // namespace NEO diff --git a/shared/source/os_interface/product_helper_hw.h b/shared/source/os_interface/product_helper_hw.h index 407390cb81..10564a9e7b 100644 --- a/shared/source/os_interface/product_helper_hw.h +++ b/shared/source/os_interface/product_helper_hw.h @@ -182,6 +182,7 @@ class ProductHelperHw : public ProductHelper { size_t getMaxFillPaternSizeForCopyEngine() const override; bool isAvailableExtendedScratch() const override; std::optional isCoherentAllocation(uint64_t patIndex) const override; + bool isStagingBuffersEnabled() const override; ~ProductHelperHw() override = default; diff --git a/shared/source/utilities/CMakeLists.txt b/shared/source/utilities/CMakeLists.txt index aa34d5856a..067bf2daf0 100644 --- a/shared/source/utilities/CMakeLists.txt +++ b/shared/source/utilities/CMakeLists.txt @@ -49,6 +49,8 @@ set(NEO_CORE_UTILITIES ${CMAKE_CURRENT_SOURCE_DIR}/wait_util.h 
${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.h
 )
 set(NEO_CORE_UTILITIES_WINDOWS
diff --git a/shared/source/utilities/staging_buffer_manager.cpp b/shared/source/utilities/staging_buffer_manager.cpp
new file mode 100644
index 0000000000..9eba6f423b
--- /dev/null
+++ b/shared/source/utilities/staging_buffer_manager.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/staging_buffer_manager.h"
+
+#include "shared/source/command_stream/command_stream_receiver.h"
+#include "shared/source/debug_settings/debug_settings_manager.h"
+#include "shared/source/device/device.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
+
+namespace NEO {
+
+/*
+ * Stores the SVM manager and the root device indices / device bitfields later used to allocate staging buffers.
+ * The default chunk size can be overridden with the StagingBufferSize debug flag (size in KB).
+ */
+StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
+    // Only positive overrides are honoured: 0 would make chunkSize zero (performCopy divides by it)
+    // and a negative value would wrap around when stored in the unsigned chunkSize.
+    const auto stagingBufferSizeInKb = debugManager.flags.StagingBufferSize.get();
+    if (stagingBufferSizeInKb > 0) {
+        chunkSize = stagingBufferSizeInKb * MemoryConstants::kiloByte;
+    }
+}
+
+/*
+ * Frees every staging buffer still tracked in the container through the SVM manager.
+ */
+StagingBufferManager::~StagingBufferManager() {
+    for (auto &stagingBuffer : stagingBuffers) {
+        svmAllocsManager->freeSVMAlloc(stagingBuffer.first->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBuffer());
+    }
+}
+
+/*
+ * This method performs 4 steps for single chunk copy
+ * 1. Get existing staging buffer, if can't - allocate new one,
+ * 2. Perform actual copy,
+ * 3. Store used buffer back to the container (with current task count)
+ * 4. 
Update tag to reuse previous buffers within same API call
+ *
+ * Returns the value produced by chunkCopyFunc, or a non-zero error when no staging buffer could be obtained.
+ */
+int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
+    // NOTE(review): the caller visible in this change forwards the result as a cl_int, so the value of
+    // CL_OUT_OF_HOST_MEMORY (-6) is used here - confirm before another API starts using this path.
+    constexpr int32_t stagingBufferAllocationFailed = -6;
+
+    auto rootDeviceIndex = csr->getRootDeviceIndex();
+    // Value currently stored at the CSR tag address; a stored buffer is handed out again
+    // only when this value is greater than the task count it was stored with.
+    auto taskCount = *csr->getTagAddress();
+    auto stagingBuffer = getExistingBuffer(taskCount, rootDeviceIndex);
+    if (stagingBuffer == nullptr) {
+        stagingBuffer = allocateStagingBuffer();
+    }
+    if (stagingBuffer == nullptr) {
+        // Allocation failed: the callback would write into a null buffer and storeBuffer
+        // would track an entry without allocation data, so stop before the copy.
+        return stagingBufferAllocationFailed;
+    }
+    auto ret = chunkCopyFunc(chunkDst, stagingBuffer, chunkSrc, size);
+    storeBuffer(stagingBuffer, csr->peekTaskCount());
+    csr->flushTagUpdate();
+    return ret;
+}
+
+/*
+ * This method copies data between non-USM and USM allocations by splitting transfers into chunks.
+ * Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
+ * Caller provides actual function to transfer data for single chunk.
+ *
+ * Full chunks of chunkSize bytes are copied first, followed by one smaller chunk for the remainder.
+ * Returns 0 on success or the first non-zero value returned for a chunk.
+ */
+int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
+    const auto copiesNum = size / chunkSize;
+    const auto remainder = size % chunkSize;
+
+    // The index has the same type as copiesNum so the comparison cannot wrap for large transfers.
+    for (size_t i = 0; i < copiesNum; i++) {
+        auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
+        auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
+        auto ret = performChunkCopy(chunkDst, chunkSrc, chunkSize, chunkCopyFunc, csr);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    if (remainder != 0) {
+        auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
+        auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
+        auto ret = performChunkCopy(chunkDst, chunkSrc, remainder, chunkCopyFunc, csr);
+        if (ret) {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * This method will try to return existing staging buffer from the container.
+ * It's checking only "oldest" allocation.
+ * Returns nullptr if no staging buffer available.
+ */
+void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) {
+    std::lock_guard<std::mutex> guard(mtx);
+    if (stagingBuffers.empty()) {
+        return nullptr;
+    }
+    auto oldest = stagingBuffers.begin();
+    UNRECOVERABLE_IF(oldest == stagingBuffers.end());
+
+    // The front entry is reusable only when the passed task count is greater than the one it was stored with.
+    if (!(taskCount > oldest->second)) {
+        return nullptr;
+    }
+    auto graphicsAllocation = oldest->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+    void *reusedBuffer = graphicsAllocation->getUnderlyingBuffer();
+    stagingBuffers.erase(oldest);
+    return reusedBuffer;
+}
+
+/*
+ * Creates a new host unified memory allocation of chunkSize bytes and returns its pointer.
+ */
+void *StagingBufferManager::allocateStagingBuffer() {
+    SVMAllocsManager::UnifiedMemoryProperties hostUsmProperties(InternalMemoryType::hostUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
+    return svmAllocsManager->createHostUnifiedMemoryAllocation(chunkSize, hostUsmProperties);
+}
+
+/*
+ * Appends the staging buffer's SVM allocation data together with the given task count to the container.
+ */
+void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) {
+    std::lock_guard<std::mutex> guard(mtx);
+    stagingBuffers.push_back({svmAllocsManager->getSVMAlloc(stagingBuffer), taskCount});
+}
+
+/*
+ * Returns true when the copy can go through staging buffers: the feature is enabled
+ * (product helper default, overridden by EnableCopyWithStagingBuffers when it is not -1),
+ * the source is not known to the SVM manager while the destination is, and there are no dependencies.
+ */
+bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const {
+    bool featureEnabled = device.getProductHelper().isStagingBuffersEnabled();
+    const auto debugOverride = debugManager.flags.EnableCopyWithStagingBuffers.get();
+    if (debugOverride != -1) {
+        featureEnabled = (debugOverride != 0);
+    }
+    const auto dstSvmData = svmAllocsManager->getSVMAlloc(dstPtr);
+    const auto srcSvmData = svmAllocsManager->getSVMAlloc(srcPtr);
+    if (!featureEnabled || hasDependencies) {
+        return false;
+    }
+    return srcSvmData == nullptr && dstSvmData != nullptr;
+}
+
+} // namespace NEO
diff --git a/shared/source/utilities/staging_buffer_manager.h b/shared/source/utilities/staging_buffer_manager.h
new file mode 100644
index 0000000000..6b82df2801
--- /dev/null
+++ b/shared/source/utilities/staging_buffer_manager.h
@@ -0,0 +1,54 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/helpers/constants.h" +#include "shared/source/utilities/stackvec.h" + +#include +#include +#include + +namespace NEO { +class SVMAllocsManager; +class CommandStreamReceiver; +class Device; +struct SvmAllocationData; + +using ChunkCopyFunction = std::function; + +struct StagingBufferTracker { + void *stagingBuffer; + uint64_t taskCount; +}; + +class StagingBufferManager { + public: + StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map &deviceBitfields); + ~StagingBufferManager(); + + bool isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const; + int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr); + + private: + void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex); + void *allocateStagingBuffer(); + void storeBuffer(void *stagingBuffer, uint64_t taskCount); + int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr); + + size_t chunkSize = MemoryConstants::pageSize2M; + + std::vector> stagingBuffers; + std::mutex mtx; + + SVMAllocsManager *svmAllocsManager; + const RootDeviceIndicesContainer rootDeviceIndices; + const std::map deviceBitfields; +}; + +} // namespace NEO diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index ad3fbd548a..064bde497a 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -608,4 +608,6 @@ EnableReusingGpuTimestamps = -1 ForceCopyOperationOffloadForComputeCmdList = -1 SecondaryContextEngineTypeMask = -1 DisableSupportForL0Debugger=0 +EnableCopyWithStagingBuffers = -1 +StagingBufferSize = -1 # Please don't 
edit below this line diff --git a/shared/test/unit_test/utilities/CMakeLists.txt b/shared/test/unit_test/utilities/CMakeLists.txt index d29e6f9c86..08cb847a27 100644 --- a/shared/test/unit_test/utilities/CMakeLists.txt +++ b/shared/test/unit_test/utilities/CMakeLists.txt @@ -28,6 +28,7 @@ target_sources(neo_shared_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/wait_util_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator_tests.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager_tests.cpp ) add_subdirectories() \ No newline at end of file diff --git a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp new file mode 100644 index 0000000000..18795b57b1 --- /dev/null +++ b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/utilities/staging_buffer_manager.h" +#include "shared/test/common/fixtures/device_fixture.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/mocks/mock_device.h" +#include "shared/test/common/mocks/mock_svm_manager.h" +#include "shared/test/common/test_macros/test.h" +#include "shared/test/common/test_macros/test_checks_shared.h" + +#include "gtest/gtest.h" + +using namespace NEO; + +class StagingBufferManagerFixture : public DeviceFixture { + public: + void setUp() { + DeviceFixture::setUp(); + REQUIRE_SVM_OR_SKIP(&hardwareInfo); + this->svmAllocsManager = std::make_unique(pDevice->getMemoryManager(), false); + debugManager.flags.EnableCopyWithStagingBuffers.set(1); + RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex}; + std::map deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}}; + this->stagingBufferManager = std::make_unique(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields); + this->csr = 
pDevice->commandStreamReceivers[0].get(); + } + + void tearDown() { + stagingBufferManager.reset(); + svmAllocsManager.reset(); + DeviceFixture::tearDown(); + } + + void *allocateDeviceBuffer(size_t size) { + RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex}; + std::map deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}}; + SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields); + unifiedMemoryProperties.device = pDevice; + return svmAllocsManager->createHostUnifiedMemoryAllocation(size, unifiedMemoryProperties); + } + + void copyThroughStagingBuffers(size_t copySize, size_t expectedChunks, size_t expectedAllocations) { + auto usmBuffer = allocateDeviceBuffer(copySize); + auto nonUsmBuffer = new unsigned char[copySize]; + + size_t chunkCounter = 0; + memset(usmBuffer, 0, copySize); + memset(nonUsmBuffer, 0xFF, copySize); + + auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + chunkCounter++; + memcpy(stagingBuffer, chunkSrc, chunkSize); + memcpy(chunkDst, stagingBuffer, chunkSize); + return 0; + }; + auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs(); + auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, copySize, chunkCopy, csr); + auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations; + + EXPECT_EQ(0, ret); + EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer, copySize)); + EXPECT_EQ(expectedChunks, chunkCounter); + EXPECT_EQ(expectedAllocations, newUsmAllocations); + svmAllocsManager->freeSVMAlloc(usmBuffer); + delete[] nonUsmBuffer; + } + + constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2; + DebugManagerStateRestore restorer; + std::unique_ptr svmAllocsManager; + std::unique_ptr stagingBufferManager; + CommandStreamReceiver *csr; +}; + +using StagingBufferManagerTest = Test; + 
+TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForCopyThenReturnTrue) { + constexpr size_t bufferSize = 1024; + auto usmBuffer = allocateDeviceBuffer(bufferSize); + unsigned char nonUsmBuffer[bufferSize]; + + struct { + void *dstPtr; + void *srcPtr; + bool hasDependencies; + bool expectValid; + } copyParamsStruct[5]{ + {usmBuffer, nonUsmBuffer, false, true}, // nonUsm -> usm without dependencies + {usmBuffer, nonUsmBuffer, true, false}, // nonUsm -> usm with dependencies + {nonUsmBuffer, nonUsmBuffer, false, false}, // nonUsm -> nonUsm without dependencies + {usmBuffer, usmBuffer, false, false}, // usm -> usm without dependencies + {nonUsmBuffer, usmBuffer, false, false} // usm -> nonUsm without dependencies + }; + for (auto i = 0; i < 5; i++) { + auto actualValid = stagingBufferManager->isValidForCopy(*pDevice, copyParamsStruct[i].dstPtr, copyParamsStruct[i].srcPtr, copyParamsStruct[i].hasDependencies); + EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid); + } + + debugManager.flags.EnableCopyWithStagingBuffers.set(0); + EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false)); + + debugManager.flags.EnableCopyWithStagingBuffers.set(-1); + auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled(); + EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false)); + svmAllocsManager->freeSVMAlloc(usmBuffer); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyThenCopyData) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t remainder = 1024; + constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder; + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyWithoutRemainderThenNoRemainderCalled) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t totalCopySize = stagingBufferSize * 
numOfChunkCopies; + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyThenDontReuseBuffers) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies; + + *csr->getTagAddress() = csr->peekTaskCount(); + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenUpdatedTaskCountThenReuseBuffers) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies; + + *csr->getTagAddress() = csr->peekTaskCount(); + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8); + + *csr->getTagAddress() = csr->peekTaskCount() + numOfChunkCopies; + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 0); + EXPECT_EQ(numOfChunkCopies, svmAllocsManager->svmAllocs.getNumAllocs()); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyReturnWithFailure) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t remainder = 1024; + constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder; + constexpr int expectedErrorCode = 1; + auto usmBuffer = allocateDeviceBuffer(totalCopySize); + auto nonUsmBuffer = new unsigned char[totalCopySize]; + + size_t chunkCounter = 0; + memset(usmBuffer, 0, totalCopySize); + memset(nonUsmBuffer, 0xFF, totalCopySize); + + auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + chunkCounter++; + memcpy(stagingBuffer, chunkSrc, chunkSize); + memcpy(chunkDst, stagingBuffer, chunkSize); + return expectedErrorCode; + }; + auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs(); + auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr); + auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - 
initialNumOfUsmAllocations; + + EXPECT_EQ(expectedErrorCode, ret); + EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer, totalCopySize)); + EXPECT_EQ(1u, chunkCounter); + EXPECT_EQ(1u, newUsmAllocations); + svmAllocsManager->freeSVMAlloc(usmBuffer); + delete[] nonUsmBuffer; +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenReturnWithFailure) { + constexpr size_t numOfChunkCopies = 8; + constexpr size_t remainder = 1024; + constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder; + constexpr int expectedErrorCode = 1; + auto usmBuffer = allocateDeviceBuffer(totalCopySize); + auto nonUsmBuffer = new unsigned char[totalCopySize]; + + size_t chunkCounter = 0; + memset(usmBuffer, 0, totalCopySize); + memset(nonUsmBuffer, 0xFF, totalCopySize); + + auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + chunkCounter++; + memcpy(stagingBuffer, chunkSrc, chunkSize); + memcpy(chunkDst, stagingBuffer, chunkSize); + if (chunkCounter <= numOfChunkCopies) { + return 0; + } else { + return expectedErrorCode; + } + }; + auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs(); + auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr); + auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations; + + EXPECT_EQ(expectedErrorCode, ret); + EXPECT_EQ(numOfChunkCopies + 1, chunkCounter); + EXPECT_EQ(1u, newUsmAllocations); + svmAllocsManager->freeSVMAlloc(usmBuffer); + delete[] nonUsmBuffer; +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenChangedBufferSizeThenPerformCopyWithCorrectNumberOfChunks) { + constexpr size_t stagingBufferSize = 512; + constexpr size_t numOfChunkCopies = 8; + constexpr size_t remainder = 1024; + constexpr size_t totalCopySize = MemoryConstants::kiloByte * stagingBufferSize * numOfChunkCopies + remainder; + 
debugManager.flags.StagingBufferSize.set(stagingBufferSize); // 512KB + + RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex}; + std::map deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}}; + stagingBufferManager = std::make_unique(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields); + copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1); +} \ No newline at end of file