performance: non-usm copy through staging buffers

Related-To: NEO-11501
Signed-off-by: Szymon Morek <szymon.morek@intel.com>

parent 659075ffe5
commit 29e3eb512c
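Overview (editor's note, hedged): with this change, clEnqueueSVMMemcpy can service copies whose source is ordinary (non-USM) host memory and whose destination is a USM/SVM allocation by pumping the data through pooled USM host staging buffers, chunk by chunk (2MB by default), instead of referencing the whole non-USM range in a single transfer. A minimal sketch of the kind of application call that can take the new path (assuming a device with SVM support and the staging path enabled; ctx and queue are placeholders):

    #include <CL/cl.h>
    #include <vector>

    // Sketch only: error handling omitted.
    void copyHostToUsm(cl_context ctx, cl_command_queue queue, size_t size) {
        void *dst = clSVMAlloc(ctx, CL_MEM_READ_WRITE, size, 0); // USM/SVM destination
        std::vector<unsigned char> src(size, 0xFF);              // plain host memory source

        // Non-USM src, USM dst, empty wait list: eligible for the staging-buffer path.
        clEnqueueSVMMemcpy(queue, CL_TRUE, dst, src.data(), size, 0, nullptr, nullptr);

        clSVMFree(ctx, dst);
    }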
@@ -20,6 +20,7 @@
 #include "shared/source/os_interface/device_factory.h"
 #include "shared/source/utilities/buffer_pool_allocator.inl"
 #include "shared/source/utilities/heap_allocator.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 
 #include "opencl/source/accelerators/intel_motion_estimation.h"
 #include "opencl/source/api/additional_extensions.h"
@@ -4913,14 +4914,19 @@ cl_int CL_API_CALL clEnqueueSVMMemcpy(cl_command_queue commandQueue,
     }
     if (size != 0) {
-        retVal = pCommandQueue->enqueueSVMMemcpy(
-            blockingCopy,
-            dstPtr,
-            srcPtr,
-            size,
-            numEventsInWaitList,
-            eventWaitList,
-            event);
+        auto stagingBufferManager = pCommandQueue->getContext().getStagingBufferManager();
+        if (stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, numEventsInWaitList)) {
+            retVal = pCommandQueue->enqueueStagingBufferMemcpy(blockingCopy, dstPtr, srcPtr, size, event);
+        } else {
+            retVal = pCommandQueue->enqueueSVMMemcpy(
+                blockingCopy,
+                dstPtr,
+                srcPtr,
+                size,
+                numEventsInWaitList,
+                eventWaitList,
+                event);
+        }
     } else {
         retVal = pCommandQueue->enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
     }
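Note (hedged observation): isValidForCopy declares its last parameter as bool hasDependencies (see the new header later in this diff), so passing numEventsInWaitList here relies on the implicit cl_uint-to-bool conversion; any non-empty wait list therefore falls back to the regular enqueueSVMMemcpy path.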
@@ -29,6 +29,7 @@
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/os_interface/product_helper.h"
 #include "shared/source/utilities/api_intercept.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/source/utilities/tag_allocator.h"
 
 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
@@ -1504,4 +1505,56 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() {
 }
 }
 
+cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
+    csrSelectionArgs.direction = TransferDirection::hostToLocal;
+    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
+
+    Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady};
+    if (isProfilingEnabled()) {
+        profilingEvent.setQueueTimeStamp();
+    }
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
+        auto isFirstTransfer = (chunkDst == dstPtr);
+        auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
+        if (isFirstTransfer && isProfilingEnabled()) {
+            profilingEvent.setSubmitTimeStamp();
+        }
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        if (isFirstTransfer && isProfilingEnabled()) {
+            profilingEvent.setStartTimeStamp();
+        }
+
+        cl_event *outEvent = nullptr;
+        if (isLastTransfer && !this->isOOQEnabled()) {
+            outEvent = event;
+        }
+        auto ret = this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent);
+        return ret;
+    };
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr);
+    if (ret != CL_SUCCESS) {
+        return ret;
+    }
+
+    if (event != nullptr) {
+        if (this->isOOQEnabled()) {
+            ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
+        }
+        if (isProfilingEnabled()) {
+            auto pEvent = castToObjectOrAbort<Event>(*event);
+            pEvent->copyTimestamps(profilingEvent);
+            pEvent->setCPUProfilingPath(false);
+        }
+    }
+
+    if (blockingCopy) {
+        ret = this->finish();
+    }
+    return ret;
+}
+
 } // namespace NEO
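Note (hedged): the queued/submit/start timestamps are captured on a stack-local profiling Event around the first staged memcpy and copied into the caller's event afterwards via the new Event::copyTimestamps helper; for out-of-order queues the returned event comes from a trailing barrier rather than the last chunk, since chunks may complete out of order.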
@@ -388,6 +388,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; }
     bool isBcs() const { return isCopyOnly; };
 
+    cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event);
+
   protected:
     void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
     cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
@@ -23,6 +23,7 @@
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/utilities/buffer_pool_allocator.inl"
 #include "shared/source/utilities/heap_allocator.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/source/utilities/tag_allocator.h"
 
 #include "opencl/source/cl_device/cl_device.h"
@@ -74,6 +75,7 @@ Context::~Context() {
         }
     }
     if (svmAllocsManager) {
+        this->stagingBufferManager.reset();
        svmAllocsManager->trimUSMDeviceAllocCache();
        delete svmAllocsManager;
    }
@@ -281,6 +283,7 @@ bool Context::createImpl(const cl_context_properties *properties,
         this->svmAllocsManager = new SVMAllocsManager(this->memoryManager,
                                                       this->areMultiStorageAllocationsPreferred());
         this->svmAllocsManager->initUsmAllocationsCaches(device->getDevice());
+        this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager, rootDeviceIndices, deviceBitfields);
     }
 }
@@ -676,4 +679,8 @@ bool Context::checkIfContextIsNonZebin() const {
     return this->nonZebinContext;
 }
 
+StagingBufferManager *Context::getStagingBufferManager() const {
+    return this->stagingBufferManager.get();
+}
+
 } // namespace NEO
@@ -40,6 +40,7 @@ class SVMAllocsManager;
 class Program;
 class Platform;
 class TagAllocatorBase;
+class StagingBufferManager;
 
 template <>
 struct OpenCLObjectMapper<_cl_context> {
|
@ -256,6 +257,8 @@ class Context : public BaseObject<_cl_context> {
|
||||||
void initializeUsmAllocationPools();
|
void initializeUsmAllocationPools();
|
||||||
void cleanupUsmAllocationPools();
|
void cleanupUsmAllocationPools();
|
||||||
|
|
||||||
|
StagingBufferManager *getStagingBufferManager() const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
struct BuiltInKernel {
|
struct BuiltInKernel {
|
||||||
const char *pSource = nullptr;
|
const char *pSource = nullptr;
|
||||||
|
@ -300,6 +303,8 @@ class Context : public BaseObject<_cl_context> {
|
||||||
std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator;
|
std::unique_ptr<TagAllocatorBase> multiRootDeviceTimestampPacketAllocator;
|
||||||
std::mutex multiRootDeviceAllocatorMtx;
|
std::mutex multiRootDeviceAllocatorMtx;
|
||||||
|
|
||||||
|
std::unique_ptr<StagingBufferManager> stagingBufferManager;
|
||||||
|
|
||||||
bool interopUserSync = false;
|
bool interopUserSync = false;
|
||||||
bool resolvesRequiredInKernels = false;
|
bool resolvesRequiredInKernels = false;
|
||||||
bool nonZebinContext = false;
|
bool nonZebinContext = false;
|
||||||
|
|
|
@@ -397,6 +397,10 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
     auto &device = this->cmdQueue->getDevice();
     auto &gfxCoreHelper = device.getGfxCoreHelper();
     auto resolution = device.getDeviceInfo().profilingTimerResolution;
+    if (timestampsCopied) {
+        // Adjust startTS since we calculate profiling based on other event timestamps
+        contextStartTS = startTimeStamp.gpuTimeStamp;
+    }
 
     // Calculate startTimestamp only if it was not already set on CPU
     if (startTimeStamp.cpuTimeInNs == 0) {
@@ -312,6 +312,14 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
 
     static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
 
+    void copyTimestamps(const Event &srcEvent) {
+        this->queueTimeStamp = srcEvent.queueTimeStamp;
+        this->submitTimeStamp = srcEvent.submitTimeStamp;
+        this->startTimeStamp = srcEvent.startTimeStamp;
+        this->endTimeStamp = srcEvent.endTimeStamp;
+        timestampsCopied = true;
+    }
+
   protected:
     Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType,
           TaskCountType taskLevel, TaskCountType taskCount);
@@ -383,6 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     bool profilingEnabled = false;
     bool profilingCpuPath = false;
     bool dataCalculated = false;
+    bool timestampsCopied = false;
 
     ProfilingInfo queueTimeStamp{};
     ProfilingInfo submitTimeStamp{};
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -231,4 +231,29 @@ TEST_F(ClEnqueueSVMMemcpyTests, GivenDeviceNotSupportingSvmWhenEnqueuingSVMMemcp
     EXPECT_EQ(CL_INVALID_OPERATION, retVal);
 }
 
+TEST_F(ClEnqueueSVMMemcpyTests, givenCopyValidForStagingBuffersCopyThenTransferSuccesfull) {
+    DebugManagerStateRestore restorer;
+    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
+    const ClDeviceInfo &devInfo = pDevice->getDeviceInfo();
+    if (devInfo.svmCapabilities != 0) {
+        void *pDstSvm = clSVMAlloc(pContext, CL_MEM_READ_WRITE, 256, 4);
+        EXPECT_NE(nullptr, pDstSvm);
+        auto pSrc = new unsigned char[256];
+        auto retVal = clEnqueueSVMMemcpy(
+            pCommandQueue, // cl_command_queue command_queue
+            CL_FALSE,      // cl_bool blocking_copy
+            pDstSvm,       // void *dst_ptr
+            pSrc,          // const void *src_ptr
+            256,           // size_t size
+            0,             // cl_uint num_events_in_wait_list
+            nullptr,       // const cl_event *event_wait_list
+            nullptr        // cl_event *event
+        );
+        EXPECT_EQ(CL_SUCCESS, retVal);
+
+        clSVMFree(pContext, pDstSvm);
+        delete[] pSrc;
+    }
+}
+
 } // namespace ULT
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -13,6 +13,8 @@
 #include "shared/source/memory_manager/surface.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/device_factory.h"
+#include "shared/source/utilities/hw_timestamps.h"
+#include "shared/source/utilities/tag_allocator.h"
 #include "shared/test/common/cmd_parse/hw_parse.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/libult/ult_command_stream_receiver.h"
@@ -2369,3 +2371,190 @@ HWTEST_F(EnqueueSvmTest, givenCopyFromMappedPtrToMappedPtrWhenCallingSvmMemcpyTh
     EXPECT_EQ(2u, csr.createAllocationForHostSurfaceCalled);
 }
 }
+
+struct StagingBufferTest : public EnqueueSvmTest {
+    void SetUp() override {
+        REQUIRE_SVM_OR_SKIP(defaultHwInfo);
+        EnqueueSvmTest::SetUp();
+        SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 1, context->getRootDeviceIndices(), context->getDeviceBitfields());
+        unifiedMemoryProperties.device = pDevice;
+        svmManager = this->context->getSVMAllocsManager();
+
+        dstPtr = svmManager->createUnifiedMemoryAllocation(copySize, unifiedMemoryProperties);
+        srcPtr = new unsigned char[copySize];
+    }
+
+    void TearDown() override {
+        if (defaultHwInfo->capabilityTable.ftrSvm == false) {
+            return;
+        }
+        svmManager = this->context->getSVMAllocsManager();
+        svmManager->freeSVMAlloc(dstPtr);
+        delete[] srcPtr;
+        EnqueueSvmTest::TearDown();
+    }
+
+    static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2;
+    static constexpr size_t copySize = stagingBufferSize * 4;
+    static constexpr size_t expectedNumOfCopies = copySize / stagingBufferSize;
+
+    SVMAllocsManager *svmManager;
+    void *dstPtr;
+    unsigned char *srcPtr;
+};
+
+HWTEST_F(StagingBufferTest, givenInOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_SVM_MEMCPY;
+
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        &event    // cl_event *event
+    );
+    auto pEvent = (Event *)event;
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount);
+    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
+    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_BARRIER;
+
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    myCmdQ.setOoqEnabled();
+
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        &event    // cl_event *event
+    );
+
+    auto pEvent = (Event *)event;
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount);
+    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
+    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(StagingBufferTest, givenEnqueueStagingBufferMemcpyWhenTaskCountNotReadyThenCopySucessfullAndBuffersNotReused) {
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
+    *csr.getTagAddress() = csr.peekTaskCount();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(expectedNumOfCopies, numOfStagingBuffers);
+    *csr.getTagAddress() = csr.peekTaskCount();
+}
+
+HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferMemcpyBlockingThenCopySucessfullAndFinishCalled) {
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        true,     // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(1u, myCmdQ.finishCalledCount);
+
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        true,     // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(2u, myCmdQ.finishCalledCount);
+}
+
+HWTEST_F(StagingBufferTest, givenCmdQueueWhenEnqueueStagingBufferWithInvalidBufferThenReturnFailure) {
+    auto dstPtr = nullptr;
+    auto srcPtr = new unsigned char[copySize];
+
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        nullptr   // cl_event *event
+    );
+    EXPECT_EQ(CL_INVALID_VALUE, retVal);
+
+    delete[] srcPtr;
+}
+
+HWTEST_F(StagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingBufferMemcpyThenTimestampsSetCorrectly) {
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    myCmdQ.setProfilingEnabled();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,    // cl_bool blocking_copy
+        dstPtr,   // void *dst_ptr
+        srcPtr,   // const void *src_ptr
+        copySize, // size_t size
+        &event    // cl_event *event
+    );
+    auto pEvent = (Event *)event;
+
+    // A small adjustment to give end timestamp a valid value instead of mocked value
+    TimeStampData tsData{};
+    pClDevice->getDevice().getOSTime()->getGpuCpuTime(&tsData);
+    if (pEvent->getTimestampPacketNodes()) {
+        auto node = pEvent->getTimestampPacketNodes()->peekNodes()[0];
+        auto contextEnd = ptrOffset(node->getCpuBase(), node->getGlobalEndOffset());
+        *reinterpret_cast<typename FamilyType::TimestampPacketType *>(contextEnd) = static_cast<typename FamilyType::TimestampPacketType>(tsData.gpuTimeStamp);
+    } else {
+        HwTimeStamps *timeStamps = static_cast<TagNode<HwTimeStamps> *>(pEvent->getHwTimeStampNode())->tagForCpuAccess;
+        timeStamps->contextEndTS = tsData.gpuTimeStamp;
+        timeStamps->globalEndTS = tsData.gpuTimeStamp;
+    }
+
+    EXPECT_FALSE(pEvent->isCPUProfilingPath());
+    EXPECT_TRUE(pEvent->isProfilingEnabled());
+    uint64_t queue, submit, start, end;
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queue, 0);
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0);
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(uint64_t), &start, 0);
+    pEvent->getEventProfilingInfo(CL_PROFILING_COMMAND_END, sizeof(uint64_t), &end, 0);
+    EXPECT_GE(queue, 0ull);
+    EXPECT_GE(submit, queue);
+    EXPECT_GE(start, submit);
+    EXPECT_GE(end, start);
+    clReleaseEvent(event);
+}
@@ -460,6 +460,17 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
         return BaseClass::enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
     }
 
+    cl_int enqueueSVMMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size,
+                            cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override {
+        enqueueSVMMemcpyCalledCount++;
+        return BaseClass::enqueueSVMMemcpy(blockingCopy, dstPtr, srcPtr, size, numEventsInWaitList, eventWaitList, event);
+    }
+
+    cl_int finish() override {
+        finishCalledCount++;
+        return BaseClass::finish();
+    }
+
     unsigned int lastCommandType;
     std::vector<Kernel *> lastEnqueuedKernels;
     MultiDispatchInfo storedMultiDispatchInfo;

@@ -490,7 +501,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     std::optional<WaitStatus> waitUntilCompleteReturnValue{};
     int waitForAllEnginesCalledCount{0};
     int enqueueMarkerWithWaitListCalledCount{0};
+    size_t enqueueSVMMemcpyCalledCount{0};
+    size_t finishCalledCount{0};
     LinearStream *peekCommandStream() {
         return this->commandStream;
     }
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,6 +14,7 @@
 #include "shared/source/memory_manager/deferred_deleter.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/os_context.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/test/common/helpers/engine_descriptor_helper.h"
 #include "shared/test/common/mocks/mock_svm_manager.h"
@@ -123,6 +124,7 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp
         }
         deviceBitfields.insert({rootDeviceIndex, deviceBitfield});
     }
+    stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager, rootDeviceIndices, deviceBitfields);
 
     cl_int retVal;
     if (!noSpecialQueue) {
@@ -387,6 +387,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, SkipDcFlushOnBarrierWithoutEvents, -1, "-1: defa
 DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmAllocationPool, -1, "-1: default (enabled, 2MB), 0: disabled, >=1: enabled, size in MB")
 DECLARE_DEBUG_VARIABLE(int32_t, UseLocalPreferredForCacheableBuffers, -1, "Use localPreferred for cacheable buffers")
+DECLARE_DEBUG_VARIABLE(int32_t, EnableCopyWithStagingBuffers, -1, "Enable copy with non-usm memory through staging buffers. -1: default, 0: disabled, 1: enabled")
+DECLARE_DEBUG_VARIABLE(int32_t, StagingBufferSize, -1, "Size of single staging buffer. -1: default (2MB), >0: size in KB")
 
 /*DIRECT SUBMISSION FLAGS*/
 DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")
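For experiments, the new keys can be forced through the usual debug-variables override mechanism (a sketch; the config file shown further below in this diff accepts the same keys):

    EnableCopyWithStagingBuffers = 1   # force the staging path on
    StagingBufferSize = 512            # use 512KB staging buffers instead of 2MB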
@@ -237,6 +237,7 @@ class ProductHelper {
     virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
     virtual bool isAvailableExtendedScratch() const = 0;
     virtual std::optional<bool> isCoherentAllocation(uint64_t patIndex) const = 0;
+    virtual bool isStagingBuffersEnabled() const = 0;
 
     virtual ~ProductHelper() = default;
@@ -897,4 +897,10 @@ template <PRODUCT_FAMILY gfxProduct>
 bool ProductHelperHw<gfxProduct>::isAvailableExtendedScratch() const {
     return false;
 }
+
+template <PRODUCT_FAMILY gfxProduct>
+bool ProductHelperHw<gfxProduct>::isStagingBuffersEnabled() const {
+    return false;
+}
+
 } // namespace NEO
@@ -182,6 +182,7 @@ class ProductHelperHw : public ProductHelper {
     size_t getMaxFillPaternSizeForCopyEngine() const override;
     bool isAvailableExtendedScratch() const override;
     std::optional<bool> isCoherentAllocation(uint64_t patIndex) const override;
+    bool isStagingBuffersEnabled() const override;
 
     ~ProductHelperHw() override = default;
@@ -49,6 +49,8 @@ set(NEO_CORE_UTILITIES
     ${CMAKE_CURRENT_SOURCE_DIR}/wait_util.h
     ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager.h
 )
 
 set(NEO_CORE_UTILITIES_WINDOWS
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/staging_buffer_manager.h"
+
+#include "shared/source/command_stream/command_stream_receiver.h"
+#include "shared/source/debug_settings/debug_settings_manager.h"
+#include "shared/source/device/device.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
+
+namespace NEO {
+
+StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
+    if (debugManager.flags.StagingBufferSize.get() != -1) {
+        chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
+    }
+}
+
+StagingBufferManager::~StagingBufferManager() {
+    for (auto &stagingBuffer : stagingBuffers) {
+        svmAllocsManager->freeSVMAlloc(stagingBuffer.first->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBuffer());
+    }
+}
+
+/*
+ * This method performs 4 steps for single chunk copy
+ * 1. Get existing staging buffer, if can't - allocate new one,
+ * 2. Perform actual copy,
+ * 3. Store used buffer back to the container (with current task count)
+ * 4. Update tag to reuse previous buffers within same API call
+ */
+int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
+    auto rootDeviceIndex = csr->getRootDeviceIndex();
+    auto taskCount = *csr->getTagAddress();
+    auto stagingBuffer = getExistingBuffer(taskCount, rootDeviceIndex);
+    if (stagingBuffer == nullptr) {
+        stagingBuffer = allocateStagingBuffer();
+    }
+    auto ret = chunkCopyFunc(chunkDst, stagingBuffer, chunkSrc, size);
+    storeBuffer(stagingBuffer, csr->peekTaskCount());
+    csr->flushTagUpdate();
+    return ret;
+}
+
+/*
+ * This method copies data between non-USM and USM allocations by splitting transfers into chunks.
+ * Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
+ * Caller provides actual function to transfer data for single chunk.
+ */
+int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr) {
+    auto copiesNum = size / chunkSize;
+    auto remainder = size % chunkSize;
+
+    for (auto i = 0u; i < copiesNum; i++) {
+        auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
+        auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
+        auto ret = performChunkCopy(chunkDst, chunkSrc, chunkSize, chunkCopyFunc, csr);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    if (remainder != 0) {
+        auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
+        auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
+        auto ret = performChunkCopy(chunkDst, chunkSrc, remainder, chunkCopyFunc, csr);
+        if (ret) {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * This method will try to return existing staging buffer from the container.
+ * It's checking only "oldest" allocation.
+ * Returns nullptr if no staging buffer available.
+ */
+void *StagingBufferManager::getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex) {
+    auto lock = std::lock_guard<std::mutex>(mtx);
+    if (stagingBuffers.empty()) {
+        return nullptr;
+    }
+    void *buffer = nullptr;
+    auto iterator = stagingBuffers.begin();
+    UNRECOVERABLE_IF(iterator == stagingBuffers.end());
+
+    if (taskCount > iterator->second) {
+        auto allocation = iterator->first->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+        buffer = allocation->getUnderlyingBuffer();
+        stagingBuffers.erase(iterator);
+    }
+    return buffer;
+}
+
+void *StagingBufferManager::allocateStagingBuffer() {
+    SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
+    auto hostPtr = svmAllocsManager->createHostUnifiedMemoryAllocation(chunkSize, unifiedMemoryProperties);
+    return hostPtr;
+}
+
+void StagingBufferManager::storeBuffer(void *stagingBuffer, uint64_t taskCount) {
+    auto lock = std::lock_guard<std::mutex>(mtx);
+    auto svmData = svmAllocsManager->getSVMAlloc(stagingBuffer);
+    stagingBuffers.push_back({svmData, taskCount});
+}
+
+bool StagingBufferManager::isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const {
+    auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
+    if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
+        stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
+    }
+    auto usmDstData = svmAllocsManager->getSVMAlloc(dstPtr);
+    auto usmSrcData = svmAllocsManager->getSVMAlloc(srcPtr);
+    bool hostToUsmDeviceCopy = usmSrcData == nullptr && usmDstData != nullptr;
+    return stagingCopyEnabled && hostToUsmDeviceCopy && !hasDependencies;
+}
+
+} // namespace NEO
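To make the chunk-splitting arithmetic in performCopy concrete, a standalone sketch (plain C++, not NEO code; the 2MB figure mirrors the MemoryConstants::pageSize2M default):

    #include <cstddef>
    #include <cstdio>

    // Reproduces performCopy's split of a 9MB transfer into 2MB chunks plus a
    // remainder: four full chunks at offsets 0, 2, 4, 6 MB, then 1MB at 8MB.
    int main() {
        constexpr size_t mb = 1024 * 1024;
        constexpr size_t chunkSize = 2 * mb; // default staging buffer size
        constexpr size_t size = 9 * mb;

        size_t copiesNum = size / chunkSize; // 4 full chunks
        size_t remainder = size % chunkSize; // 1MB left over

        for (size_t i = 0; i < copiesNum; i++) {
            std::printf("chunk at %zu MB, size %zu MB\n", i * chunkSize / mb, chunkSize / mb);
        }
        if (remainder != 0) {
            std::printf("remainder at %zu MB, size %zu MB\n", copiesNum * chunkSize / mb, remainder / mb);
        }
        return 0;
    }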
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+
+#include "shared/source/helpers/constants.h"
+#include "shared/source/utilities/stackvec.h"
+
+#include <functional>
+#include <map>
+#include <mutex>
+
+namespace NEO {
+class SVMAllocsManager;
+class CommandStreamReceiver;
+class Device;
+struct SvmAllocationData;
+
+using ChunkCopyFunction = std::function<int32_t(void *, void *, const void *, size_t)>;
+
+struct StagingBufferTracker {
+    void *stagingBuffer;
+    uint64_t taskCount;
+};
+
+class StagingBufferManager {
+  public:
+    StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields);
+    ~StagingBufferManager();
+
+    bool isValidForCopy(Device &device, void *dstPtr, const void *srcPtr, bool hasDependencies) const;
+    int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
+
+  private:
+    void *getExistingBuffer(uint64_t taskCount, uint32_t rootDeviceIndex);
+    void *allocateStagingBuffer();
+    void storeBuffer(void *stagingBuffer, uint64_t taskCount);
+    int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction chunkCopyFunc, CommandStreamReceiver *csr);
+
+    size_t chunkSize = MemoryConstants::pageSize2M;
+
+    std::vector<std::pair<SvmAllocationData *, uint64_t>> stagingBuffers;
+    std::mutex mtx;
+
+    SVMAllocsManager *svmAllocsManager;
+    const RootDeviceIndicesContainer rootDeviceIndices;
+    const std::map<uint32_t, DeviceBitfield> deviceBitfields;
+};
+
+} // namespace NEO
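A hedged usage sketch of this interface, modeled on the unit tests at the end of this diff (svmAllocsManager, rootDeviceIndices, deviceBitfields, device, csr and the buffer pointers are assumed to exist; the lambda stands in for the real enqueue of a staged chunk):

    #include <cstring>

    // Sketch, not NEO source: exercises StagingBufferManager::performCopy with
    // a host-side lambda in place of a GPU transfer.
    auto chunkCopy = [](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
        memcpy(stagingBuffer, chunkSrc, chunkSize); // stage from the non-USM source
        memcpy(chunkDst, stagingBuffer, chunkSize); // a GPU copy in production
        return 0;
    };
    NEO::StagingBufferManager manager(svmAllocsManager, rootDeviceIndices, deviceBitfields);
    if (manager.isValidForCopy(device, dstUsmPtr, srcHostPtr, false /*hasDependencies*/)) {
        int32_t ret = manager.performCopy(dstUsmPtr, srcHostPtr, copySize, chunkCopy, csr);
    }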
@@ -608,4 +608,6 @@ EnableReusingGpuTimestamps = -1
 ForceCopyOperationOffloadForComputeCmdList = -1
 SecondaryContextEngineTypeMask = -1
 DisableSupportForL0Debugger=0
+EnableCopyWithStagingBuffers = -1
+StagingBufferSize = -1
 # Please don't edit below this line
@@ -28,6 +28,7 @@ target_sources(neo_shared_tests PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/wait_util_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/isa_pool_allocator_tests.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/staging_buffer_manager_tests.cpp
 )
 
 add_subdirectories()
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/staging_buffer_manager.h"
+#include "shared/test/common/fixtures/device_fixture.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/mocks/mock_device.h"
+#include "shared/test/common/mocks/mock_svm_manager.h"
+#include "shared/test/common/test_macros/test.h"
+#include "shared/test/common/test_macros/test_checks_shared.h"
+
+#include "gtest/gtest.h"
+
+using namespace NEO;
+
+class StagingBufferManagerFixture : public DeviceFixture {
+  public:
+    void setUp() {
+        DeviceFixture::setUp();
+        REQUIRE_SVM_OR_SKIP(&hardwareInfo);
+        this->svmAllocsManager = std::make_unique<MockSVMAllocsManager>(pDevice->getMemoryManager(), false);
+        debugManager.flags.EnableCopyWithStagingBuffers.set(1);
+        RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
+        std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
+        this->stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);
+        this->csr = pDevice->commandStreamReceivers[0].get();
+    }
+
+    void tearDown() {
+        stagingBufferManager.reset();
+        svmAllocsManager.reset();
+        DeviceFixture::tearDown();
+    }
+
+    void *allocateDeviceBuffer(size_t size) {
+        RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
+        std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
+        SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::deviceUnifiedMemory, 0u, rootDeviceIndices, deviceBitfields);
+        unifiedMemoryProperties.device = pDevice;
+        return svmAllocsManager->createHostUnifiedMemoryAllocation(size, unifiedMemoryProperties);
+    }
+
+    void copyThroughStagingBuffers(size_t copySize, size_t expectedChunks, size_t expectedAllocations) {
+        auto usmBuffer = allocateDeviceBuffer(copySize);
+        auto nonUsmBuffer = new unsigned char[copySize];
+
+        size_t chunkCounter = 0;
+        memset(usmBuffer, 0, copySize);
+        memset(nonUsmBuffer, 0xFF, copySize);
+
+        auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+            chunkCounter++;
+            memcpy(stagingBuffer, chunkSrc, chunkSize);
+            memcpy(chunkDst, stagingBuffer, chunkSize);
+            return 0;
+        };
+        auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
+        auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, copySize, chunkCopy, csr);
+        auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
+
+        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer, copySize));
+        EXPECT_EQ(expectedChunks, chunkCounter);
+        EXPECT_EQ(expectedAllocations, newUsmAllocations);
+        svmAllocsManager->freeSVMAlloc(usmBuffer);
+        delete[] nonUsmBuffer;
+    }
+
+    constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2;
+    DebugManagerStateRestore restorer;
+    std::unique_ptr<MockSVMAllocsManager> svmAllocsManager;
+    std::unique_ptr<StagingBufferManager> stagingBufferManager;
+    CommandStreamReceiver *csr;
+};
+
+using StagingBufferManagerTest = Test<StagingBufferManagerFixture>;
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForCopyThenReturnTrue) {
+    constexpr size_t bufferSize = 1024;
+    auto usmBuffer = allocateDeviceBuffer(bufferSize);
+    unsigned char nonUsmBuffer[bufferSize];
+
+    struct {
+        void *dstPtr;
+        void *srcPtr;
+        bool hasDependencies;
+        bool expectValid;
+    } copyParamsStruct[5]{
+        {usmBuffer, nonUsmBuffer, false, true},     // nonUsm -> usm without dependencies
+        {usmBuffer, nonUsmBuffer, true, false},     // nonUsm -> usm with dependencies
+        {nonUsmBuffer, nonUsmBuffer, false, false}, // nonUsm -> nonUsm without dependencies
+        {usmBuffer, usmBuffer, false, false},       // usm -> usm without dependencies
+        {nonUsmBuffer, usmBuffer, false, false}     // usm -> nonUsm without dependencies
+    };
+    for (auto i = 0; i < 5; i++) {
+        auto actualValid = stagingBufferManager->isValidForCopy(*pDevice, copyParamsStruct[i].dstPtr, copyParamsStruct[i].srcPtr, copyParamsStruct[i].hasDependencies);
+        EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
+    }
+
+    debugManager.flags.EnableCopyWithStagingBuffers.set(0);
+    EXPECT_FALSE(stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false));
+
+    debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
+    auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
+    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForCopy(*pDevice, usmBuffer, nonUsmBuffer, false));
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyThenCopyData) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyWithoutRemainderThenNoRemainderCalled) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 1);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenTaskCountNotReadyThenDontReuseBuffers) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+
+    *csr->getTagAddress() = csr->peekTaskCount();
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenUpdatedTaskCountThenReuseBuffers) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies;
+
+    *csr->getTagAddress() = csr->peekTaskCount();
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 8);
+
+    *csr->getTagAddress() = csr->peekTaskCount() + numOfChunkCopies;
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies, 0);
+    EXPECT_EQ(numOfChunkCopies, svmAllocsManager->svmAllocs.getNumAllocs());
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyReturnWithFailure) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
+    constexpr int expectedErrorCode = 1;
+    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
+    auto nonUsmBuffer = new unsigned char[totalCopySize];
+
+    size_t chunkCounter = 0;
+    memset(usmBuffer, 0, totalCopySize);
+    memset(nonUsmBuffer, 0xFF, totalCopySize);
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+        chunkCounter++;
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        memcpy(chunkDst, stagingBuffer, chunkSize);
+        return expectedErrorCode;
+    };
+    auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
+    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
+    auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
+
+    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer, totalCopySize));
+    EXPECT_EQ(1u, chunkCounter);
+    EXPECT_EQ(1u, newUsmAllocations);
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+    delete[] nonUsmBuffer;
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenReturnWithFailure) {
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = stagingBufferSize * numOfChunkCopies + remainder;
+    constexpr int expectedErrorCode = 1;
+    auto usmBuffer = allocateDeviceBuffer(totalCopySize);
+    auto nonUsmBuffer = new unsigned char[totalCopySize];
+
+    size_t chunkCounter = 0;
+    memset(usmBuffer, 0, totalCopySize);
+    memset(nonUsmBuffer, 0xFF, totalCopySize);
+
+    auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) {
+        chunkCounter++;
+        memcpy(stagingBuffer, chunkSrc, chunkSize);
+        memcpy(chunkDst, stagingBuffer, chunkSize);
+        if (chunkCounter <= numOfChunkCopies) {
+            return 0;
+        } else {
+            return expectedErrorCode;
+        }
+    };
+    auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
+    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
+    auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
+
+    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(numOfChunkCopies + 1, chunkCounter);
+    EXPECT_EQ(1u, newUsmAllocations);
+    svmAllocsManager->freeSVMAlloc(usmBuffer);
+    delete[] nonUsmBuffer;
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenChangedBufferSizeThenPerformCopyWithCorrectNumberOfChunks) {
+    constexpr size_t stagingBufferSize = 512;
+    constexpr size_t numOfChunkCopies = 8;
+    constexpr size_t remainder = 1024;
+    constexpr size_t totalCopySize = MemoryConstants::kiloByte * stagingBufferSize * numOfChunkCopies + remainder;
+    debugManager.flags.StagingBufferSize.set(stagingBufferSize); // 512KB
+
+    RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex};
+    std::map<uint32_t, DeviceBitfield> deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}};
+    stagingBufferManager = std::make_unique<StagingBufferManager>(svmAllocsManager.get(), rootDeviceIndices, deviceBitfields);
+    copyThroughStagingBuffers(totalCopySize, numOfChunkCopies + 1, 1);
+}