diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 0345f16f79..abcf9f58bc 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -2965,7 +2965,7 @@ cl_int CL_API_CALL clEnqueueWriteImage(cl_command_queue commandQueue, TRACING_EXIT(ClEnqueueWriteImage, &retVal); return retVal; } - if (pCommandQueue->isValidForStagingWriteImage(pImage, ptr, numEventsInWaitList > 0)) { + if (pCommandQueue->isValidForStagingTransferImage(pImage, ptr, numEventsInWaitList > 0)) { retVal = pCommandQueue->enqueueStagingWriteImage(pImage, blockingWrite, origin, region, inputRowPitch, inputSlicePitch, ptr, event); } else { retVal = pCommandQueue->enqueueWriteImage( diff --git a/opencl/source/command_queue/CMakeLists.txt b/opencl/source/command_queue/CMakeLists.txt index e63cac5f08..994ad06e4f 100644 --- a/opencl/source/command_queue/CMakeLists.txt +++ b/opencl/source/command_queue/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2018-2024 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -9,6 +9,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/cl_local_work_size.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cl_local_work_size.h ${CMAKE_CURRENT_SOURCE_DIR}/command_queue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_staging.cpp ${CMAKE_CURRENT_SOURCE_DIR}/command_queue.h ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw_base.inl diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 4e25cfbaaf..90988091d6 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -29,7 +29,6 @@ #include "shared/source/os_interface/os_context.h" #include "shared/source/os_interface/product_helper.h" #include "shared/source/utilities/api_intercept.h" -#include "shared/source/utilities/staging_buffer_manager.h" #include "shared/source/utilities/tag_allocator.h" #include "opencl/source/built_ins/builtins_dispatch_builder.h" @@ -1557,146 +1556,4 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() { } } -cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) { - CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size}; - csrSelectionArgs.direction = TransferDirection::hostToLocal; - auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs); - - Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady}; - if (isProfilingEnabled()) { - profilingEvent.setQueueTimeStamp(); - } - - // If there was only one chunk copy, no barrier for OOQ is needed - bool isSingleTransfer = false; - ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) -> int32_t { - auto isFirstTransfer = (chunkDst == dstPtr); - auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size); - isSingleTransfer = isFirstTransfer && isLastTransfer; - - if (isFirstTransfer && isProfilingEnabled()) { - profilingEvent.setSubmitTimeStamp(); - } - memcpy(stagingBuffer, chunkSrc, chunkSize); - if (isSingleTransfer) { - return this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, event, csr); - } - - if (isFirstTransfer && isProfilingEnabled()) { - profilingEvent.setStartTimeStamp(); - } - - cl_event *outEvent = nullptr; - if (isLastTransfer && !this->isOOQEnabled()) { - outEvent = event; - } - auto ret = 
this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent, csr); - return ret; - }; - - auto stagingBufferManager = this->context->getStagingBufferManager(); - auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr); - if (ret != CL_SUCCESS) { - return ret; - } - return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy); -} - -cl_int CommandQueue::enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion, - size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) { - constexpr cl_command_type cmdType = CL_COMMAND_WRITE_IMAGE; - CsrSelectionArgs csrSelectionArgs{cmdType, nullptr, dstImage, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin}; - auto &csr = selectCsrForBuiltinOperation(csrSelectionArgs); - - Event profilingEvent{this, CL_COMMAND_WRITE_IMAGE, CompletionStamp::notReady, CompletionStamp::notReady}; - if (isProfilingEnabled()) { - profilingEvent.setQueueTimeStamp(); - } - - // If there was only one chunk write, no barrier for OOQ is needed - bool isSingleTransfer = false; - ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t { - auto isFirstTransfer = (globalOrigin[1] == origin[1]); - auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]); - isSingleTransfer = isFirstTransfer && isLastTransfer; - - if (isFirstTransfer && isProfilingEnabled()) { - profilingEvent.setSubmitTimeStamp(); - } - memcpy(stagingBuffer, chunkPtr, bufferSize); - if (isSingleTransfer) { - return this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, event, csr); - } - - if (isFirstTransfer && isProfilingEnabled()) { - profilingEvent.setStartTimeStamp(); - } - - cl_event *outEvent = nullptr; - if (isLastTransfer && !this->isOOQEnabled()) { - outEvent = event; - } - auto ret = this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, csr); - return ret; - }; - auto bytesPerPixel = dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; - auto dstRowPitch = inputRowPitch ? 
inputRowPitch : globalRegion[0] * bytesPerPixel; - auto stagingBufferManager = this->context->getStagingBufferManager(); - auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, dstRowPitch, chunkWrite, &csr); - if (ret != CL_SUCCESS) { - return ret; - } - return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy); -} - -cl_int CommandQueue::postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking) { - cl_int ret = CL_SUCCESS; - if (event != nullptr) { - if (!isSingleTransfer && this->isOOQEnabled()) { - ret = this->enqueueBarrierWithWaitList(0, nullptr, event); - } - auto pEvent = castToObjectOrAbort(*event); - if (isProfilingEnabled()) { - pEvent->copyTimestamps(profilingEvent, !isSingleTransfer); - pEvent->setCPUProfilingPath(false); - } - pEvent->setCmdType(profilingEvent.getCommandType()); - } - - if (isBlocking) { - ret = this->finish(); - } - return ret; -} - -bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies) { - GraphicsAllocation *allocation = nullptr; - context->tryGetExistingMapAllocation(srcPtr, size, allocation); - if (allocation != nullptr) { - // Direct transfer from mapped allocation is faster than staging buffer - return false; - } - CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, nullptr}; - csrSelectionArgs.direction = TransferDirection::hostToLocal; - auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs); - auto osContextId = csr->getOsContext().getContextId(); - auto stagingBufferManager = context->getStagingBufferManager(); - UNRECOVERABLE_IF(stagingBufferManager == nullptr); - return stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, size, hasDependencies, osContextId); -} - -bool CommandQueue::isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies) { - auto stagingBufferManager = context->getStagingBufferManager(); - if (!stagingBufferManager) { - return false; - } - switch (image->getImageDesc().image_type) { - case CL_MEM_OBJECT_IMAGE1D: - case CL_MEM_OBJECT_IMAGE2D: - return stagingBufferManager->isValidForStagingWriteImage(this->getDevice(), ptr, hasDependencies); - default: - return false; - } -} - } // namespace NEO diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 298e6998e0..41e7830e98 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -45,6 +45,7 @@ struct BuiltinOpParams; struct CsrSelectionArgs; struct MultiDispatchInfo; struct TimestampPacketDependencies; +struct StagingTransferStatus; enum class QueuePriority { low, @@ -147,6 +148,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> { size_t rowPitch, size_t slicePitch, void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) = 0; + virtual cl_int enqueueReadImageImpl(Image *srcImage, cl_bool blockingRead, const size_t *origin, const size_t *region, + size_t rowPitch, size_t slicePitch, void *ptr, GraphicsAllocation *mapAllocation, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, CommandStreamReceiver &csr) = 0; + virtual cl_int enqueueWriteBuffer(Buffer *buffer, cl_bool blockingWrite, size_t offset, size_t cb, const void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) = 0; @@ 
-396,8 +401,11 @@ class CommandQueue : public BaseObject<_cl_command_queue> { cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event); cl_int enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion, size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event); + cl_int enqueueStagingReadImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion, + size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event); + bool isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies); - bool isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies); + bool isValidForStagingTransferImage(Image *image, const void *ptr, bool hasDependencies); protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); @@ -441,7 +449,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void unregisterGpgpuAndBcsCsrClients(); - cl_int postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking); + cl_int postStagingTransferSync(const StagingTransferStatus &status, cl_event *event, const cl_event profilingEvent, bool isSingleTransfer, bool isBlocking, cl_command_type commandType); + cl_event *assignEventForStaging(cl_event *userEvent, cl_event *profilingEvent, bool isFirstTransfer, bool isLastTransfer) const; Context *context = nullptr; ClDevice *device = nullptr; diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 851c9a9df8..dd9e5c3319 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -269,6 +269,18 @@ class CommandQueueHw : public CommandQueue { const cl_event *eventWaitList, cl_event *event) override; + cl_int enqueueReadImageImpl(Image *srcImage, + cl_bool blockingRead, + const size_t *origin, + const size_t *region, + size_t rowPitch, + size_t slicePitch, + void *ptr, + GraphicsAllocation *mapAllocation, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *event, CommandStreamReceiver &csr) override; + cl_int enqueueWriteBuffer(Buffer *buffer, cl_bool blockingWrite, size_t offset, diff --git a/opencl/source/command_queue/command_queue_staging.cpp b/opencl/source/command_queue/command_queue_staging.cpp new file mode 100644 index 0000000000..1cd9fe3d82 --- /dev/null +++ b/opencl/source/command_queue/command_queue_staging.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_stream/command_stream_receiver.h" +#include "shared/source/device/device.h" +#include "shared/source/os_interface/os_context.h" +#include "shared/source/utilities/staging_buffer_manager.h" + +#include "opencl/source/command_queue/command_queue.h" +#include "opencl/source/command_queue/csr_selection_args.h" +#include "opencl/source/context/context.h" +#include "opencl/source/event/user_event.h" +#include "opencl/source/helpers/base_object.h" +#include "opencl/source/mem_obj/image.h" + +#include "CL/cl_ext.h" + +namespace NEO { + +cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) { + CsrSelectionArgs 
csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size}; + csrSelectionArgs.direction = TransferDirection::hostToLocal; + auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs); + cl_event profilingEvent; + + bool isSingleTransfer = false; + ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) -> int32_t { + auto isFirstTransfer = (chunkDst == dstPtr); + auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size); + isSingleTransfer = isFirstTransfer && isLastTransfer; + cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer); + + return this->enqueueSVMMemcpy(false, chunkDst, chunkSrc, chunkSize, 0, nullptr, outEvent, csr); + }; + + auto stagingBufferManager = this->context->getStagingBufferManager(); + auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr); + return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_SVM_MEMCPY); +} + +cl_int CommandQueue::enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion, + size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) { + CsrSelectionArgs csrSelectionArgs{CL_COMMAND_WRITE_IMAGE, nullptr, dstImage, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin}; + auto &csr = selectCsrForBuiltinOperation(csrSelectionArgs); + cl_event profilingEvent; + + bool isSingleTransfer = false; + ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t { + auto isFirstTransfer = (globalOrigin[1] == origin[1]); + auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]); + isSingleTransfer = isFirstTransfer && isLastTransfer; + cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer); + + return this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, csr); + }; + auto bytesPerPixel = dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + auto dstRowPitch = inputRowPitch ? 
inputRowPitch : globalRegion[0] * bytesPerPixel;
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, dstRowPitch, chunkWrite, &csr, false);
+    return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_WRITE_IMAGE);
+}
+
+cl_int CommandQueue::enqueueStagingReadImage(Image *srcImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
+                                             size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) {
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_READ_IMAGE, srcImage, nullptr, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin};
+    auto &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
+    cl_event profilingEvent;
+
+    bool isSingleTransfer = false;
+    ChunkTransferImageFunc chunkRead = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        auto isFirstTransfer = (globalOrigin[1] == origin[1]);
+        auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]);
+        isSingleTransfer = isFirstTransfer && isLastTransfer;
+        cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer);
+
+        return this->enqueueReadImageImpl(srcImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
+    };
+    auto bytesPerPixel = srcImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
+    auto dstRowPitch = inputRowPitch ? inputRowPitch : globalRegion[0] * bytesPerPixel;
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, dstRowPitch, chunkRead, &csr, true);
+    return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_READ_IMAGE);
+}
+
+/*
+ * If there is a single transfer, use the user event.
+ * Otherwise, the first transfer uses a profiling event to obtain queue/submit/start timestamps.
+ * The last transfer uses the user event in case of an in-order queue (IOQ).
+ * For an out-of-order queue (OOQ), the user event is passed to a barrier that gathers all submitted transfers.
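+ *
+ * Informal decision table (derived from the function below, assuming the caller passed an event):
+ *   single transfer                        -> user event
+ *   first of several, profiling enabled    -> profiling event
+ *   last of several, in-order queue        -> user event
+ *   any other chunk                        -> nullptr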
+ */ +cl_event *CommandQueue::assignEventForStaging(cl_event *userEvent, cl_event *profilingEvent, bool isFirstTransfer, bool isLastTransfer) const { + cl_event *outEvent = nullptr; + if (userEvent != nullptr) { + if (isFirstTransfer && isProfilingEnabled()) { + outEvent = profilingEvent; + } else if (isLastTransfer && !this->isOOQEnabled()) { + outEvent = userEvent; + } + } + if (isFirstTransfer && isLastTransfer) { + outEvent = userEvent; + } + return outEvent; +} + +cl_int CommandQueue::postStagingTransferSync(const StagingTransferStatus &status, cl_event *event, const cl_event profilingEvent, bool isSingleTransfer, bool isBlocking, cl_command_type commandType) { + if (status.waitStatus == WaitStatus::gpuHang) { + return CL_OUT_OF_RESOURCES; + } else if (status.chunkCopyStatus != CL_SUCCESS) { + return status.chunkCopyStatus; + } + + cl_int ret = CL_SUCCESS; + if (event != nullptr) { + if (!isSingleTransfer && this->isOOQEnabled()) { + ret = this->enqueueBarrierWithWaitList(0, nullptr, event); + } + auto pEvent = castToObjectOrAbort(*event); + if (!isSingleTransfer && isProfilingEnabled()) { + auto pProfilingEvent = castToObjectOrAbort(profilingEvent); + pEvent->copyTimestamps(*pProfilingEvent); + pProfilingEvent->release(); + } + pEvent->setCmdType(commandType); + } + + if (isBlocking) { + ret = this->finish(); + } + return ret; +} + +bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies) { + GraphicsAllocation *allocation = nullptr; + context->tryGetExistingMapAllocation(srcPtr, size, allocation); + if (allocation != nullptr) { + // Direct transfer from mapped allocation is faster than staging buffer + return false; + } + CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, nullptr}; + csrSelectionArgs.direction = TransferDirection::hostToLocal; + auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs); + auto osContextId = csr->getOsContext().getContextId(); + auto stagingBufferManager = context->getStagingBufferManager(); + UNRECOVERABLE_IF(stagingBufferManager == nullptr); + return stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, size, hasDependencies, osContextId); +} + +bool CommandQueue::isValidForStagingTransferImage(Image *image, const void *ptr, bool hasDependencies) { + auto stagingBufferManager = context->getStagingBufferManager(); + if (!stagingBufferManager) { + return false; + } + switch (image->getImageDesc().image_type) { + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE2D: + return stagingBufferManager->isValidForStagingTransferImage(this->getDevice(), ptr, hasDependencies); + default: + return false; + } +} + +} // namespace NEO diff --git a/opencl/source/command_queue/enqueue_read_image.h b/opencl/source/command_queue/enqueue_read_image.h index 5c650dffa6..7a08a7e38a 100644 --- a/opencl/source/command_queue/enqueue_read_image.h +++ b/opencl/source/command_queue/enqueue_read_image.h @@ -39,6 +39,25 @@ cl_int CommandQueueHw::enqueueReadImage( CsrSelectionArgs csrSelectionArgs{cmdType, srcImage, {}, device->getRootDeviceIndex(), region, origin, nullptr}; CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs); + return enqueueReadImageImpl(srcImage, blockingRead, origin, region, inputRowPitch, inputSlicePitch, ptr, mapAllocation, numEventsInWaitList, eventWaitList, event, csr); +} + +template +cl_int CommandQueueHw::enqueueReadImageImpl( + Image *srcImage, + cl_bool blockingRead, + const size_t *origin, + const size_t *region, + size_t 
inputRowPitch, + size_t inputSlicePitch, + void *ptr, + GraphicsAllocation *mapAllocation, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *event, CommandStreamReceiver &csr) { + constexpr cl_command_type cmdType = CL_COMMAND_READ_IMAGE; + + CsrSelectionArgs csrSelectionArgs{cmdType, srcImage, {}, device->getRootDeviceIndex(), region, origin, nullptr}; if (nullptr == mapAllocation) { notifyEnqueueReadImage(srcImage, static_cast(blockingRead), EngineHelpers::isBcs(csr.getOsContext().getEngineType())); diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index e87aa2f498..d0da19f149 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -397,10 +397,6 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con auto &device = this->cmdQueue->getDevice(); auto &gfxCoreHelper = device.getGfxCoreHelper(); auto resolution = device.getDeviceInfo().profilingTimerResolution; - if (isAdjustmentNeeded) { - // Adjust startTS since we calculate profiling based on other event timestamps - contextStartTS = startTimeStamp.gpuTimeStamp; - } // Calculate startTimestamp only if it was not already set on CPU if (startTimeStamp.cpuTimeInNs == 0) { @@ -1046,4 +1042,20 @@ TaskCountType Event::peekTaskLevel() const { return taskLevel; } +void Event::copyTimestamps(Event &srcEvent) { + if (timestampPacketContainer) { + this->addTimestampPacketNodes(*srcEvent.getTimestampPacketNodes()); + } else { + if (this->timeStampNode != nullptr) { + this->timeStampNode->returnTag(); + } + this->timeStampNode = srcEvent.timeStampNode; + srcEvent.timeStampNode = nullptr; + } + this->queueTimeStamp = srcEvent.queueTimeStamp; + this->submitTimeStamp = srcEvent.submitTimeStamp; + this->startTimeStamp = srcEvent.startTimeStamp; + this->endTimeStamp = srcEvent.endTimeStamp; +} + } // namespace NEO diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index ed11bb6a53..fe9c6cc257 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -312,13 +312,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS); - void copyTimestamps(const Event &srcEvent, bool isAdjustmentNeeded) { - this->queueTimeStamp = srcEvent.queueTimeStamp; - this->submitTimeStamp = srcEvent.submitTimeStamp; - this->startTimeStamp = srcEvent.startTimeStamp; - this->endTimeStamp = srcEvent.endTimeStamp; - this->isAdjustmentNeeded = isAdjustmentNeeded; - } + void copyTimestamps(Event &srcEvent); protected: Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType, @@ -391,7 +385,6 @@ class Event : public BaseObject<_cl_event>, public IDNode { bool profilingEnabled = false; bool profilingCpuPath = false; bool dataCalculated = false; - bool isAdjustmentNeeded = false; ProfilingInfo queueTimeStamp{}; ProfilingInfo submitTimeStamp{}; diff --git a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp index fa4eaff2ab..fe39b75c4c 100644 --- a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp @@ -1097,4 +1097,136 @@ HWTEST_F(EnqueueReadImageTest, whenEnqueueReadImageWithUsmPtrThenDontImportAlloc auto &csr = pDevice->getUltCommandStreamReceiver(); EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled); 
svmManager->freeSVMAlloc(usmPtr); +} + +struct ReadImageStagingBufferTest : public EnqueueReadImageTest { + void SetUp() override { + REQUIRE_SVM_OR_SKIP(defaultHwInfo); + EnqueueReadImageTest::SetUp(); + ptr = new unsigned char[readSize]; + device.reset(new MockClDevice{MockClDevice::createWithNewExecutionEnvironment(nullptr)}); + } + + void TearDown() override { + if (defaultHwInfo->capabilityTable.ftrSvm == false) { + return; + } + delete[] ptr; + EnqueueReadImageTest::TearDown(); + } + + static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2; + static constexpr size_t readSize = stagingBufferSize * 4; + unsigned char *ptr; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {4, 8, 1}; + std::unique_ptr device; + cl_queue_properties props = {}; +}; + +HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledThenReturnSuccess) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_SUCCESS); + EXPECT_EQ(4ul, mockCommandQueueHw.enqueueReadImageCounter); + auto &csr = pDevice->getUltCommandStreamReceiver(); + EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled); +} + +HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledWithoutRowPitchThenReturnSuccess) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + region[0] = MemoryConstants::megaByte / srcImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, 0u, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_SUCCESS); + EXPECT_EQ(4ul, mockCommandQueueHw.enqueueReadImageCounter); + auto &csr = pDevice->getUltCommandStreamReceiver(); + EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled); +} + +HWTEST_F(ReadImageStagingBufferTest, whenBlockingEnqueueStagingReadImageCalledThenFinishCalled) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, true, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_SUCCESS); + EXPECT_EQ(1u, mockCommandQueueHw.finishCalledCount); +} + +HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledWithEventThenReturnValidEvent) { + constexpr cl_command_type expectedLastCmd = CL_COMMAND_READ_IMAGE; + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + cl_event event; + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType); + EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(ReadImageStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingReadImageCalledWithEventThenReturnValidEvent) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.setOoqEnabled(); + cl_event event; + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_EQ(static_cast(CL_COMMAND_BARRIER), 
mockCommandQueueHw.lastCommandType); + EXPECT_EQ(static_cast(CL_COMMAND_READ_IMAGE), pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(ReadImageStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingReadImageCalledWithSingleTransferThenNoBarrierEnqueued) { + constexpr cl_command_type expectedLastCmd = CL_COMMAND_READ_IMAGE; + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.setOoqEnabled(); + cl_event event; + region[1] = 1; + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType); + EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(ReadImageStagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingReadImageThenTimestampsSetCorrectly) { + cl_event event; + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.setProfilingEnabled(); + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_FALSE(pEvent->isCPUProfilingPath()); + EXPECT_TRUE(pEvent->isProfilingEnabled()); + + clReleaseEvent(event); +} + +HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageFailedThenPropagateErrorCode) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.enqueueReadImageCallBase = false; + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_INVALID_OPERATION); + EXPECT_EQ(1ul, mockCommandQueueHw.enqueueReadImageCounter); +} + +HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledWithGpuHangThenReturnOutOfResources) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + CsrSelectionArgs csrSelectionArgs{CL_COMMAND_READ_IMAGE, srcImage, nullptr, pDevice->getRootDeviceIndex(), region, nullptr, origin}; + auto ultCsr = reinterpret_cast *>(&mockCommandQueueHw.selectCsrForBuiltinOperation(csrSelectionArgs)); + ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang; + auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_OUT_OF_RESOURCES); + EXPECT_EQ(2ul, mockCommandQueueHw.enqueueReadImageCounter); } \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp index 45d259e64d..d87550987b 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp @@ -801,7 +801,7 @@ HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageWithUsmPtrAndSizeLowerThanR svmManager->freeSVMAlloc(usmPtr); } -HWTEST_F(EnqueueWriteImageTest, whenIsValidForStagingWriteImageCalledThenReturnCorrectValue) { +HWTEST_F(EnqueueWriteImageTest, whenIsValidForStagingTransferImageCalledThenReturnCorrectValue) { bool svmSupported = pDevice->getHardwareInfo().capabilityTable.ftrSvm; if (!svmSupported) { GTEST_SKIP(); @@ -810,13 +810,13 @@ HWTEST_F(EnqueueWriteImageTest, 
whenIsValidForStagingWriteImageCalledThenReturnC unsigned char ptr[16]; std::unique_ptr image(Image1dHelper<>::create(context)); - EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false)); + EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransferImage(image.get(), ptr, false)); image.reset(Image2dHelper<>::create(context)); - EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false)); + EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransferImage(image.get(), ptr, false)); image.reset(Image3dHelper<>::create(context)); - EXPECT_FALSE(pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false)); + EXPECT_FALSE(pCmdQ->isValidForStagingTransferImage(image.get(), ptr, false)); } struct WriteImageStagingBufferTest : public EnqueueWriteImageTest { @@ -854,6 +854,17 @@ HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledThenRetu EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled); } +HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledWithoutRowPitchThenReturnSuccess) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + region[0] = MemoryConstants::megaByte / dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, 0u, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_SUCCESS); + EXPECT_EQ(4ul, mockCommandQueueHw.enqueueWriteImageCounter); + auto &csr = pDevice->getUltCommandStreamReceiver(); + EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled); +} + HWTEST_F(WriteImageStagingBufferTest, whenBlockingEnqueueStagingWriteImageCalledThenFinishCalled) { MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, true, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index f9f5c53570..677090414b 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -188,6 +188,11 @@ class MockCommandQueue : public CommandQueue { GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override { return CL_SUCCESS; } + cl_int enqueueReadImageImpl(Image *srcImage, cl_bool blockingRead, const size_t *origin, const size_t *region, + size_t rowPitch, size_t slicePitch, void *ptr, + GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, + const cl_event *eventWaitList, cl_event *event, CommandStreamReceiver &csr) override { return CL_SUCCESS; } + cl_int enqueueWriteImage(Image *dstImage, cl_bool blockingWrite, const size_t *origin, const size_t *region, size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, @@ -379,6 +384,34 @@ class MockCommandQueueHw : public CommandQueueHw { } return CL_INVALID_OPERATION; } + cl_int enqueueReadImageImpl(Image *srcImage, + cl_bool blockingRead, + const size_t *origin, + const size_t *region, + size_t rowPitch, + size_t slicePitch, + void *ptr, + GraphicsAllocation *mapAllocation, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + cl_event *event, CommandStreamReceiver &csr) override { + enqueueReadImageCounter++; + if 
(enqueueReadImageCallBase) { + return BaseClass::enqueueReadImageImpl(srcImage, + blockingRead, + origin, + region, + rowPitch, + slicePitch, + ptr, + mapAllocation, + numEventsInWaitList, + eventWaitList, + event, + csr); + } + return CL_INVALID_OPERATION; + } void *cpuDataTransferHandler(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &retVal) override { cpuDataTransferHandlerCalled = true; return BaseClass::cpuDataTransferHandler(transferProperties, eventsRequest, retVal); @@ -493,6 +526,8 @@ class MockCommandQueueHw : public CommandQueueHw { MultiDispatchInfo storedMultiDispatchInfo; size_t enqueueWriteImageCounter = 0; bool enqueueWriteImageCallBase = true; + size_t enqueueReadImageCounter = 0; + bool enqueueReadImageCallBase = true; size_t enqueueWriteBufferCounter = 0; size_t requestedCmdStreamSize = 0; bool blockingWriteBuffer = false; diff --git a/shared/source/utilities/staging_buffer_manager.cpp b/shared/source/utilities/staging_buffer_manager.cpp index 734653c3fc..864facd8b9 100644 --- a/shared/source/utilities/staging_buffer_manager.cpp +++ b/shared/source/utilities/staging_buffer_manager.cpp @@ -13,7 +13,6 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/utilities/heap_allocator.h" - namespace NEO { StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) { @@ -24,6 +23,14 @@ StagingBuffer::StagingBuffer(StagingBuffer &&other) : baseAddress(other.baseAddr this->allocator.reset(other.allocator.release()); } +bool StagingBufferTracker::isReady() const { + return csr->testTaskCountReady(csr->getTagAddress(), taskCountToWait); +} + +void StagingBufferTracker::freeChunk() const { + allocator->free(chunkAddress, size); +} + StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) { if (debugManager.flags.StagingBufferSize.get() != -1) { chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte; @@ -37,22 +44,45 @@ StagingBufferManager::~StagingBufferManager() { } /* - * This method performs 4 steps for single chunk transfer - * 1. Get existing chunk of staging buffer, if can't - allocate new one, - * 2. Perform actual transfer, - * 3. Store used buffer to tracking container (with current task count) - * 4. Update tag if required to reuse this buffer in next chunk copies + * This method performs single chunk transfer. If transfer is a read operation, it will fetch oldest staging + * buffer from the queue, otherwise it allocates or reuses buffer from the pool. + * After transfer is submitted to GPU, it stores used buffer to either queue in case of reads, + * or tracking container for further reusage. */ template -int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &func, Args... args) { - auto allocatedSize = size; - auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize); - auto ret = func(addrToPtr(stagingBuffer), size, args...); - trackChunk({allocator, stagingBuffer, allocatedSize, csr, csr->peekTaskCount()}); +StagingTransferStatus StagingBufferManager::performChunkTransfer(bool isRead, void *userPtr, size_t size, StagingQueue ¤tStagingBuffers, CommandStreamReceiver *csr, Func &func, Args... 
args) { + StagingTransferStatus result{}; + StagingBufferTracker tracker{}; + if (currentStagingBuffers.size() > 1) { + if (fetchHead(currentStagingBuffers, tracker) == WaitStatus::gpuHang) { + result.waitStatus = WaitStatus::gpuHang; + return result; + } + } else { + auto allocatedSize = size; + auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize); + tracker = StagingBufferTracker{allocator, stagingBuffer, allocatedSize, csr}; + } + + auto stagingBuffer = addrToPtr(tracker.chunkAddress); + if (!isRead) { + memcpy(stagingBuffer, userPtr, size); + } + + result.chunkCopyStatus = func(stagingBuffer, args...); + + tracker.taskCountToWait = csr->peekTaskCount(); + if (isRead) { + UserDstData dstData{userPtr, size}; + currentStagingBuffers.push({dstData, tracker}); + } else { + trackChunk(tracker); + } + if (csr->isAnyDirectSubmissionEnabled()) { csr->flushTagUpdate(); } - return ret; + return result; } /* @@ -60,38 +90,40 @@ int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, s * Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU. * Caller provides actual function to transfer data for single chunk. */ -int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) { +StagingTransferStatus StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) { + StagingQueue stagingQueue; auto copiesNum = size / chunkSize; auto remainder = size % chunkSize; - + StagingTransferStatus result{}; for (auto i = 0u; i < copiesNum; i++) { auto chunkDst = ptrOffset(dstPtr, i * chunkSize); auto chunkSrc = ptrOffset(srcPtr, i * chunkSize); - auto ret = performChunkTransfer(csr, chunkSize, chunkCopyFunc, chunkDst, chunkSrc); - if (ret) { - return ret; + result = performChunkTransfer(false, const_cast(chunkSrc), chunkSize, stagingQueue, csr, chunkCopyFunc, chunkDst, chunkSize); + if (result.chunkCopyStatus != 0) { + return result; } } if (remainder != 0) { auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize); auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize); - auto ret = performChunkTransfer(csr, remainder, chunkCopyFunc, chunkDst, chunkSrc); - if (ret) { - return ret; + auto result = performChunkTransfer(false, const_cast(chunkSrc), remainder, stagingQueue, csr, chunkCopyFunc, chunkDst, remainder); + if (result.chunkCopyStatus != 0) { + return result; } } - return 0; + return result; } /* - * This method orchestrates write operation for images with given origin and region. + * This method orchestrates transfer operation for images with given origin and region. * Transfer is splitted into chunks, each chunk represents sub-region to transfer. * Each chunk contains staging buffer which should be used instead of non-usm memory during transfers on GPU. * Several rows are packed into single chunk unless size of single row exceeds maximum chunk size (2MB). - * Caller provides actual function to enqueue write operation for single chunk. + * Caller provides actual function to enqueue read/write operation for single chunk. 
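+ *
+ * Illustrative call shape (a sketch using names from this change; enqueueChunkOnGpu is a hypothetical per-chunk enqueue):
+ *   ChunkTransferImageFunc chunkFunc = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+ *       return enqueueChunkOnGpu(stagingBuffer, origin, region);
+ *   };
+ *   auto status = stagingBufferManager->performImageTransfer(hostPtr, globalOrigin, globalRegion, rowPitch, chunkFunc, csr, isRead);
+ *   // status.chunkCopyStatus holds the per-chunk enqueue result; status.waitStatus reports a possible GPU hang while draining reads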
*/ -int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr) { +StagingTransferStatus StagingBufferManager::performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead) { + StagingQueue stagingQueue; size_t origin[3] = {}; size_t region[3] = {}; origin[0] = globalOrigin[0]; @@ -102,15 +134,16 @@ int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *g rowsPerChunk = std::min(rowsPerChunk, globalRegion[1]); auto numOfChunks = globalRegion[1] / rowsPerChunk; auto remainder = globalRegion[1] % (rowsPerChunk * numOfChunks); + StagingTransferStatus result{}; for (auto i = 0u; i < numOfChunks; i++) { origin[1] = globalOrigin[1] + i * rowsPerChunk; region[1] = rowsPerChunk; auto size = region[1] * rowPitch; auto chunkPtr = ptrOffset(ptr, i * rowsPerChunk * rowPitch); - auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region); - if (ret) { - return ret; + result = performChunkTransfer(isRead, const_cast(chunkPtr), size, stagingQueue, csr, chunkTransferImageFunc, origin, region); + if (result.chunkCopyStatus != 0 || result.waitStatus == WaitStatus::gpuHang) { + return result; } } @@ -119,12 +152,50 @@ int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *g region[1] = remainder; auto size = region[1] * rowPitch; auto chunkPtr = ptrOffset(ptr, numOfChunks * rowsPerChunk * rowPitch); - auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region); - if (ret) { - return ret; + result = performChunkTransfer(isRead, const_cast(chunkPtr), size, stagingQueue, csr, chunkTransferImageFunc, origin, region); + if (result.chunkCopyStatus != 0 || result.waitStatus == WaitStatus::gpuHang) { + return result; } } - return 0; + + result.waitStatus = drainAndReleaseStagingQueue(stagingQueue); + return result; +} + +/* + * This method is used for read transfers. It waits for oldest transfer to finish + * and copies data associated with that transfer to host allocation. + * Returned tracker contains staging buffer ready for reuse. + */ +WaitStatus StagingBufferManager::fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const { + auto &head = stagingQueue.front(); + auto status = head.second.csr->waitForTaskCount(head.second.taskCountToWait); + if (status == WaitStatus::gpuHang) { + return status; + } + + auto &userData = head.first; + tracker = head.second; + auto stagingBuffer = addrToPtr(tracker.chunkAddress); + memcpy(userData.ptr, stagingBuffer, userData.size); + stagingQueue.pop(); + return WaitStatus::ready; +} + +/* + * Waits for all pending transfers to finish. + * Releases staging buffers back to pool for reuse. 
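+ *
+ * Usage sketch (illustration only): once the last chunked read has been submitted,
+ *   if (drainAndReleaseStagingQueue(stagingQueue) == WaitStatus::gpuHang) {
+ *       // propagate the hang to the caller via StagingTransferStatus::waitStatus
+ *   }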
+ */ +WaitStatus StagingBufferManager::drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const { + StagingBufferTracker tracker{}; + while (!stagingQueue.empty()) { + auto status = fetchHead(stagingQueue, tracker); + if (status == WaitStatus::gpuHang) { + return status; + } + tracker.freeChunk(); + } + return WaitStatus::ready; } /* @@ -196,7 +267,7 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize); } -bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const { +bool StagingBufferManager::isValidForStagingTransferImage(const Device &device, const void *ptr, bool hasDependencies) const { auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled(); if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) { stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get(); @@ -207,9 +278,8 @@ bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, con void StagingBufferManager::clearTrackedChunks() { for (auto iterator = trackers.begin(); iterator != trackers.end();) { - auto csr = iterator->csr; - if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) { - iterator->allocator->free(iterator->chunkAddress, iterator->size); + if (iterator->isReady()) { + iterator->freeChunk(); iterator = trackers.erase(iterator); } else { break; diff --git a/shared/source/utilities/staging_buffer_manager.h b/shared/source/utilities/staging_buffer_manager.h index 9fb22249b6..de8e29e265 100644 --- a/shared/source/utilities/staging_buffer_manager.h +++ b/shared/source/utilities/staging_buffer_manager.h @@ -7,6 +7,7 @@ #pragma once +#include "shared/source/command_stream/wait_status.h" #include "shared/source/helpers/constants.h" #include "shared/source/utilities/stackvec.h" @@ -14,6 +15,7 @@ #include #include #include +#include namespace NEO { class SVMAllocsManager; @@ -21,8 +23,8 @@ class CommandStreamReceiver; class Device; class HeapAllocator; -using ChunkCopyFunction = std::function; -using ChunkWriteImageFunc = std::function; +using ChunkCopyFunction = std::function; +using ChunkTransferImageFunc = std::function; class StagingBuffer { public: @@ -50,8 +52,23 @@ struct StagingBufferTracker { size_t size = 0; CommandStreamReceiver *csr = nullptr; uint64_t taskCountToWait = 0; + + bool isReady() const; + void freeChunk() const; }; +struct UserDstData { + void *ptr; + size_t size; +}; + +struct StagingTransferStatus { + int32_t chunkCopyStatus = 0; // status from L0/OCL chunk copy + WaitStatus waitStatus = WaitStatus::ready; +}; + +using StagingQueue = std::queue>; + class StagingBufferManager { public: StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map &deviceBitfields); @@ -62,10 +79,10 @@ class StagingBufferManager { StagingBufferManager &operator=(const StagingBufferManager &other) = delete; bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const; - bool isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const; + bool isValidForStagingTransferImage(const Device &device, const void *ptr, bool hasDependencies) const; - int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, 
CommandStreamReceiver *csr); - int32_t performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr); + StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr); + StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead); std::pair requestStagingBuffer(size_t &size); void trackChunk(const StagingBufferTracker &tracker); @@ -76,7 +93,10 @@ class StagingBufferManager { void clearTrackedChunks(); template - int32_t performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &chunkCopyFunc, Args... args); + StagingTransferStatus performChunkTransfer(bool isRead, void *userPtr, size_t size, StagingQueue ¤tStagingBuffers, CommandStreamReceiver *csr, Func &func, Args... args); + + WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const; + WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const; size_t chunkSize = MemoryConstants::pageSize2M; std::mutex mtx; diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 0ca829ff89..bfc7402c92 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -323,6 +323,13 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle); } + WaitStatus waitForTaskCount(TaskCountType requiredTaskCount) override { + if (waitForTaskCountReturnValue.has_value()) { + return *waitForTaskCountReturnValue; + } + return BaseClass::waitForTaskCount(requiredTaskCount); + } + void overrideCsrSizeReqFlags(CsrSizeRequestFlags &flags) { this->csrSizeRequestFlags = flags; } GraphicsAllocation *getPreemptionAllocation() const { return this->preemptionAllocation; } @@ -585,6 +592,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ uint32_t createAllocationForHostSurfaceCalled = 0; WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::ready; std::optional waitForTaskCountWithKmdNotifyFallbackReturnValue{}; + std::optional waitForTaskCountReturnValue{}; std::optional flushReturnValue{}; CommandStreamReceiverType commandStreamReceiverType = CommandStreamReceiverType::hardware; std::atomic downloadAllocationsCalledCount = 0; diff --git a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp index d938234e7a..1849485c75 100644 --- a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp +++ b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp @@ -55,10 +55,9 @@ class StagingBufferManagerFixture : public DeviceFixture { memset(usmBuffer, 0, copySize); memset(nonUsmBuffer, 0xFF, copySize); - ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) { + ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) { chunkCounter++; - memcpy(stagingBuffer, chunkSrc, chunkSize); - memcpy(chunkDst, stagingBuffer, chunkSize); + memcpy(chunkDst, chunkSrc, chunkSize); reinterpret_cast(csr)->taskCount++; 
             return 0;
         };
@@ -66,7 +65,8 @@ class StagingBufferManagerFixture : public DeviceFixture {
 
         auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, copySize, chunkCopy, csr);
         auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
-        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, ret.chunkCopyStatus);
+        EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
         EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer, copySize));
         EXPECT_EQ(expectedChunks, chunkCounter);
         EXPECT_EQ(expectedAllocations, newUsmAllocations);
@@ -74,17 +74,23 @@ class StagingBufferManagerFixture : public DeviceFixture {
         delete[] nonUsmBuffer;
     }
 
-    void imageWriteThroughStagingBuffers(size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) {
-        auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
-
+    void imageTransferThroughStagingBuffers(bool isRead, size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) {
+        auto hostPtr = new unsigned char[stagingBufferSize * expectedChunks];
+        auto imageData = new unsigned char[stagingBufferSize * expectedChunks];
+        if (isRead) {
+            memset(hostPtr, 0, stagingBufferSize * expectedChunks);
+            memset(imageData, 0xFF, stagingBufferSize * expectedChunks);
+        } else {
+            memset(hostPtr, 0xFF, stagingBufferSize * expectedChunks);
+            memset(imageData, 0, stagingBufferSize * expectedChunks);
+        }
         size_t chunkCounter = 0;
         size_t expectedOrigin = globalOrigin[1];
         auto expectedRowsPerChunk = std::min(std::max(1ul, stagingBufferSize / rowPitch), globalRegion[1]);
         auto numOfChunks = globalRegion[1] / expectedRowsPerChunk;
         auto remainder = globalRegion[1] % (expectedRowsPerChunk * numOfChunks);
-        ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+        ChunkTransferImageFunc chunkTransfer = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
             EXPECT_NE(nullptr, stagingBuffer);
-            EXPECT_NE(nullptr, chunkPtr);
             EXPECT_NE(nullptr, origin);
             EXPECT_NE(nullptr, region);
 
@@ -97,19 +103,33 @@ class StagingBufferManagerFixture : public DeviceFixture {
             } else {
                 EXPECT_EQ(expectedRowsPerChunk, region[1]);
             }
+            auto offset = origin[1] - globalOrigin[1];
+            if (isRead) {
+                memcpy(stagingBuffer, imageData + rowPitch * offset, rowPitch * region[1]);
+            } else {
+                memcpy(imageData + rowPitch * offset, stagingBuffer, rowPitch * region[1]);
+            }
             expectedOrigin += region[1];
             chunkCounter++;
             reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
             return 0;
         };
 
         auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
-        auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, rowPitch, chunkWrite, csr);
+        auto ret = stagingBufferManager->performImageTransfer(hostPtr, globalOrigin, globalRegion, rowPitch, chunkTransfer, csr, isRead);
         auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
-        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, memcmp(hostPtr, imageData, rowPitch * (numOfChunks * expectedRowsPerChunk + remainder)));
+        EXPECT_EQ(0, ret.chunkCopyStatus);
+        EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
         EXPECT_EQ(expectedChunks, chunkCounter);
-        EXPECT_EQ(1u, newUsmAllocations);
-        delete[] ptr;
+
+        auto expectedNewUsmAllocations = 1u;
+        if (isRead) {
+            expectedNewUsmAllocations = 2u;
+        }
+        EXPECT_EQ(expectedNewUsmAllocations, newUsmAllocations);
+        delete[] hostPtr;
+        delete[] imageData;
     }
 
     constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2;
@@ -178,16 +198,16 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForImageWrite
         {nonUsmBuffer, true, false},
     };
     for (auto i = 0; i < 4; i++) {
-        auto actualValid = stagingBufferManager->isValidForStagingWriteImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
+        auto actualValid = stagingBufferManager->isValidForStagingTransferImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
         EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
     }
 
     debugManager.flags.EnableCopyWithStagingBuffers.set(0);
-    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false));
+    EXPECT_FALSE(stagingBufferManager->isValidForStagingTransferImage(*pDevice, nonUsmBuffer, false));
 
     debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
     auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
-    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false));
+    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingTransferImage(*pDevice, nonUsmBuffer, false));
     svmAllocsManager->freeSVMAlloc(usmBuffer);
 }
 
@@ -256,17 +276,17 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyR
     memset(usmBuffer, 0, totalCopySize);
     memset(nonUsmBuffer, 0xFF, totalCopySize);
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         chunkCounter++;
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        memcpy(chunkDst, stagingBuffer, chunkSize);
+        memcpy(chunkDst, chunkSrc, chunkSize);
         return expectedErrorCode;
     };
 
     auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
    auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
     auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
-    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer, totalCopySize));
     EXPECT_EQ(1u, chunkCounter);
     EXPECT_EQ(1u, newUsmAllocations);
@@ -286,10 +306,9 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenRe
     memset(usmBuffer, 0, totalCopySize);
     memset(nonUsmBuffer, 0xFF, totalCopySize);
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         chunkCounter++;
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        memcpy(chunkDst, stagingBuffer, chunkSize);
+        memcpy(chunkDst, chunkSrc, chunkSize);
         if (chunkCounter <= numOfChunkCopies) {
             return 0;
         } else {
@@ -300,7 +319,8 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenRe
 
     auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
     auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
-    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(numOfChunkCopies + 1, chunkCounter);
     EXPECT_EQ(1u, newUsmAllocations);
     svmAllocsManager->freeSVMAlloc(usmBuffer);
@@ -331,7 +351,7 @@ HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabled
     auto nonUsmBuffer = new unsigned char[totalCopySize];
     size_t flushTagsCalled = 0;
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         if (ultCsr->flushTagUpdateCalled) {
             flushTagsCalled++;
             ultCsr->flushTagUpdateCalled = false;
@@ -362,28 +382,121 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteThenWhol
     size_t expectedChunks = 8;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, expectedChunks, 1};
-    imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithOriginThenWholeRegionCovered) {
     size_t expectedChunks = 8;
     const size_t globalOrigin[3] = {4, 4, 0};
     const size_t globalRegion[3] = {4, expectedChunks, 1};
-    imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithMultipleRowsPerChunkThenWholeRegionCovered) {
     size_t expectedChunks = 4;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, 8, 1};
-    imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithRemainderThenWholeRegionCovered) {
     size_t expectedChunks = 4;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, 7, 1};
-    imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadThenWholeRegionCovered) {
+    size_t expectedChunks = 8;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, expectedChunks, 1};
+    imageTransferThroughStagingBuffers(true, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithOriginThenWholeRegionCovered) {
+    size_t expectedChunks = 8;
+    const size_t globalOrigin[3] = {4, 4, 0};
+    const size_t globalRegion[3] = {4, expectedChunks, 1};
+    imageTransferThroughStagingBuffers(true, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithMultipleRowsPerChunkThenWholeRegionCovered) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    imageTransferThroughStagingBuffers(true, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithRemainderThenWholeRegionCovered) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 7, 1};
+    imageTransferThroughStagingBuffers(true, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangDuringChunkReadFromImageThenReturnImmediatelyWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    size_t chunkCounter = 0;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        return 0;
+    };
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(2u, chunkCounter);
+    delete[] ptr;
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangAfterChunkReadFromImageThenReturnWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    size_t chunkCounter = 0;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        if (chunkCounter == expectedChunks) {
+            ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+        }
+        return 0;
+    };
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(4u, chunkCounter);
+    delete[] ptr;
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangDuringRemainderChunkReadFromImageThenReturnImmediatelyWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 7, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    size_t chunkCounter = 0;
+    size_t remainderCounter = 4;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        if (chunkCounter == remainderCounter - 1) {
+            ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+        }
+        return 0;
+    };
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(remainderCounter - 1, chunkCounter);
+    delete[] ptr;
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThenEarlyReturnWithFailure) {
@@ -394,13 +507,13 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThen
     auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
 
     size_t chunkCounter = 0;
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
         ++chunkCounter;
         return expectedErrorCode;
     };
-
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr);
-    EXPECT_EQ(expectedErrorCode, ret);
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, false);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(1u, chunkCounter);
     delete[] ptr;
 }
@@ -414,16 +527,16 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteWith
     size_t chunkCounter = 0;
     size_t remainderCounter = 4;
 
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
        ++chunkCounter;
         if (chunkCounter == remainderCounter) {
             return expectedErrorCode;
         }
         return 0;
     };
-
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr);
-    EXPECT_EQ(expectedErrorCode, ret);
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, false);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(remainderCounter, chunkCounter);
     delete[] ptr;
 }
\ No newline at end of file