Mirror of https://github.com/intel/compute-runtime.git (synced 2025-12-21 09:14:47 +08:00)

performance: introduce staging reads from image

Related-To: NEO-12968
Signed-off-by: Szymon Morek <szymon.morek@intel.com>

Committed by: Compute-Runtime-Automation
Parent: f2725f217e
Commit: 6c4eb322b1
@@ -2965,7 +2965,7 @@ cl_int CL_API_CALL clEnqueueWriteImage(cl_command_queue commandQueue,
         TRACING_EXIT(ClEnqueueWriteImage, &retVal);
         return retVal;
     }
-    if (pCommandQueue->isValidForStagingWriteImage(pImage, ptr, numEventsInWaitList > 0)) {
+    if (pCommandQueue->isValidForStagingTransferImage(pImage, ptr, numEventsInWaitList > 0)) {
        retVal = pCommandQueue->enqueueStagingWriteImage(pImage, blockingWrite, origin, region, inputRowPitch, inputSlicePitch, ptr, event);
     } else {
         retVal = pCommandQueue->enqueueWriteImage(
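For context, a host-side call that can take this staging path looks like the sketch below. It is not part of the patch: it assumes a 2D image, a plain heap pointer (not a mapped or USM allocation), and an empty wait list, which is what the isValidForStagingTransferImage check above requires.

// Hypothetical host code; only standard OpenCL API calls are used.
#include <CL/cl.h>
#include <vector>

void writeImageViaStaging(cl_command_queue queue, cl_mem image,
                          size_t width, size_t height, size_t bytesPerPixel) {
    std::vector<unsigned char> pixels(width * height * bytesPerPixel);
    const size_t origin[3] = {0, 0, 0};
    const size_t region[3] = {width, height, 1};
    // Non-blocking write, pitches derived by the runtime, no wait list:
    // with this commit the runtime may split the transfer into staged chunks.
    clEnqueueWriteImage(queue, image, CL_FALSE, origin, region,
                        0, 0, pixels.data(), 0, nullptr, nullptr);
    clFinish(queue); // host data must stay valid until the copy completes
}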
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2023 Intel Corporation
+# Copyright (C) 2018-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -9,6 +9,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
     ${CMAKE_CURRENT_SOURCE_DIR}/cl_local_work_size.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/cl_local_work_size.h
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_staging.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue.h
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h
     ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw_base.inl
@@ -29,7 +29,6 @@
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/os_interface/product_helper.h"
 #include "shared/source/utilities/api_intercept.h"
-#include "shared/source/utilities/staging_buffer_manager.h"
 #include "shared/source/utilities/tag_allocator.h"
 
 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
@@ -1557,146 +1556,4 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() {
     }
 }
 
-cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
-    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
-    csrSelectionArgs.direction = TransferDirection::hostToLocal;
-    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
-
-    Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady};
-    if (isProfilingEnabled()) {
-        profilingEvent.setQueueTimeStamp();
-    }
-
-    // If there was only one chunk copy, no barrier for OOQ is needed
-    bool isSingleTransfer = false;
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) -> int32_t {
-        auto isFirstTransfer = (chunkDst == dstPtr);
-        auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
-        isSingleTransfer = isFirstTransfer && isLastTransfer;
-
-        if (isFirstTransfer && isProfilingEnabled()) {
-            profilingEvent.setSubmitTimeStamp();
-        }
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        if (isSingleTransfer) {
-            return this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, event, csr);
-        }
-
-        if (isFirstTransfer && isProfilingEnabled()) {
-            profilingEvent.setStartTimeStamp();
-        }
-
-        cl_event *outEvent = nullptr;
-        if (isLastTransfer && !this->isOOQEnabled()) {
-            outEvent = event;
-        }
-        auto ret = this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent, csr);
-        return ret;
-    };
-
-    auto stagingBufferManager = this->context->getStagingBufferManager();
-    auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr);
-    if (ret != CL_SUCCESS) {
-        return ret;
-    }
-    return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy);
-}
-
-cl_int CommandQueue::enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
-                                              size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) {
-    constexpr cl_command_type cmdType = CL_COMMAND_WRITE_IMAGE;
-    CsrSelectionArgs csrSelectionArgs{cmdType, nullptr, dstImage, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin};
-    auto &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
-
-    Event profilingEvent{this, CL_COMMAND_WRITE_IMAGE, CompletionStamp::notReady, CompletionStamp::notReady};
-    if (isProfilingEnabled()) {
-        profilingEvent.setQueueTimeStamp();
-    }
-
-    // If there was only one chunk write, no barrier for OOQ is needed
-    bool isSingleTransfer = false;
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
-        auto isFirstTransfer = (globalOrigin[1] == origin[1]);
-        auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]);
-        isSingleTransfer = isFirstTransfer && isLastTransfer;
-
-        if (isFirstTransfer && isProfilingEnabled()) {
-            profilingEvent.setSubmitTimeStamp();
-        }
-        memcpy(stagingBuffer, chunkPtr, bufferSize);
-        if (isSingleTransfer) {
-            return this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, event, csr);
-        }
-
-        if (isFirstTransfer && isProfilingEnabled()) {
-            profilingEvent.setStartTimeStamp();
-        }
-
-        cl_event *outEvent = nullptr;
-        if (isLastTransfer && !this->isOOQEnabled()) {
-            outEvent = event;
-        }
-        auto ret = this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
-        return ret;
-    };
-    auto bytesPerPixel = dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
-    auto dstRowPitch = inputRowPitch ? inputRowPitch : globalRegion[0] * bytesPerPixel;
-    auto stagingBufferManager = this->context->getStagingBufferManager();
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, dstRowPitch, chunkWrite, &csr);
-    if (ret != CL_SUCCESS) {
-        return ret;
-    }
-    return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy);
-}
-
-cl_int CommandQueue::postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking) {
-    cl_int ret = CL_SUCCESS;
-    if (event != nullptr) {
-        if (!isSingleTransfer && this->isOOQEnabled()) {
-            ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
-        }
-        auto pEvent = castToObjectOrAbort<Event>(*event);
-        if (isProfilingEnabled()) {
-            pEvent->copyTimestamps(profilingEvent, !isSingleTransfer);
-            pEvent->setCPUProfilingPath(false);
-        }
-        pEvent->setCmdType(profilingEvent.getCommandType());
-    }
-
-    if (isBlocking) {
-        ret = this->finish();
-    }
-    return ret;
-}
-
-bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies) {
-    GraphicsAllocation *allocation = nullptr;
-    context->tryGetExistingMapAllocation(srcPtr, size, allocation);
-    if (allocation != nullptr) {
-        // Direct transfer from mapped allocation is faster than staging buffer
-        return false;
-    }
-    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, nullptr};
-    csrSelectionArgs.direction = TransferDirection::hostToLocal;
-    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
-    auto osContextId = csr->getOsContext().getContextId();
-    auto stagingBufferManager = context->getStagingBufferManager();
-    UNRECOVERABLE_IF(stagingBufferManager == nullptr);
-    return stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, size, hasDependencies, osContextId);
-}
-
-bool CommandQueue::isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies) {
-    auto stagingBufferManager = context->getStagingBufferManager();
-    if (!stagingBufferManager) {
-        return false;
-    }
-    switch (image->getImageDesc().image_type) {
-    case CL_MEM_OBJECT_IMAGE1D:
-    case CL_MEM_OBJECT_IMAGE2D:
-        return stagingBufferManager->isValidForStagingWriteImage(this->getDevice(), ptr, hasDependencies);
-    default:
-        return false;
-    }
-}
-
 } // namespace NEO
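The bodies removed here reappear, reworked, in the new command_queue_staging.cpp further down. One detail worth calling out is how the lambdas classify chunks: first/last status is derived purely from pointer arithmetic, with no separate counter. A standalone sketch of that test follows, with hypothetical names, not NEO code:

#include <algorithm>
#include <cstddef>
#include <cstdint>

static void *offsetPtr(void *base, size_t offset) {
    return static_cast<uint8_t *>(base) + offset;
}

void classifyChunks(void *dst, size_t totalSize, size_t chunkSize) {
    for (size_t done = 0; done < totalSize;) {
        size_t thisChunk = std::min(chunkSize, totalSize - done);
        void *chunkDst = offsetPtr(dst, done);
        bool isFirstTransfer = (chunkDst == dst);
        bool isLastTransfer = offsetPtr(chunkDst, thisChunk) == offsetPtr(dst, totalSize);
        // Both flags set means a single-chunk transfer: no OOQ barrier is
        // needed and the user event can be attached to the copy directly.
        bool isSingleTransfer = isFirstTransfer && isLastTransfer;
        (void)isSingleTransfer;
        done += thisChunk;
    }
}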
@@ -45,6 +45,7 @@ struct BuiltinOpParams;
 struct CsrSelectionArgs;
 struct MultiDispatchInfo;
 struct TimestampPacketDependencies;
+struct StagingTransferStatus;
 
 enum class QueuePriority {
     low,
@@ -147,6 +148,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
                                      size_t rowPitch, size_t slicePitch, void *ptr, GraphicsAllocation *mapAllocation,
                                      cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) = 0;
 
+    virtual cl_int enqueueReadImageImpl(Image *srcImage, cl_bool blockingRead, const size_t *origin, const size_t *region,
+                                        size_t rowPitch, size_t slicePitch, void *ptr, GraphicsAllocation *mapAllocation,
+                                        cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, CommandStreamReceiver &csr) = 0;
+
     virtual cl_int enqueueWriteBuffer(Buffer *buffer, cl_bool blockingWrite, size_t offset, size_t cb,
                                       const void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList,
                                       const cl_event *eventWaitList, cl_event *event) = 0;
@@ -396,8 +401,11 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event);
     cl_int enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
                                     size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event);
+    cl_int enqueueStagingReadImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
+                                   size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event);
     bool isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies);
-    bool isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies);
+    bool isValidForStagingTransferImage(Image *image, const void *ptr, bool hasDependencies);
 
   protected:
    void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
@@ -441,7 +449,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
 
     void unregisterGpgpuAndBcsCsrClients();
 
-    cl_int postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking);
+    cl_int postStagingTransferSync(const StagingTransferStatus &status, cl_event *event, const cl_event profilingEvent, bool isSingleTransfer, bool isBlocking, cl_command_type commandType);
+    cl_event *assignEventForStaging(cl_event *userEvent, cl_event *profilingEvent, bool isFirstTransfer, bool isLastTransfer) const;
 
     Context *context = nullptr;
     ClDevice *device = nullptr;
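StagingTransferStatus itself is not defined in the hunks shown on this page. From its call sites (chunkCopyStatus compared against CL_SUCCESS, waitStatus compared against WaitStatus::gpuHang) it is presumably a small aggregate along the lines of the sketch below; this is the implied shape, not the actual definition from staging_buffer_manager.h.

#include <cstdint>

// Stand-in for NEO's WaitStatus enum (defined in the shared directory);
// the values listed here are illustrative.
enum class WaitStatus { ready, notReady, gpuHang };

// Implied shape of StagingTransferStatus; the real definition may differ.
struct StagingTransferStatus {
    int32_t chunkCopyStatus = 0;               // per-chunk enqueue result; CL_SUCCESS is 0
    WaitStatus waitStatus = WaitStatus::ready; // gpuHang is mapped to CL_OUT_OF_RESOURCES by callers
};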
@@ -269,6 +269,18 @@ class CommandQueueHw : public CommandQueue {
                              const cl_event *eventWaitList,
                              cl_event *event) override;
 
+    cl_int enqueueReadImageImpl(Image *srcImage,
+                                cl_bool blockingRead,
+                                const size_t *origin,
+                                const size_t *region,
+                                size_t rowPitch,
+                                size_t slicePitch,
+                                void *ptr,
+                                GraphicsAllocation *mapAllocation,
+                                cl_uint numEventsInWaitList,
+                                const cl_event *eventWaitList,
+                                cl_event *event, CommandStreamReceiver &csr) override;
+
     cl_int enqueueWriteBuffer(Buffer *buffer,
                               cl_bool blockingWrite,
                               size_t offset,
opencl/source/command_queue/command_queue_staging.cpp (new file, 169 lines)
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/command_stream/command_stream_receiver.h"
+#include "shared/source/device/device.h"
+#include "shared/source/os_interface/os_context.h"
+#include "shared/source/utilities/staging_buffer_manager.h"
+
+#include "opencl/source/command_queue/command_queue.h"
+#include "opencl/source/command_queue/csr_selection_args.h"
+#include "opencl/source/context/context.h"
+#include "opencl/source/event/user_event.h"
+#include "opencl/source/helpers/base_object.h"
+#include "opencl/source/mem_obj/image.h"
+
+#include "CL/cl_ext.h"
+
+namespace NEO {
+
+cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
+    csrSelectionArgs.direction = TransferDirection::hostToLocal;
+    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
+    cl_event profilingEvent;
+
+    bool isSingleTransfer = false;
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) -> int32_t {
+        auto isFirstTransfer = (chunkDst == dstPtr);
+        auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
+        isSingleTransfer = isFirstTransfer && isLastTransfer;
+        cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer);
+
+        return this->enqueueSVMMemcpy(false, chunkDst, chunkSrc, chunkSize, 0, nullptr, outEvent, csr);
+    };
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performCopy(dstPtr, srcPtr, size, chunkCopy, csr);
+    return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_SVM_MEMCPY);
+}
+
+cl_int CommandQueue::enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
+                                              size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) {
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_WRITE_IMAGE, nullptr, dstImage, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin};
+    auto &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
+    cl_event profilingEvent;
+
+    bool isSingleTransfer = false;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        auto isFirstTransfer = (globalOrigin[1] == origin[1]);
+        auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]);
+        isSingleTransfer = isFirstTransfer && isLastTransfer;
+        cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer);
+
+        return this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
+    };
+    auto bytesPerPixel = dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
+    auto dstRowPitch = inputRowPitch ? inputRowPitch : globalRegion[0] * bytesPerPixel;
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, dstRowPitch, chunkWrite, &csr, false);
+    return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_WRITE_IMAGE);
+}
+
+cl_int CommandQueue::enqueueStagingReadImage(Image *srcImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
+                                             size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) {
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_READ_IMAGE, srcImage, nullptr, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin};
+    auto &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
+    cl_event profilingEvent;
+
+    bool isSingleTransfer = false;
+    ChunkTransferImageFunc chunkRead = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        auto isFirstTransfer = (globalOrigin[1] == origin[1]);
+        auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]);
+        isSingleTransfer = isFirstTransfer && isLastTransfer;
+        cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer);
+
+        return this->enqueueReadImageImpl(srcImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
+    };
+    auto bytesPerPixel = srcImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
+    auto dstRowPitch = inputRowPitch ? inputRowPitch : globalRegion[0] * bytesPerPixel;
+
+    auto stagingBufferManager = this->context->getStagingBufferManager();
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, dstRowPitch, chunkRead, &csr, true);
+    return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_READ_IMAGE);
+}
+
+/*
+ * If there's single transfer, use user event.
+ * Otherwise, first transfer uses profiling event to obtain queue/submit/start timestamps.
+ * Last transfer uses user event in case of IOQ.
+ * For OOQ user event will be passed to barrier to gather all submitted transfers.
+ */
+cl_event *CommandQueue::assignEventForStaging(cl_event *userEvent, cl_event *profilingEvent, bool isFirstTransfer, bool isLastTransfer) const {
+    cl_event *outEvent = nullptr;
+    if (userEvent != nullptr) {
+        if (isFirstTransfer && isProfilingEnabled()) {
+            outEvent = profilingEvent;
+        } else if (isLastTransfer && !this->isOOQEnabled()) {
+            outEvent = userEvent;
+        }
+    }
+    if (isFirstTransfer && isLastTransfer) {
+        outEvent = userEvent;
+    }
+    return outEvent;
+}
+
+cl_int CommandQueue::postStagingTransferSync(const StagingTransferStatus &status, cl_event *event, const cl_event profilingEvent, bool isSingleTransfer, bool isBlocking, cl_command_type commandType) {
+    if (status.waitStatus == WaitStatus::gpuHang) {
+        return CL_OUT_OF_RESOURCES;
+    } else if (status.chunkCopyStatus != CL_SUCCESS) {
+        return status.chunkCopyStatus;
+    }
+
+    cl_int ret = CL_SUCCESS;
+    if (event != nullptr) {
+        if (!isSingleTransfer && this->isOOQEnabled()) {
+            ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
+        }
+        auto pEvent = castToObjectOrAbort<Event>(*event);
+        if (!isSingleTransfer && isProfilingEnabled()) {
+            auto pProfilingEvent = castToObjectOrAbort<Event>(profilingEvent);
+            pEvent->copyTimestamps(*pProfilingEvent);
+            pProfilingEvent->release();
+        }
+        pEvent->setCmdType(commandType);
+    }
+
+    if (isBlocking) {
+        ret = this->finish();
+    }
+    return ret;
+}
+
+bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies) {
+    GraphicsAllocation *allocation = nullptr;
+    context->tryGetExistingMapAllocation(srcPtr, size, allocation);
+    if (allocation != nullptr) {
+        // Direct transfer from mapped allocation is faster than staging buffer
+        return false;
+    }
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_SVM_MEMCPY, nullptr};
+    csrSelectionArgs.direction = TransferDirection::hostToLocal;
+    auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs);
+    auto osContextId = csr->getOsContext().getContextId();
+    auto stagingBufferManager = context->getStagingBufferManager();
+    UNRECOVERABLE_IF(stagingBufferManager == nullptr);
+    return stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, size, hasDependencies, osContextId);
+}
+
+bool CommandQueue::isValidForStagingTransferImage(Image *image, const void *ptr, bool hasDependencies) {
+    auto stagingBufferManager = context->getStagingBufferManager();
+    if (!stagingBufferManager) {
+        return false;
+    }
+    switch (image->getImageDesc().image_type) {
+    case CL_MEM_OBJECT_IMAGE1D:
+    case CL_MEM_OBJECT_IMAGE2D:
+        return stagingBufferManager->isValidForStagingTransferImage(this->getDevice(), ptr, hasDependencies);
+    default:
+        return false;
+    }
+}
+
+} // namespace NEO
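The event routing in assignEventForStaging above collapses to a small decision table. The sketch below reproduces it as a standalone function with a printable label per case; it is illustrative only and stubs out the queue state as booleans.

#include <cstdio>

// "profiling": internal event capturing queue/submit/start timestamps.
// "user":      the caller's event is signaled by this chunk directly.
// "none":      no out-event; the OOQ barrier (or a later chunk) signals it.
const char *pickOutEvent(bool hasUserEvent, bool profilingEnabled, bool ooqEnabled,
                         bool isFirstTransfer, bool isLastTransfer) {
    const char *out = "none";
    if (hasUserEvent) {
        if (isFirstTransfer && profilingEnabled) {
            out = "profiling";
        } else if (isLastTransfer && !ooqEnabled) {
            out = "user";
        }
    }
    if (isFirstTransfer && isLastTransfer) {
        out = "user"; // single chunk: the user event (possibly null) goes straight through
    }
    return out;
}

int main() {
    std::printf("%s\n", pickOutEvent(true, true, false, true, false)); // profiling
    std::printf("%s\n", pickOutEvent(true, true, false, false, true)); // user (IOQ last chunk)
    std::printf("%s\n", pickOutEvent(true, true, true, false, true));  // none (OOQ barrier signals)
    return 0;
}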
@@ -39,6 +39,25 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
     CsrSelectionArgs csrSelectionArgs{cmdType, srcImage, {}, device->getRootDeviceIndex(), region, origin, nullptr};
     CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
+    return enqueueReadImageImpl(srcImage, blockingRead, origin, region, inputRowPitch, inputSlicePitch, ptr, mapAllocation, numEventsInWaitList, eventWaitList, event, csr);
+}
+
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueReadImageImpl(
+    Image *srcImage,
+    cl_bool blockingRead,
+    const size_t *origin,
+    const size_t *region,
+    size_t inputRowPitch,
+    size_t inputSlicePitch,
+    void *ptr,
+    GraphicsAllocation *mapAllocation,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event, CommandStreamReceiver &csr) {
+    constexpr cl_command_type cmdType = CL_COMMAND_READ_IMAGE;
+
+    CsrSelectionArgs csrSelectionArgs{cmdType, srcImage, {}, device->getRootDeviceIndex(), region, origin, nullptr};
+
     if (nullptr == mapAllocation) {
         notifyEnqueueReadImage(srcImage, static_cast<bool>(blockingRead), EngineHelpers::isBcs(csr.getOsContext().getEngineType()));
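Splitting enqueueReadImage into a thin wrapper plus an Impl overload that takes CommandStreamReceiver & mirrors what the write path already does: the staging code selects one CSR up front and then pins every chunk to it. A minimal sketch of that wrapper/Impl pattern with hypothetical types, not NEO's:

struct Engine {};

struct QueueSketch {
    Engine copyEngine, computeEngine;

    Engine &selectEngine(bool preferCopy) { return preferCopy ? copyEngine : computeEngine; }

    // Engine-agnostic worker; the real enqueueReadImageImpl plays this role.
    int readImpl(const void *src, void *dst, Engine &engine) {
        (void)src; (void)dst; (void)engine;
        return 0;
    }

    // Public entry point: select an engine once, then delegate.
    int read(const void *src, void *dst) {
        return readImpl(src, dst, selectEngine(true));
    }

    // Chunked caller: engine selected once, reused for every chunk.
    int readChunked(const void *src, void *dst, int chunks) {
        Engine &engine = selectEngine(true);
        for (int i = 0; i < chunks; ++i) {
            if (int err = readImpl(src, dst, engine)) {
                return err;
            }
        }
        return 0;
    }
};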
@@ -397,10 +397,6 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
     auto &device = this->cmdQueue->getDevice();
     auto &gfxCoreHelper = device.getGfxCoreHelper();
     auto resolution = device.getDeviceInfo().profilingTimerResolution;
-    if (isAdjustmentNeeded) {
-        // Adjust startTS since we calculate profiling based on other event timestamps
-        contextStartTS = startTimeStamp.gpuTimeStamp;
-    }
-
     // Calculate startTimestamp only if it was not already set on CPU
     if (startTimeStamp.cpuTimeInNs == 0) {
@@ -1046,4 +1042,20 @@ TaskCountType Event::peekTaskLevel() const {
     return taskLevel;
 }
 
+void Event::copyTimestamps(Event &srcEvent) {
+    if (timestampPacketContainer) {
+        this->addTimestampPacketNodes(*srcEvent.getTimestampPacketNodes());
+    } else {
+        if (this->timeStampNode != nullptr) {
+            this->timeStampNode->returnTag();
+        }
+        this->timeStampNode = srcEvent.timeStampNode;
+        srcEvent.timeStampNode = nullptr;
+    }
+    this->queueTimeStamp = srcEvent.queueTimeStamp;
+    this->submitTimeStamp = srcEvent.submitTimeStamp;
+    this->startTimeStamp = srcEvent.startTimeStamp;
+    this->endTimeStamp = srcEvent.endTimeStamp;
+}
+
 } // namespace NEO
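The new copyTimestamps moves the hardware timestamp node instead of copying it: any node the destination already owns is returned to its tag allocator first, then the source's node is taken over and the source cleared so the tag is never freed twice. The same release-then-steal pattern in isolation, with hypothetical minimal types:

struct TagNode {
    void returnTag() { /* hand the node back to its allocator */ }
};

struct TimestampHolder {
    TagNode *node = nullptr;

    void stealNodeFrom(TimestampHolder &src) {
        if (node != nullptr) {
            node->returnTag(); // release what we currently own to avoid a leak
        }
        node = src.node;       // take ownership of the source's node...
        src.node = nullptr;    // ...and clear it so it is not returned twice
    }
};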
@@ -312,13 +312,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
 
     static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
 
-    void copyTimestamps(const Event &srcEvent, bool isAdjustmentNeeded) {
-        this->queueTimeStamp = srcEvent.queueTimeStamp;
-        this->submitTimeStamp = srcEvent.submitTimeStamp;
-        this->startTimeStamp = srcEvent.startTimeStamp;
-        this->endTimeStamp = srcEvent.endTimeStamp;
-        this->isAdjustmentNeeded = isAdjustmentNeeded;
-    }
+    void copyTimestamps(Event &srcEvent);
 
   protected:
     Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType,
@@ -391,7 +385,6 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     bool profilingEnabled = false;
     bool profilingCpuPath = false;
     bool dataCalculated = false;
-    bool isAdjustmentNeeded = false;
 
     ProfilingInfo queueTimeStamp{};
     ProfilingInfo submitTimeStamp{};
@@ -1097,4 +1097,136 @@ HWTEST_F(EnqueueReadImageTest, whenEnqueueReadImageWithUsmPtrThenDontImportAlloc
     auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
     EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
     svmManager->freeSVMAlloc(usmPtr);
+}
+
+struct ReadImageStagingBufferTest : public EnqueueReadImageTest {
+    void SetUp() override {
+        REQUIRE_SVM_OR_SKIP(defaultHwInfo);
+        EnqueueReadImageTest::SetUp();
+        ptr = new unsigned char[readSize];
+        device.reset(new MockClDevice{MockClDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)});
+    }
+
+    void TearDown() override {
+        if (defaultHwInfo->capabilityTable.ftrSvm == false) {
+            return;
+        }
+        delete[] ptr;
+        EnqueueReadImageTest::TearDown();
+    }
+
+    static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2;
+    static constexpr size_t readSize = stagingBufferSize * 4;
+    unsigned char *ptr;
+    size_t origin[3] = {0, 0, 0};
+    size_t region[3] = {4, 8, 1};
+    std::unique_ptr<ClDevice> device;
+    cl_queue_properties props = {};
+};
+
+HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledThenReturnSuccess) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr);
+
+    EXPECT_EQ(res, CL_SUCCESS);
+    EXPECT_EQ(4ul, mockCommandQueueHw.enqueueReadImageCounter);
+    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledWithoutRowPitchThenReturnSuccess) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    region[0] = MemoryConstants::megaByte / srcImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, 0u, MemoryConstants::megaByte, ptr, nullptr);
+
+    EXPECT_EQ(res, CL_SUCCESS);
+    EXPECT_EQ(4ul, mockCommandQueueHw.enqueueReadImageCounter);
+    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, whenBlockingEnqueueStagingReadImageCalledThenFinishCalled) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, true, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr);
+
+    EXPECT_EQ(res, CL_SUCCESS);
+    EXPECT_EQ(1u, mockCommandQueueHw.finishCalledCount);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledWithEventThenReturnValidEvent) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_READ_IMAGE;
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    cl_event event;
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event);
+    EXPECT_EQ(res, CL_SUCCESS);
+
+    auto pEvent = (Event *)event;
+    EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingReadImageCalledWithEventThenReturnValidEvent) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    mockCommandQueueHw.setOoqEnabled();
+    cl_event event;
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event);
+    EXPECT_EQ(res, CL_SUCCESS);
+
+    auto pEvent = (Event *)event;
+    EXPECT_EQ(static_cast<cl_command_type>(CL_COMMAND_BARRIER), mockCommandQueueHw.lastCommandType);
+    EXPECT_EQ(static_cast<cl_command_type>(CL_COMMAND_READ_IMAGE), pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingReadImageCalledWithSingleTransferThenNoBarrierEnqueued) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_READ_IMAGE;
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    mockCommandQueueHw.setOoqEnabled();
+    cl_event event;
+    region[1] = 1;
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event);
+    EXPECT_EQ(res, CL_SUCCESS);
+
+    auto pEvent = (Event *)event;
+    EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingReadImageThenTimestampsSetCorrectly) {
+    cl_event event;
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    mockCommandQueueHw.setProfilingEnabled();
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event);
+    EXPECT_EQ(res, CL_SUCCESS);
+
+    auto pEvent = (Event *)event;
+    EXPECT_FALSE(pEvent->isCPUProfilingPath());
+    EXPECT_TRUE(pEvent->isProfilingEnabled());
+
+    clReleaseEvent(event);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageFailedThenPropagateErrorCode) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    mockCommandQueueHw.enqueueReadImageCallBase = false;
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr);
+
+    EXPECT_EQ(res, CL_INVALID_OPERATION);
+    EXPECT_EQ(1ul, mockCommandQueueHw.enqueueReadImageCounter);
+}
+
+HWTEST_F(ReadImageStagingBufferTest, whenEnqueueStagingReadImageCalledWithGpuHangThenReturnOutOfResources) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    CsrSelectionArgs csrSelectionArgs{CL_COMMAND_READ_IMAGE, srcImage, nullptr, pDevice->getRootDeviceIndex(), region, nullptr, origin};
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(&mockCommandQueueHw.selectCsrForBuiltinOperation(csrSelectionArgs));
+    ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+    auto res = mockCommandQueueHw.enqueueStagingReadImage(srcImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr);
+
+    EXPECT_EQ(res, CL_OUT_OF_RESOURCES);
+    EXPECT_EQ(2ul, mockCommandQueueHw.enqueueReadImageCounter);
 }
@@ -801,7 +801,7 @@ HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageWithUsmPtrAndSizeLowerThanR
     svmManager->freeSVMAlloc(usmPtr);
 }
 
-HWTEST_F(EnqueueWriteImageTest, whenIsValidForStagingWriteImageCalledThenReturnCorrectValue) {
+HWTEST_F(EnqueueWriteImageTest, whenIsValidForStagingTransferImageCalledThenReturnCorrectValue) {
     bool svmSupported = pDevice->getHardwareInfo().capabilityTable.ftrSvm;
     if (!svmSupported) {
         GTEST_SKIP();
@@ -810,13 +810,13 @@ HWTEST_F(EnqueueWriteImageTest, whenIsValidForStagingWriteImageCalledThenReturnC
     unsigned char ptr[16];
 
     std::unique_ptr<Image> image(Image1dHelper<>::create(context));
-    EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false));
+    EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransferImage(image.get(), ptr, false));
 
     image.reset(Image2dHelper<>::create(context));
-    EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false));
+    EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransferImage(image.get(), ptr, false));
 
     image.reset(Image3dHelper<>::create(context));
-    EXPECT_FALSE(pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false));
+    EXPECT_FALSE(pCmdQ->isValidForStagingTransferImage(image.get(), ptr, false));
 }
 
 struct WriteImageStagingBufferTest : public EnqueueWriteImageTest {
@@ -854,6 +854,17 @@ HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledThenRetu
     EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
 }
 
+HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledWithoutRowPitchThenReturnSuccess) {
+    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
+    region[0] = MemoryConstants::megaByte / dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
+    auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, 0u, MemoryConstants::megaByte, ptr, nullptr);
+
+    EXPECT_EQ(res, CL_SUCCESS);
+    EXPECT_EQ(4ul, mockCommandQueueHw.enqueueWriteImageCounter);
+    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
+}
+
 HWTEST_F(WriteImageStagingBufferTest, whenBlockingEnqueueStagingWriteImageCalledThenFinishCalled) {
     MockCommandQueueHw<FamilyType> mockCommandQueueHw(context, device.get(), &props);
     auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, true, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr);
@@ -188,6 +188,11 @@ class MockCommandQueue : public CommandQueue {
                             GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList,
                             const cl_event *eventWaitList, cl_event *event) override { return CL_SUCCESS; }
 
+    cl_int enqueueReadImageImpl(Image *srcImage, cl_bool blockingRead, const size_t *origin, const size_t *region,
+                                size_t rowPitch, size_t slicePitch, void *ptr,
+                                GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList,
+                                const cl_event *eventWaitList, cl_event *event, CommandStreamReceiver &csr) override { return CL_SUCCESS; }
+
     cl_int enqueueWriteImage(Image *dstImage, cl_bool blockingWrite, const size_t *origin, const size_t *region,
                              size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, GraphicsAllocation *mapAllocation,
                              cl_uint numEventsInWaitList, const cl_event *eventWaitList,
@@ -379,6 +384,34 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
        }
        return CL_INVALID_OPERATION;
    }
+    cl_int enqueueReadImageImpl(Image *srcImage,
+                                cl_bool blockingRead,
+                                const size_t *origin,
+                                const size_t *region,
+                                size_t rowPitch,
+                                size_t slicePitch,
+                                void *ptr,
+                                GraphicsAllocation *mapAllocation,
+                                cl_uint numEventsInWaitList,
+                                const cl_event *eventWaitList,
+                                cl_event *event, CommandStreamReceiver &csr) override {
+        enqueueReadImageCounter++;
+        if (enqueueReadImageCallBase) {
+            return BaseClass::enqueueReadImageImpl(srcImage,
+                                                   blockingRead,
+                                                   origin,
+                                                   region,
+                                                   rowPitch,
+                                                   slicePitch,
+                                                   ptr,
+                                                   mapAllocation,
+                                                   numEventsInWaitList,
+                                                   eventWaitList,
+                                                   event,
+                                                   csr);
+        }
+        return CL_INVALID_OPERATION;
+    }
    void *cpuDataTransferHandler(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &retVal) override {
        cpuDataTransferHandlerCalled = true;
        return BaseClass::cpuDataTransferHandler(transferProperties, eventsRequest, retVal);
@@ -493,6 +526,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
    MultiDispatchInfo storedMultiDispatchInfo;
    size_t enqueueWriteImageCounter = 0;
    bool enqueueWriteImageCallBase = true;
+    size_t enqueueReadImageCounter = 0;
+    bool enqueueReadImageCallBase = true;
    size_t enqueueWriteBufferCounter = 0;
    size_t requestedCmdStreamSize = 0;
    bool blockingWriteBuffer = false;
@@ -13,7 +13,6 @@
|
|||||||
#include "shared/source/helpers/aligned_memory.h"
|
#include "shared/source/helpers/aligned_memory.h"
|
||||||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||||
#include "shared/source/utilities/heap_allocator.h"
|
#include "shared/source/utilities/heap_allocator.h"
|
||||||
|
|
||||||
namespace NEO {
|
namespace NEO {
|
||||||
|
|
||||||
StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
|
StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
|
||||||
@@ -24,6 +23,14 @@ StagingBuffer::StagingBuffer(StagingBuffer &&other) : baseAddress(other.baseAddr
|
|||||||
this->allocator.reset(other.allocator.release());
|
this->allocator.reset(other.allocator.release());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool StagingBufferTracker::isReady() const {
|
||||||
|
return csr->testTaskCountReady(csr->getTagAddress(), taskCountToWait);
|
||||||
|
}
|
||||||
|
|
||||||
|
void StagingBufferTracker::freeChunk() const {
|
||||||
|
allocator->free(chunkAddress, size);
|
||||||
|
}
|
||||||
|
|
||||||
StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
|
StagingBufferManager::StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields) : svmAllocsManager(svmAllocsManager), rootDeviceIndices(rootDeviceIndices), deviceBitfields(deviceBitfields) {
|
||||||
if (debugManager.flags.StagingBufferSize.get() != -1) {
|
if (debugManager.flags.StagingBufferSize.get() != -1) {
|
||||||
chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
|
chunkSize = debugManager.flags.StagingBufferSize.get() * MemoryConstants::kiloByte;
|
||||||
@@ -37,22 +44,45 @@ StagingBufferManager::~StagingBufferManager() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This method performs 4 steps for single chunk transfer
|
* This method performs single chunk transfer. If transfer is a read operation, it will fetch oldest staging
|
||||||
* 1. Get existing chunk of staging buffer, if can't - allocate new one,
|
* buffer from the queue, otherwise it allocates or reuses buffer from the pool.
|
||||||
* 2. Perform actual transfer,
|
* After transfer is submitted to GPU, it stores used buffer to either queue in case of reads,
|
||||||
* 3. Store used buffer to tracking container (with current task count)
|
* or tracking container for further reusage.
|
||||||
* 4. Update tag if required to reuse this buffer in next chunk copies
|
|
||||||
*/
|
*/
|
||||||
template <class Func, class... Args>
|
template <class Func, class... Args>
|
||||||
int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &func, Args... args) {
|
StagingTransferStatus StagingBufferManager::performChunkTransfer(bool isRead, void *userPtr, size_t size, StagingQueue ¤tStagingBuffers, CommandStreamReceiver *csr, Func &func, Args... args) {
|
||||||
auto allocatedSize = size;
|
StagingTransferStatus result{};
|
||||||
auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize);
|
StagingBufferTracker tracker{};
|
||||||
auto ret = func(addrToPtr(stagingBuffer), size, args...);
|
if (currentStagingBuffers.size() > 1) {
|
||||||
trackChunk({allocator, stagingBuffer, allocatedSize, csr, csr->peekTaskCount()});
|
if (fetchHead(currentStagingBuffers, tracker) == WaitStatus::gpuHang) {
|
||||||
|
result.waitStatus = WaitStatus::gpuHang;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto allocatedSize = size;
|
||||||
|
auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize);
|
||||||
|
tracker = StagingBufferTracker{allocator, stagingBuffer, allocatedSize, csr};
|
||||||
|
}
|
||||||
|
|
||||||
|
auto stagingBuffer = addrToPtr(tracker.chunkAddress);
|
||||||
|
if (!isRead) {
|
||||||
|
memcpy(stagingBuffer, userPtr, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
result.chunkCopyStatus = func(stagingBuffer, args...);
|
||||||
|
|
||||||
|
tracker.taskCountToWait = csr->peekTaskCount();
|
||||||
|
if (isRead) {
|
||||||
|
UserDstData dstData{userPtr, size};
|
||||||
|
currentStagingBuffers.push({dstData, tracker});
|
||||||
|
} else {
|
||||||
|
trackChunk(tracker);
|
||||||
|
}
|
||||||
|
|
||||||
if (csr->isAnyDirectSubmissionEnabled()) {
|
if (csr->isAnyDirectSubmissionEnabled()) {
|
||||||
csr->flushTagUpdate();
|
csr->flushTagUpdate();
|
||||||
}
|
}
|
||||||
return ret;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -60,38 +90,40 @@ int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, s
|
|||||||
* Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
|
* Each chunk copy contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
|
||||||
* Caller provides actual function to transfer data for single chunk.
|
* Caller provides actual function to transfer data for single chunk.
|
||||||
*/
|
*/
|
||||||
int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) {
|
StagingTransferStatus StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) {
|
||||||
|
StagingQueue stagingQueue;
|
||||||
auto copiesNum = size / chunkSize;
|
auto copiesNum = size / chunkSize;
|
||||||
auto remainder = size % chunkSize;
|
auto remainder = size % chunkSize;
|
||||||
|
StagingTransferStatus result{};
|
||||||
for (auto i = 0u; i < copiesNum; i++) {
|
for (auto i = 0u; i < copiesNum; i++) {
|
||||||
auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
|
auto chunkDst = ptrOffset(dstPtr, i * chunkSize);
|
||||||
auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
|
auto chunkSrc = ptrOffset(srcPtr, i * chunkSize);
|
||||||
auto ret = performChunkTransfer(csr, chunkSize, chunkCopyFunc, chunkDst, chunkSrc);
|
result = performChunkTransfer(false, const_cast<void *>(chunkSrc), chunkSize, stagingQueue, csr, chunkCopyFunc, chunkDst, chunkSize);
|
||||||
if (ret) {
|
if (result.chunkCopyStatus != 0) {
|
||||||
return ret;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (remainder != 0) {
|
if (remainder != 0) {
|
||||||
auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
|
auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize);
|
||||||
auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
|
auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize);
|
||||||
auto ret = performChunkTransfer(csr, remainder, chunkCopyFunc, chunkDst, chunkSrc);
|
auto result = performChunkTransfer(false, const_cast<void *>(chunkSrc), remainder, stagingQueue, csr, chunkCopyFunc, chunkDst, remainder);
|
||||||
if (ret) {
|
if (result.chunkCopyStatus != 0) {
|
||||||
return ret;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This method orchestrates write operation for images with given origin and region.
|
* This method orchestrates transfer operation for images with given origin and region.
|
||||||
* Transfer is splitted into chunks, each chunk represents sub-region to transfer.
|
* Transfer is splitted into chunks, each chunk represents sub-region to transfer.
|
||||||
* Each chunk contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
|
* Each chunk contains staging buffer which should be used instead of non-usm memory during transfers on GPU.
|
||||||
* Several rows are packed into single chunk unless size of single row exceeds maximum chunk size (2MB).
|
* Several rows are packed into single chunk unless size of single row exceeds maximum chunk size (2MB).
|
||||||
* Caller provides actual function to enqueue write operation for single chunk.
|
* Caller provides actual function to enqueue read/write operation for single chunk.
|
||||||
*/
|
*/
|
||||||
int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr) {
|
StagingTransferStatus StagingBufferManager::performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead) {
|
||||||
|
StagingQueue stagingQueue;
|
||||||
size_t origin[3] = {};
|
size_t origin[3] = {};
|
||||||
size_t region[3] = {};
|
size_t region[3] = {};
|
||||||
origin[0] = globalOrigin[0];
|
origin[0] = globalOrigin[0];
|
||||||
@@ -102,15 +134,16 @@ int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *g
|
|||||||
rowsPerChunk = std::min<size_t>(rowsPerChunk, globalRegion[1]);
|
rowsPerChunk = std::min<size_t>(rowsPerChunk, globalRegion[1]);
|
||||||
auto numOfChunks = globalRegion[1] / rowsPerChunk;
|
auto numOfChunks = globalRegion[1] / rowsPerChunk;
|
||||||
auto remainder = globalRegion[1] % (rowsPerChunk * numOfChunks);
|
auto remainder = globalRegion[1] % (rowsPerChunk * numOfChunks);
|
||||||
|
StagingTransferStatus result{};
|
||||||
|
|
||||||
for (auto i = 0u; i < numOfChunks; i++) {
|
for (auto i = 0u; i < numOfChunks; i++) {
|
||||||
origin[1] = globalOrigin[1] + i * rowsPerChunk;
|
origin[1] = globalOrigin[1] + i * rowsPerChunk;
|
||||||
region[1] = rowsPerChunk;
|
region[1] = rowsPerChunk;
|
||||||
auto size = region[1] * rowPitch;
|
auto size = region[1] * rowPitch;
|
||||||
auto chunkPtr = ptrOffset(ptr, i * rowsPerChunk * rowPitch);
|
auto chunkPtr = ptrOffset(ptr, i * rowsPerChunk * rowPitch);
|
||||||
auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region);
|
result = performChunkTransfer(isRead, const_cast<void *>(chunkPtr), size, stagingQueue, csr, chunkTransferImageFunc, origin, region);
|
||||||
if (ret) {
|
if (result.chunkCopyStatus != 0 || result.waitStatus == WaitStatus::gpuHang) {
|
||||||
return ret;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -119,12 +152,50 @@ int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *g
         region[1] = remainder;
         auto size = region[1] * rowPitch;
         auto chunkPtr = ptrOffset(ptr, numOfChunks * rowsPerChunk * rowPitch);
-        auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region);
-        if (ret) {
-            return ret;
+        result = performChunkTransfer(isRead, const_cast<void *>(chunkPtr), size, stagingQueue, csr, chunkTransferImageFunc, origin, region);
+        if (result.chunkCopyStatus != 0 || result.waitStatus == WaitStatus::gpuHang) {
+            return result;
         }
     }
-    return 0;
+
+    result.waitStatus = drainAndReleaseStagingQueue(stagingQueue);
+    return result;
+}
+
+/*
+ * This method is used for read transfers. It waits for the oldest transfer to finish
+ * and copies the data associated with that transfer to the host allocation.
+ * The returned tracker contains a staging buffer ready for reuse.
+ */
+WaitStatus StagingBufferManager::fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const {
+    auto &head = stagingQueue.front();
+    auto status = head.second.csr->waitForTaskCount(head.second.taskCountToWait);
+    if (status == WaitStatus::gpuHang) {
+        return status;
+    }
+
+    auto &userData = head.first;
+    tracker = head.second;
+    auto stagingBuffer = addrToPtr(tracker.chunkAddress);
+    memcpy(userData.ptr, stagingBuffer, userData.size);
+    stagingQueue.pop();
+    return WaitStatus::ready;
+}
+
+/*
+ * Waits for all pending transfers to finish.
+ * Releases staging buffers back to the pool for reuse.
+ */
+WaitStatus StagingBufferManager::drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const {
+    StagingBufferTracker tracker{};
+    while (!stagingQueue.empty()) {
+        auto status = fetchHead(stagingQueue, tracker);
+        if (status == WaitStatus::gpuHang) {
+            return status;
+        }
+        tracker.freeChunk();
+    }
+    return WaitStatus::ready;
 }
 
 /*
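Note: the read path needs the queue because a chunk's contents only become valid once the GPU finishes writing its staging buffer; fetchHead therefore always services the oldest entry first, copies it back to the user allocation, and recycles the staging chunk. A simplified standalone model of that discipline follows; FakeUserDst and FakeTracker stand in for UserDstData and StagingBufferTracker, and the real code additionally waits on the CSR task count and bails out on gpuHang.

#include <cstddef>
#include <cstring>
#include <queue>
#include <utility>

struct FakeUserDst { char *ptr; size_t size; };  // host destination of one chunk
struct FakeTracker { const char *staging; };     // staging memory the GPU wrote
using FakeQueue = std::queue<std::pair<FakeUserDst, FakeTracker>>;

// Mirrors drainAndReleaseStagingQueue: fetch the head (oldest transfer),
// copy staging -> user memory, release the chunk, repeat until empty.
void drain(FakeQueue &queue) {
    while (!queue.empty()) {
        auto &head = queue.front();
        std::memcpy(head.first.ptr, head.second.staging, head.first.size);
        queue.pop(); // real code: tracker.freeChunk() returns the chunk to the pool
    }
}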
@@ -196,7 +267,7 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co
     return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
 }
 
-bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const {
+bool StagingBufferManager::isValidForStagingTransferImage(const Device &device, const void *ptr, bool hasDependencies) const {
     auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
     if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
         stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
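Note: EnableCopyWithStagingBuffers follows the usual tri-state debug-flag convention visible above, where -1 means "no override". A standalone sketch of that resolution order (the function name is illustrative, not part of the commit):

#include <cstdint>

// -1: keep the product-helper default; 0/1: force the feature off/on.
bool resolveStagingEnabled(bool productDefault, int32_t debugFlag) {
    if (debugFlag != -1) {
        return debugFlag != 0;
    }
    return productDefault;
}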
@@ -207,9 +278,8 @@ bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, con
 
 void StagingBufferManager::clearTrackedChunks() {
     for (auto iterator = trackers.begin(); iterator != trackers.end();) {
-        auto csr = iterator->csr;
-        if (csr->testTaskCountReady(csr->getTagAddress(), iterator->taskCountToWait)) {
-            iterator->allocator->free(iterator->chunkAddress, iterator->size);
+        if (iterator->isReady()) {
+            iterator->freeChunk();
             iterator = trackers.erase(iterator);
         } else {
             break;
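The refactor above folds the readiness test and the free into StagingBufferTracker itself. The new definitions are not part of this excerpt; reconstructed from the inlined code they replace, they plausibly reduce to the following sketch (mock CSR and allocator types are added only so it compiles standalone, and the commit's real definitions may differ in detail):

#include <cstddef>
#include <cstdint>

// Mock stand-ins for CommandStreamReceiver and HeapAllocator.
struct MockCsr {
    uint64_t latestCompleted = 0;
    const uint64_t *getTagAddress() const { return &latestCompleted; }
    bool testTaskCountReady(const uint64_t *tag, uint64_t waitFor) const { return *tag >= waitFor; }
};
struct MockAllocator {
    void free(uint64_t /*chunkAddress*/, size_t /*size*/) { /* return chunk to pool */ }
};

struct TrackerSketch {
    MockAllocator *allocator = nullptr;
    uint64_t chunkAddress = 0;
    size_t size = 0;
    MockCsr *csr = nullptr;
    uint64_t taskCountToWait = 0;

    // Same logic clearTrackedChunks used to spell out inline:
    bool isReady() const { return csr->testTaskCountReady(csr->getTagAddress(), taskCountToWait); }
    void freeChunk() const { allocator->free(chunkAddress, size); }
};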
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include "shared/source/command_stream/wait_status.h"
 #include "shared/source/helpers/constants.h"
 #include "shared/source/utilities/stackvec.h"
 
@@ -14,6 +15,7 @@
 #include <map>
 #include <memory>
 #include <mutex>
+#include <queue>
 
 namespace NEO {
 class SVMAllocsManager;
@@ -21,8 +23,8 @@ class CommandStreamReceiver;
 class Device;
 class HeapAllocator;
 
-using ChunkCopyFunction = std::function<int32_t(void *, size_t, void *, const void *)>;
-using ChunkWriteImageFunc = std::function<int32_t(void *, size_t, const void *, const size_t *, const size_t *)>;
+using ChunkCopyFunction = std::function<int32_t(void *, void *, size_t)>;
+using ChunkTransferImageFunc = std::function<int32_t(void *, const size_t *, const size_t *)>;
 
 class StagingBuffer {
   public:
@@ -50,8 +52,23 @@ struct StagingBufferTracker {
     size_t size = 0;
     CommandStreamReceiver *csr = nullptr;
     uint64_t taskCountToWait = 0;
+
+    bool isReady() const;
+    void freeChunk() const;
 };
 
+struct UserDstData {
+    void *ptr;
+    size_t size;
+};
+
+struct StagingTransferStatus {
+    int32_t chunkCopyStatus = 0; // status from L0/OCL chunk copy
+    WaitStatus waitStatus = WaitStatus::ready;
+};
+
+using StagingQueue = std::queue<std::pair<UserDstData, StagingBufferTracker>>;
+
 class StagingBufferManager {
   public:
     StagingBufferManager(SVMAllocsManager *svmAllocsManager, const RootDeviceIndicesContainer &rootDeviceIndices, const std::map<uint32_t, DeviceBitfield> &deviceBitfields);
@@ -62,10 +79,10 @@ class StagingBufferManager {
     StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
 
     bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
-    bool isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const;
+    bool isValidForStagingTransferImage(const Device &device, const void *ptr, bool hasDependencies) const;
 
-    int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
-    int32_t performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr);
+    StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
+    StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead);
 
     std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size);
     void trackChunk(const StagingBufferTracker &tracker);
@@ -76,7 +93,10 @@ class StagingBufferManager {
     void clearTrackedChunks();
 
     template <class Func, class... Args>
-    int32_t performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &chunkCopyFunc, Args... args);
+    StagingTransferStatus performChunkTransfer(bool isRead, void *userPtr, size_t size, StagingQueue &currentStagingBuffers, CommandStreamReceiver *csr, Func &func, Args... args);
+
+    WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const;
+    WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const;
 
     size_t chunkSize = MemoryConstants::pageSize2M;
     std::mutex mtx;
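Note on the new return type: StagingTransferStatus deliberately separates the error code coming back from the caller-supplied enqueue (chunkCopyStatus) from the result of waiting on in-flight staged chunks (waitStatus). A standalone sketch of how a caller folds the two into pass/fail, matching the per-chunk check performImageTransfer applies; the enum values are assumed to mirror wait_status.h:

#include <cstdint>

enum class WaitStatus { notReady, ready, gpuHang }; // assumed mirror of wait_status.h
struct StagingTransferStatus {
    int32_t chunkCopyStatus = 0;               // status from the L0/OCL chunk enqueue
    WaitStatus waitStatus = WaitStatus::ready; // status from waiting on staged chunks
};

// Either a failed enqueue or a hang while waiting fails the whole transfer.
bool transferSucceeded(const StagingTransferStatus &status) {
    return status.chunkCopyStatus == 0 && status.waitStatus != WaitStatus::gpuHang;
}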
@@ -323,6 +323,13 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
         return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle);
     }
 
+    WaitStatus waitForTaskCount(TaskCountType requiredTaskCount) override {
+        if (waitForTaskCountReturnValue.has_value()) {
+            return *waitForTaskCountReturnValue;
+        }
+        return BaseClass::waitForTaskCount(requiredTaskCount);
+    }
+
     void overrideCsrSizeReqFlags(CsrSizeRequestFlags &flags) { this->csrSizeRequestFlags = flags; }
     GraphicsAllocation *getPreemptionAllocation() const { return this->preemptionAllocation; }
 
@@ -585,6 +592,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
     uint32_t createAllocationForHostSurfaceCalled = 0;
     WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::ready;
     std::optional<WaitStatus> waitForTaskCountWithKmdNotifyFallbackReturnValue{};
+    std::optional<WaitStatus> waitForTaskCountReturnValue{};
    std::optional<SubmissionStatus> flushReturnValue{};
     CommandStreamReceiverType commandStreamReceiverType = CommandStreamReceiverType::hardware;
     std::atomic<uint32_t> downloadAllocationsCalledCount = 0;
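The mock override follows the pattern this class already uses for waitForTaskCountWithKmdNotifyFallbackReturnValue: an unset std::optional defers to the real base-class wait, while a set one short-circuits it so tests can simulate a hang deterministically. Reduced to its essentials as a standalone sketch:

#include <cstdint>
#include <optional>

enum class WaitStatus { notReady, ready, gpuHang };

struct WaiterSketch {
    std::optional<WaitStatus> waitForTaskCountReturnValue{};
    WaitStatus waitForTaskCount(uint64_t /*taskCount*/) {
        if (waitForTaskCountReturnValue.has_value()) {
            return *waitForTaskCountReturnValue; // forced result, e.g. gpuHang
        }
        return WaitStatus::ready; // stands in for BaseClass::waitForTaskCount
    }
};

// Typical arming in a test:
//   ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;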
@@ -55,10 +55,9 @@ class StagingBufferManagerFixture : public DeviceFixture {
         memset(usmBuffer, 0, copySize);
         memset(nonUsmBuffer, 0xFF, copySize);
 
-        ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+        ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
             chunkCounter++;
-            memcpy(stagingBuffer, chunkSrc, chunkSize);
-            memcpy(chunkDst, stagingBuffer, chunkSize);
+            memcpy(chunkDst, chunkSrc, chunkSize);
             reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
             return 0;
         };
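Note the simplified ChunkCopyFunction contract: the old callback received the raw user pointers plus the staging buffer and performed both hops itself. Judging by the single memcpy now being enough for the fixture's memcmp checks to pass, the manager performs the host-to-staging hop internally and hands the callback a ready source/destination pair. A sketch of the new-style callback (in production this would be a GPU enqueue rather than a memcpy):

#include <cstdint>
#include <cstring>
#include <functional>

using ChunkCopyFunction = std::function<int32_t(void *, void *, size_t)>;

// New contract: one hop per chunk; staging is the manager's job.
ChunkCopyFunction chunkCopy = [](void *chunkSrc, void *chunkDst, size_t chunkSize) -> int32_t {
    std::memcpy(chunkDst, chunkSrc, chunkSize);
    return 0; // non-zero would surface as StagingTransferStatus::chunkCopyStatus
};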
@@ -66,7 +65,8 @@ class StagingBufferManagerFixture : public DeviceFixture {
         auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, copySize, chunkCopy, csr);
         auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, ret.chunkCopyStatus);
+        EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
         EXPECT_EQ(0, memcmp(usmBuffer, nonUsmBuffer, copySize));
         EXPECT_EQ(expectedChunks, chunkCounter);
         EXPECT_EQ(expectedAllocations, newUsmAllocations);
@@ -74,17 +74,23 @@ class StagingBufferManagerFixture : public DeviceFixture {
         delete[] nonUsmBuffer;
     }
 
-    void imageWriteThroughStagingBuffers(size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) {
-        auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+    void imageTransferThroughStagingBuffers(bool isRead, size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) {
+        auto hostPtr = new unsigned char[stagingBufferSize * expectedChunks];
+        auto imageData = new unsigned char[stagingBufferSize * expectedChunks];
+        if (isRead) {
+            memset(hostPtr, 0, stagingBufferSize * expectedChunks);
+            memset(imageData, 0xFF, stagingBufferSize * expectedChunks);
+        } else {
+            memset(hostPtr, 0xFF, stagingBufferSize * expectedChunks);
+            memset(imageData, 0, stagingBufferSize * expectedChunks);
+        }
         size_t chunkCounter = 0;
         size_t expectedOrigin = globalOrigin[1];
         auto expectedRowsPerChunk = std::min<size_t>(std::max<size_t>(1ul, stagingBufferSize / rowPitch), globalRegion[1]);
         auto numOfChunks = globalRegion[1] / expectedRowsPerChunk;
         auto remainder = globalRegion[1] % (expectedRowsPerChunk * numOfChunks);
-        ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+        ChunkTransferImageFunc chunkTransfer = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
             EXPECT_NE(nullptr, stagingBuffer);
-            EXPECT_NE(nullptr, chunkPtr);
             EXPECT_NE(nullptr, origin);
             EXPECT_NE(nullptr, region);
 
@@ -97,19 +103,33 @@ class StagingBufferManagerFixture : public DeviceFixture {
             } else {
                 EXPECT_EQ(expectedRowsPerChunk, region[1]);
             }
+            auto offset = origin[1] - globalOrigin[1];
+            if (isRead) {
+                memcpy(stagingBuffer, imageData + rowPitch * offset, rowPitch * region[1]);
+            } else {
+                memcpy(imageData + rowPitch * offset, stagingBuffer, rowPitch * region[1]);
+            }
             expectedOrigin += region[1];
             chunkCounter++;
             reinterpret_cast<MockCommandStreamReceiver *>(csr)->taskCount++;
             return 0;
         };
         auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
-        auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, rowPitch, chunkWrite, csr);
+        auto ret = stagingBufferManager->performImageTransfer(hostPtr, globalOrigin, globalRegion, rowPitch, chunkTransfer, csr, isRead);
         auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-        EXPECT_EQ(0, ret);
+        EXPECT_EQ(0, memcmp(hostPtr, imageData, rowPitch * (numOfChunks * expectedRowsPerChunk + remainder)));
+        EXPECT_EQ(0, ret.chunkCopyStatus);
+        EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
         EXPECT_EQ(expectedChunks, chunkCounter);
-        EXPECT_EQ(1u, newUsmAllocations);
-        delete[] ptr;
+        auto expectedNewUsmAllocations = 1u;
+        if (isRead) {
+            expectedNewUsmAllocations = 2u;
+        }
+        EXPECT_EQ(expectedNewUsmAllocations, newUsmAllocations);
+        delete[] hostPtr;
+        delete[] imageData;
     }
 
     constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2;
@@ -178,16 +198,16 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForImageWrite
         {nonUsmBuffer, true, false},
     };
     for (auto i = 0; i < 4; i++) {
-        auto actualValid = stagingBufferManager->isValidForStagingWriteImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
+        auto actualValid = stagingBufferManager->isValidForStagingTransferImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies);
         EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid);
     }
 
     debugManager.flags.EnableCopyWithStagingBuffers.set(0);
-    EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false));
+    EXPECT_FALSE(stagingBufferManager->isValidForStagingTransferImage(*pDevice, nonUsmBuffer, false));
 
     debugManager.flags.EnableCopyWithStagingBuffers.set(-1);
     auto isStaingBuffersEnabled = pDevice->getProductHelper().isStagingBuffersEnabled();
-    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false));
+    EXPECT_EQ(isStaingBuffersEnabled, stagingBufferManager->isValidForStagingTransferImage(*pDevice, nonUsmBuffer, false));
     svmAllocsManager->freeSVMAlloc(usmBuffer);
 }
 
@@ -256,17 +276,17 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyR
     memset(usmBuffer, 0, totalCopySize);
     memset(nonUsmBuffer, 0xFF, totalCopySize);
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         chunkCounter++;
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        memcpy(chunkDst, stagingBuffer, chunkSize);
+        memcpy(chunkDst, chunkSrc, chunkSize);
         return expectedErrorCode;
     };
     auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs();
     auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
     auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_NE(0, memcmp(usmBuffer, nonUsmBuffer, totalCopySize));
     EXPECT_EQ(1u, chunkCounter);
     EXPECT_EQ(1u, newUsmAllocations);
@@ -286,10 +306,9 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenRe
     memset(usmBuffer, 0, totalCopySize);
     memset(nonUsmBuffer, 0xFF, totalCopySize);
 
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         chunkCounter++;
-        memcpy(stagingBuffer, chunkSrc, chunkSize);
-        memcpy(chunkDst, stagingBuffer, chunkSize);
+        memcpy(chunkDst, chunkSrc, chunkSize);
         if (chunkCounter <= numOfChunkCopies) {
             return 0;
         } else {
@@ -300,7 +319,8 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenRe
     auto ret = stagingBufferManager->performCopy(usmBuffer, nonUsmBuffer, totalCopySize, chunkCopy, csr);
     auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations;
 
-    EXPECT_EQ(expectedErrorCode, ret);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(numOfChunkCopies + 1, chunkCounter);
     EXPECT_EQ(1u, newUsmAllocations);
     svmAllocsManager->freeSVMAlloc(usmBuffer);
@@ -331,7 +351,7 @@ HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabled
     auto nonUsmBuffer = new unsigned char[totalCopySize];
 
     size_t flushTagsCalled = 0;
-    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) {
+    ChunkCopyFunction chunkCopy = [&](void *chunkSrc, void *chunkDst, size_t chunkSize) {
         if (ultCsr->flushTagUpdateCalled) {
             flushTagsCalled++;
             ultCsr->flushTagUpdateCalled = false;
@@ -362,28 +382,121 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteThenWhol
     size_t expectedChunks = 8;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, expectedChunks, 1};
-    imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithOriginThenWholeRegionCovered) {
     size_t expectedChunks = 8;
     const size_t globalOrigin[3] = {4, 4, 0};
     const size_t globalRegion[3] = {4, expectedChunks, 1};
-    imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithMultipleRowsPerChunkThenWholeRegionCovered) {
     size_t expectedChunks = 4;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, 8, 1};
-    imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithRemainderThenWholeRegionCovered) {
     size_t expectedChunks = 4;
     const size_t globalOrigin[3] = {0, 0, 0};
     const size_t globalRegion[3] = {4, 7, 1};
-    imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+    imageTransferThroughStagingBuffers(false, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadThenWholeRegionCovered) {
+    size_t expectedChunks = 8;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, expectedChunks, 1};
+    imageTransferThroughStagingBuffers(true, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithOriginThenWholeRegionCovered) {
+    size_t expectedChunks = 8;
+    const size_t globalOrigin[3] = {4, 4, 0};
+    const size_t globalRegion[3] = {4, expectedChunks, 1};
+    imageTransferThroughStagingBuffers(true, stagingBufferSize, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithMultipleRowsPerChunkThenWholeRegionCovered) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    imageTransferThroughStagingBuffers(true, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageReadWithRemainderThenWholeRegionCovered) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 7, 1};
+    imageTransferThroughStagingBuffers(true, MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks);
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangDuringChunkReadFromImageThenReturnImmediatelyWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    size_t chunkCounter = 0;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        return 0;
+    };
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(2u, chunkCounter);
+    delete[] ptr;
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangAfterChunkReadFromImageThenReturnWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 8, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    size_t chunkCounter = 0;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        if (chunkCounter == expectedChunks) {
+            ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+        }
+        return 0;
+    };
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(4u, chunkCounter);
+    delete[] ptr;
+}
+
+HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenGpuHangDuringRemainderChunkReadFromImageThenReturnImmediatelyWithFailure) {
+    size_t expectedChunks = 4;
+    const size_t globalOrigin[3] = {0, 0, 0};
+    const size_t globalRegion[3] = {4, 7, 1};
+    auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
+
+    auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
+    size_t chunkCounter = 0;
+    size_t remainderCounter = 4;
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
+        ++chunkCounter;
+        if (chunkCounter == remainderCounter - 1) {
+            ultCsr->waitForTaskCountReturnValue = WaitStatus::gpuHang;
+        }
+        return 0;
+    };
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, true);
+    EXPECT_EQ(0, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::gpuHang, ret.waitStatus);
+    EXPECT_EQ(remainderCounter - 1, chunkCounter);
+    delete[] ptr;
 }
 
 TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThenEarlyReturnWithFailure) {
@@ -394,13 +507,13 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThen
     auto ptr = new unsigned char[stagingBufferSize * expectedChunks];
 
     size_t chunkCounter = 0;
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
         ++chunkCounter;
         return expectedErrorCode;
     };
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr);
-    EXPECT_EQ(expectedErrorCode, ret);
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, false);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(1u, chunkCounter);
     delete[] ptr;
 }
@@ -414,16 +527,16 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteWith
 
     size_t chunkCounter = 0;
     size_t remainderCounter = 4;
-    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
+    ChunkTransferImageFunc chunkWrite = [&](void *stagingBuffer, const size_t *origin, const size_t *region) -> int32_t {
         ++chunkCounter;
         if (chunkCounter == remainderCounter) {
             return expectedErrorCode;
         }
         return 0;
     };
-    auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr);
-    EXPECT_EQ(expectedErrorCode, ret);
+    auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr, false);
+    EXPECT_EQ(expectedErrorCode, ret.chunkCopyStatus);
+    EXPECT_EQ(WaitStatus::ready, ret.waitStatus);
     EXPECT_EQ(remainderCounter, chunkCounter);
     delete[] ptr;
 }
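Aside on the GPU-hang read tests above: the expected chunkCounter values are consistent with a pipelined read that keeps two staging buffers in flight (the fixture likewise expects two new USM allocations for reads), so chunk i+2 can only be submitted after waiting on chunk i. The following toy model works under that assumption only; the real policy lives in performChunkTransfer, which is not shown in this excerpt.

#include <cstddef>
#include <cstdio>

enum class WaitStatus { notReady, ready, gpuHang };

int main() {
    const size_t numOfChunks = 4;      // 8 rows at 2 rows per chunk, as in the first hang test
    const size_t buffersInFlight = 2;  // assumption inferred from the tests
    const WaitStatus everyWait = WaitStatus::gpuHang; // forced before the call under test

    size_t submitted = 0;
    for (size_t i = 0; i < numOfChunks; ++i) {
        if (i >= buffersInFlight && everyWait == WaitStatus::gpuHang) {
            break; // the first wait already reports a hang
        }
        ++submitted;
    }
    std::printf("chunks submitted before hang: %zu\n", submitted); // 2, matching EXPECT_EQ(2u, ...)
    return 0;
}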