performance: introduce staging reads from image

Related-To: NEO-12968

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-12-05 13:56:30 +00:00
committed by Compute-Runtime-Automation
parent f2725f217e
commit 6c4eb322b1
16 changed files with 702 additions and 241 deletions

View File

@@ -29,7 +29,6 @@
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/utilities/api_intercept.h"
#include "shared/source/utilities/staging_buffer_manager.h"
#include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
@@ -1557,146 +1556,4 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() {
}
}
// Copies `size` bytes from srcPtr to dstPtr by going through the context's staging buffer manager.
// The chunk callback handed to performCopy memcpy's each chunk into the supplied staging buffer and
// then submits it with a non-blocking enqueueSVMMemcpy on the CSR selected up front.
// Returns performCopy's result when it is not CL_SUCCESS, otherwise the result of postStagingTransferSync.
cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event) {
    CsrSelectionArgs selectionArgs{CL_COMMAND_SVM_MEMCPY, &size};
    selectionArgs.direction = TransferDirection::hostToLocal;
    auto *selectedCsr = &selectCsrForBuiltinOperation(selectionArgs);

    // Local event used only to collect timestamps that are later copied into the user's event.
    Event profilingEvent{this, CL_COMMAND_SVM_MEMCPY, CompletionStamp::notReady, CompletionStamp::notReady};
    if (isProfilingEnabled()) {
        profilingEvent.setQueueTimeStamp();
    }

    // Set by the callback: when the whole copy fit into one chunk, no OOQ barrier is needed afterwards.
    bool isSingleTransfer = false;
    ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) -> int32_t {
        const bool startsAtBeginning = (chunkDst == dstPtr);
        const bool reachesEnd = (ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size));
        isSingleTransfer = startsAtBeginning && reachesEnd;

        const bool stampFirstChunk = startsAtBeginning && isProfilingEnabled();
        if (stampFirstChunk) {
            profilingEvent.setSubmitTimeStamp();
        }

        memcpy(stagingBuffer, chunkSrc, chunkSize);

        // Decide which chunk (if any) hands the user's event to the enqueue call.
        cl_event *outEvent = nullptr;
        if (isSingleTransfer) {
            outEvent = event;
        } else {
            if (stampFirstChunk) {
                profilingEvent.setStartTimeStamp();
            }
            // For out-of-order queues the event is produced later by postStagingTransferSync.
            if (reachesEnd && !this->isOOQEnabled()) {
                outEvent = event;
            }
        }
        return this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, outEvent, selectedCsr);
    };

    const auto copyStatus = this->context->getStagingBufferManager()->performCopy(dstPtr, srcPtr, size, chunkCopy, selectedCsr);
    if (copyStatus != CL_SUCCESS) {
        return copyStatus;
    }
    return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy);
}
// Writes host data at `ptr` into dstImage by going through the context's staging buffer manager.
// The chunk callback handed to performImageWrite memcpy's each chunk into the supplied staging buffer
// and then submits it with a non-blocking enqueueWriteImageImpl on the CSR selected up front.
// First/last chunks are recognised by comparing origin[1] / region[1] against the global values.
// Returns performImageWrite's result when it is not CL_SUCCESS, otherwise the result of postStagingTransferSync.
cl_int CommandQueue::enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
                                              size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) {
    constexpr cl_command_type cmdType = CL_COMMAND_WRITE_IMAGE;
    CsrSelectionArgs selectionArgs{cmdType, nullptr, dstImage, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin};
    auto &selectedCsr = selectCsrForBuiltinOperation(selectionArgs);

    // Local event used only to collect timestamps that are later copied into the user's event.
    Event profilingEvent{this, CL_COMMAND_WRITE_IMAGE, CompletionStamp::notReady, CompletionStamp::notReady};
    if (isProfilingEnabled()) {
        profilingEvent.setQueueTimeStamp();
    }

    // Set by the callback: when the whole write fit into one chunk, no OOQ barrier is needed afterwards.
    bool isSingleTransfer = false;
    ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t {
        const bool startsAtFirstRow = (globalOrigin[1] == origin[1]);
        const bool reachesLastRow = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]);
        isSingleTransfer = startsAtFirstRow && reachesLastRow;

        const bool stampFirstChunk = startsAtFirstRow && isProfilingEnabled();
        if (stampFirstChunk) {
            profilingEvent.setSubmitTimeStamp();
        }

        memcpy(stagingBuffer, chunkPtr, bufferSize);

        // Decide which chunk (if any) hands the user's event to the enqueue call.
        cl_event *outEvent = nullptr;
        if (isSingleTransfer) {
            outEvent = event;
        } else {
            if (stampFirstChunk) {
                profilingEvent.setStartTimeStamp();
            }
            // For out-of-order queues the event is produced later by postStagingTransferSync.
            if (reachesLastRow && !this->isOOQEnabled()) {
                outEvent = event;
            }
        }
        return this->enqueueWriteImageImpl(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, outEvent, selectedCsr);
    };

    // A zero row pitch means rows are tightly packed: width in pixels times element size.
    const auto bytesPerPixel = dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
    auto dstRowPitch = globalRegion[0] * bytesPerPixel;
    if (inputRowPitch) {
        dstRowPitch = inputRowPitch;
    }

    const auto writeStatus = this->context->getStagingBufferManager()->performImageWrite(ptr, globalOrigin, globalRegion, dstRowPitch, chunkWrite, &selectedCsr);
    if (writeStatus != CL_SUCCESS) {
        return writeStatus;
    }
    return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy);
}
// Finalizes a staging transfer after all chunks have been enqueued.
//   event            - optional user event; when non-null it is populated/adjusted here.
//   profilingEvent   - local event whose timestamps and command type are copied into *event.
//   isSingleTransfer - true when the transfer consisted of exactly one chunk.
//   isBlocking       - when true, the queue is finished before returning.
// Returns the first non-success status encountered (barrier, then finish), or CL_SUCCESS.
cl_int CommandQueue::postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking) {
    cl_int syncStatus = CL_SUCCESS;
    if (event != nullptr) {
        // Multi-chunk transfers on an out-of-order queue did not hand the event to any chunk,
        // so a barrier is enqueued to produce it.
        if (!isSingleTransfer && this->isOOQEnabled()) {
            syncStatus = this->enqueueBarrierWithWaitList(0, nullptr, event);
        }
        // Only touch *event when it is known to have been produced; after a failed barrier
        // its content is not guaranteed and castToObjectOrAbort would abort on a bad handle.
        if (syncStatus == CL_SUCCESS) {
            auto pEvent = castToObjectOrAbort<Event>(*event);
            if (isProfilingEnabled()) {
                pEvent->copyTimestamps(profilingEvent, !isSingleTransfer);
                pEvent->setCPUProfilingPath(false);
            }
            pEvent->setCmdType(profilingEvent.getCommandType());
        }
    }
    if (isBlocking) {
        // Still drain the queue for blocking calls, but do not let a successful finish()
        // hide an earlier barrier failure.
        const cl_int finishStatus = this->finish();
        if (syncStatus == CL_SUCCESS) {
            syncStatus = finishStatus;
        }
    }
    return syncStatus;
}
// Decides whether a copy from srcPtr to dstPtr should go through the staging buffer path.
// Returns false when srcPtr already belongs to an existing map allocation; otherwise defers to the
// staging buffer manager's isValidForCopy, passing the OS context id of the CSR selected for the copy.
// Aborts via UNRECOVERABLE_IF when the context has no staging buffer manager.
bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies) {
    GraphicsAllocation *mappedAllocation = nullptr;
    context->tryGetExistingMapAllocation(srcPtr, size, mappedAllocation);
    const bool sourceIsMapped = (mappedAllocation != nullptr);
    if (sourceIsMapped) {
        // Direct transfer from mapped allocation is faster than staging buffer
        return false;
    }

    CsrSelectionArgs selectionArgs{CL_COMMAND_SVM_MEMCPY, nullptr};
    selectionArgs.direction = TransferDirection::hostToLocal;
    auto &selectedCsr = selectCsrForBuiltinOperation(selectionArgs);
    const auto osContextId = selectedCsr.getOsContext().getContextId();

    auto manager = context->getStagingBufferManager();
    UNRECOVERABLE_IF(manager == nullptr);
    return manager->isValidForCopy(device, dstPtr, srcPtr, size, hasDependencies, osContextId);
}
// Decides whether a write to `image` from `ptr` should go through the staging buffer path.
// Returns false when the context has no staging buffer manager or when the image is neither
// CL_MEM_OBJECT_IMAGE1D nor CL_MEM_OBJECT_IMAGE2D; otherwise defers to the manager's
// isValidForStagingWriteImage.
bool CommandQueue::isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies) {
    auto manager = context->getStagingBufferManager();
    if (!manager) {
        return false;
    }

    const auto imageType = image->getImageDesc().image_type;
    const bool isSupportedType = (imageType == CL_MEM_OBJECT_IMAGE1D) || (imageType == CL_MEM_OBJECT_IMAGE2D);
    if (!isSupportedType) {
        return false;
    }
    return manager->isValidForStagingWriteImage(this->getDevice(), ptr, hasDependencies);
}
} // namespace NEO