Copy host ptr on CPU if possible in clCreateBuffer

Use a CPU copy through a locked pointer when possible,
because this is faster than copying on the GPU.
Limit this path to buffers of at most 64 KB.

Related-To: NEO-7332

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2022-09-21 12:39:39 +00:00
committed by Compute-Runtime-Automation
parent 7ded401615
commit d8b7d56160
7 changed files with 166 additions and 14 deletions

View File

@@ -7,6 +7,7 @@
#include "opencl/source/mem_obj/buffer.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/root_device_environment.h"
@@ -410,19 +411,33 @@ Buffer *Buffer::create(Context *context,
auto &device = pBuffer->getContext()->getDevice(0u)->getDevice();
auto &hwInfo = device.getHardwareInfo();
auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
bool copyOnCpuAllowed = false == ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) &&
size <= Buffer::maxBufferSizeForCopyOnCpu &&
!isCompressionEnabled &&
hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed;
if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
}
if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(allocationInfo.memory) : nullptr) {
memcpy_s(ptrOffset(lockedPointer, pBuffer->getOffset()), size, hostPtr, size);
allocationInfo.memory->setAubWritable(true, GraphicsAllocation::defaultBank);
allocationInfo.memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
copyExecuted = true;
} else {
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
errcodeRet = CL_OUT_OF_RESOURCES;
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
}
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
errcodeRet = CL_OUT_OF_RESOURCES;
}
}
copyExecuted = true;
}
copyExecuted = true;
} else {
memcpy_s(allocationInfo.memory->getUnderlyingBuffer(), size, hostPtr, size);
copyExecuted = true;

View File

@@ -58,6 +58,7 @@ extern ValidateInputAndCreateBufferFunc validateInputAndCreateBuffer;
class Buffer : public MemObj {
public:
constexpr static size_t maxBufferSizeForReadWriteOnCpu = 10 * MB;
constexpr static size_t maxBufferSizeForCopyOnCpu = 64 * KB;
constexpr static cl_ulong maskMagic = 0xFFFFFFFFFFFFFFFFLL;
constexpr static cl_ulong objectMagic = MemObj::objectMagic | 0x02;
bool forceDisallowCPUCopy = false;