Copy host ptr on cpu if possible in clCreateBuffer

Use a CPU copy through a locked pointer when possible,
because this is faster than copying on the GPU.
The CPU copy is limited to buffers of at most 64 KB.

Related-To: NEO-7332

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek 2022-09-21 12:39:39 +00:00 committed by Compute-Runtime-Automation
parent 7ded401615
commit d8b7d56160
7 changed files with 166 additions and 14 deletions

View File

@ -53,7 +53,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
}
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
auto blockQueue = false;
auto taskLevel = 0u;
@ -80,7 +80,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
eventBuilder);
}
commandStreamReceieverOwnership.unlock();
commandStreamReceiverOwnership.unlock();
queueOwnership.unlock();
// read/write buffers are always blocking

View File

@ -7,6 +7,7 @@
#include "opencl/source/mem_obj/buffer.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/root_device_environment.h"
@ -410,19 +411,33 @@ Buffer *Buffer::create(Context *context,
auto &device = pBuffer->getContext()->getDevice(0u)->getDevice();
auto &hwInfo = device.getHardwareInfo();
auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
bool copyOnCpuAllowed = false == ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) &&
size <= Buffer::maxBufferSizeForCopyOnCpu &&
!isCompressionEnabled &&
hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed;
if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
}
if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(allocationInfo.memory) : nullptr) {
memcpy_s(ptrOffset(lockedPointer, pBuffer->getOffset()), size, hostPtr, size);
allocationInfo.memory->setAubWritable(true, GraphicsAllocation::defaultBank);
allocationInfo.memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
copyExecuted = true;
} else {
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
errcodeRet = CL_OUT_OF_RESOURCES;
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
}
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
errcodeRet = CL_OUT_OF_RESOURCES;
}
}
copyExecuted = true;
}
copyExecuted = true;
} else {
memcpy_s(allocationInfo.memory->getUnderlyingBuffer(), size, hostPtr, size);
copyExecuted = true;

View File

@ -58,6 +58,7 @@ extern ValidateInputAndCreateBufferFunc validateInputAndCreateBuffer;
class Buffer : public MemObj {
public:
constexpr static size_t maxBufferSizeForReadWriteOnCpu = 10 * MB;
constexpr static size_t maxBufferSizeForCopyOnCpu = 64 * KB;
constexpr static cl_ulong maskMagic = 0xFFFFFFFFFFFFFFFFLL;
constexpr static cl_ulong objectMagic = MemObj::objectMagic | 0x02;
bool forceDisallowCPUCopy = false;

View File

@ -102,7 +102,9 @@ struct BcsBufferTests : public ::testing::Test {
cl_int retVal = CL_SUCCESS;
};
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithInitializationDataAndBcsCsrWhenCreatingThenUseBlitOperation) {
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithInitializationDataAndBcsCsrAndCpuCopyDisabledWhenCreatingThenUseBlitOperation) {
DebugManagerStateRestore restorer;
DebugManager.flags.CopyHostPtrOnCpu.set(0);
auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(bcsMockContext->bcsCsr.get());
static_cast<MockMemoryManager *>(device->getExecutionEnvironment()->memoryManager.get())->enable64kbpages[0] = true;
@ -113,7 +115,9 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithInitializationDataAndBcsCsrWhe
EXPECT_EQ(1u, bcsCsr->blitBufferCalled);
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithNotDefaultRootDeviceIndexAndBcsCsrWhenCreatingThenUseBlitOperation) {
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithNotDefaultRootDeviceIndexAndBcsCsrAndCpuCopyDisabledWhenCreatingThenUseBlitOperation) {
DebugManagerStateRestore restorer;
DebugManager.flags.CopyHostPtrOnCpu.set(0);
auto rootDeviceIndex = 1u;
auto hwInfo = *defaultHwInfo;
hwInfo.capabilityTable.blitterOperationsSupported = true;

View File

@ -586,7 +586,9 @@ TEST(Buffer, givenZeroFlagsNoSharedContextAndCompressedBuffersDisabledWhenAlloca
EXPECT_EQ(AllocationType::BUFFER, type);
}
TEST(Buffer, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInSystemMemoryPoolThenAllocationIsWrittenByEnqueueWriteBuffer) {
TEST(Buffer, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInSystemMemoryPoolAndCopyOnCpuDisabledThenAllocationIsWrittenByEnqueueWriteBuffer) {
DebugManagerStateRestore restorer;
DebugManager.flags.CopyHostPtrOnCpu.set(0);
ExecutionEnvironment *executionEnvironment = MockClDevice::prepareExecutionEnvironment(defaultHwInfo.get(), 0u);
auto *memoryManager = new MockMemoryManagerFailFirstAllocation(*executionEnvironment);
@ -1804,6 +1806,134 @@ HWTEST_F(BufferHwFromDeviceTests, givenMultiGraphicsAllocationWhenCreateBufferHw
alignedFree(ptr);
}
// Checks which path Buffer::create uses to copy the host pointer into a
// CL_MEM_COPY_HOST_PTR buffer: a CPU memcpy through a locked pointer
// (observed via MockMemoryManager::lockResourceCalled) or an enqueueWriteBuffer
// on the special queue (observed via MockCommandQueue::writeBufferCounter).
TEST(BufferCreateTests, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInSystemMemoryPoolAndCopyOnCpuEnabledThenAllocationIsWrittenUsingLockedPointerIfAllowed) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast<int32_t>(LocalMemoryAccessMode::CpuAccessAllowed));

    auto executionEnvironment = new MockExecutionEnvironment(defaultHwInfo.get());
    auto memoryManager = new MockMemoryManager(true, *executionEnvironment);
    executionEnvironment->memoryManager.reset(memoryManager);

    MockClDevice device(new MockDevice(executionEnvironment, mockRootDeviceIndex));
    ASSERT_TRUE(device.createEngines());
    MockContext context(&device, true);
    auto commandQueue = new MockCommandQueue(context);
    context.setSpecialQueue(commandQueue, mockRootDeviceIndex);

    // One buffer exactly at the CPU-copy size limit and one a single byte above it.
    constexpr size_t smallBufferSize = Buffer::maxBufferSizeForCopyOnCpu;
    constexpr size_t bigBufferSize = smallBufferSize + 1;
    // Value-initialised so the copy never reads indeterminate bytes.
    char memory[smallBufferSize] = {};
    char bigMemory[bigBufferSize] = {};

    // Creates a buffer from hostPtr and verifies which copy path was taken by
    // comparing both counters against their values from before the call.
    // A fatal failure (ASSERT_*) only leaves this helper, not the whole test.
    auto createBufferAndVerifyCopyPath = [&](cl_mem_flags flags, size_t size, void *hostPtr,
                                             bool expectLockedPointerCopy, bool expectEnqueueWriteBuffer) {
        cl_int retVal = CL_INVALID_VALUE;
        auto writeBufferCounter = commandQueue->writeBufferCounter;
        size_t lockResourceCalled = memoryManager->lockResourceCalled;

        std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, size, hostPtr, retVal));

        ASSERT_NE(nullptr, buffer.get());
        EXPECT_EQ(CL_SUCCESS, retVal);
        EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter + (expectEnqueueWriteBuffer ? 1 : 0));
        EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled + (expectLockedPointerCopy ? 1 : 0));
    };

    {
        SCOPED_TRACE("cpu copy allowed");
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(memory), memory, true, false);
    }
    {
        SCOPED_TRACE("buffer size over threshold -> cpu copy disallowed");
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(bigMemory), bigMemory, false, true);
    }
    {
        SCOPED_TRACE("uses implicit scaling -> cpu copy disallowed");
        DebugManagerStateRestore subTestRestorer;
        DebugManager.flags.EnableWalkerPartition.set(1);
        // Uses the small buffer on purpose: with the over-threshold buffer the size
        // limit alone would disable the CPU copy and this case would prove nothing
        // about implicit scaling.
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(memory), memory, false, true);
    }
    {
        SCOPED_TRACE("debug flag disabled -> cpu copy disallowed");
        DebugManagerStateRestore subTestRestorer;
        DebugManager.flags.CopyHostPtrOnCpu.set(0);
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(memory), memory, false, true);
    }
    {
        SCOPED_TRACE("debug flag enabled -> cpu copy forced");
        DebugManagerStateRestore subTestRestorer;
        DebugManager.flags.CopyHostPtrOnCpu.set(1);
        // The over-threshold buffer shows that the flag overrides the size limit.
        createBufferAndVerifyCopyPath(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(bigMemory), bigMemory, true, false);
    }
    {
        SCOPED_TRACE("local memory cpu access disallowed -> cpu copy disallowed");
        DebugManagerStateRestore subTestRestorer;
        DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast<int32_t>(LocalMemoryAccessMode::CpuAccessDisallowed));
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(memory), memory, false, true);
    }

    // The remaining cases run with local memory support switched off in the mock.
    memoryManager->localMemorySupported[mockRootDeviceIndex] = false;

    {
        SCOPED_TRACE("buffer not in local memory -> locked pointer not used");
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(memory), memory, false, false);
    }
    {
        SCOPED_TRACE("compressed buffer, not in local memory -> locked pointer not used");
        DebugManagerStateRestore subTestRestorer;
        DebugManager.flags.RenderCompressedBuffersEnabled.set(1);
        DebugManager.flags.OverrideBufferSuitableForRenderCompression.set(1);
        createBufferAndVerifyCopyPath(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(memory), memory, false, true);
    }
}
class BufferL3CacheTests : public ::testing::TestWithParam<uint64_t> {
public:
void SetUp() override {

View File

@ -220,6 +220,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideL1CachePolicyInSurfaceStateAndStateless,
DECLARE_DEBUG_VARIABLE(int32_t, PlaformSupportEvictIfNecessaryFlag, -1, "-1: default - platform specific, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(int32_t, ForceEvictOnlyIfNecessaryFlag, -1, "-1: default - driver selects when to use, 0: force never use this flag, 1: force always use this flag")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessMocsEncryptionBit, -1, "-1: default - 1: set encryption bit")
// CopyHostPtrOnCpu: any value other than -1 replaces the driver's own decision
// on whether Buffer::create copies the host pointer on the CPU through a locked
// pointer (1 allows it, any other value disallows it); -1 keeps that decision.
DECLARE_DEBUG_VARIABLE(int32_t, CopyHostPtrOnCpu, -1, "-1: default, 0: disable, 1:enable, In clCreateBuffer with CL_MEM_COPY_HOST_PTR, copy memory using locked ptr on cpu")
/*LOGGING FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level")

View File

@ -463,3 +463,4 @@ ForceStatelessMocsEncryptionBit = -1
ExperimentalCopyThroughLock = -1
ExperimentalH2DCpuCopyThreshold = -1
ExperimentalD2HCpuCopyThreshold = -1
CopyHostPtrOnCpu = -1