Copy host ptr on cpu if possible in clCreateBuffer
Use a CPU copy through a locked pointer when possible, because this is faster than copying on the GPU. Limit this to buffers of at most 64 KB. Related-To: NEO-7332 Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
parent
7ded401615
commit
d8b7d56160
|
@ -53,7 +53,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
|
|||
}
|
||||
|
||||
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
|
||||
auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
|
||||
auto blockQueue = false;
|
||||
auto taskLevel = 0u;
|
||||
|
@ -80,7 +80,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
|
|||
eventBuilder);
|
||||
}
|
||||
|
||||
commandStreamReceieverOwnership.unlock();
|
||||
commandStreamReceiverOwnership.unlock();
|
||||
queueOwnership.unlock();
|
||||
|
||||
// read/write buffers are always blocking
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include "opencl/source/mem_obj/buffer.h"
|
||||
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
|
@ -410,19 +411,33 @@ Buffer *Buffer::create(Context *context,
|
|||
auto &device = pBuffer->getContext()->getDevice(0u)->getDevice();
|
||||
auto &hwInfo = device.getHardwareInfo();
|
||||
auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
|
||||
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
|
||||
|
||||
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
|
||||
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
|
||||
bool copyOnCpuAllowed = false == ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) &&
|
||||
size <= Buffer::maxBufferSizeForCopyOnCpu &&
|
||||
!isCompressionEnabled &&
|
||||
hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed;
|
||||
if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
|
||||
copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
|
||||
}
|
||||
if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(allocationInfo.memory) : nullptr) {
|
||||
memcpy_s(ptrOffset(lockedPointer, pBuffer->getOffset()), size, hostPtr, size);
|
||||
allocationInfo.memory->setAubWritable(true, GraphicsAllocation::defaultBank);
|
||||
allocationInfo.memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
|
||||
copyExecuted = true;
|
||||
} else {
|
||||
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
|
||||
|
||||
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
|
||||
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
|
||||
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
|
||||
errcodeRet = CL_OUT_OF_RESOURCES;
|
||||
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
|
||||
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
|
||||
}
|
||||
|
||||
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
|
||||
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
|
||||
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
|
||||
errcodeRet = CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
}
|
||||
copyExecuted = true;
|
||||
}
|
||||
copyExecuted = true;
|
||||
} else {
|
||||
memcpy_s(allocationInfo.memory->getUnderlyingBuffer(), size, hostPtr, size);
|
||||
copyExecuted = true;
|
||||
|
|
|
@ -58,6 +58,7 @@ extern ValidateInputAndCreateBufferFunc validateInputAndCreateBuffer;
|
|||
class Buffer : public MemObj {
|
||||
public:
|
||||
constexpr static size_t maxBufferSizeForReadWriteOnCpu = 10 * MB;
|
||||
constexpr static size_t maxBufferSizeForCopyOnCpu = 64 * KB;
|
||||
constexpr static cl_ulong maskMagic = 0xFFFFFFFFFFFFFFFFLL;
|
||||
constexpr static cl_ulong objectMagic = MemObj::objectMagic | 0x02;
|
||||
bool forceDisallowCPUCopy = false;
|
||||
|
|
|
@ -102,7 +102,9 @@ struct BcsBufferTests : public ::testing::Test {
|
|||
cl_int retVal = CL_SUCCESS;
|
||||
};
|
||||
|
||||
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithInitializationDataAndBcsCsrWhenCreatingThenUseBlitOperation) {
|
||||
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithInitializationDataAndBcsCsrAndCpuCopyDisabledWhenCreatingThenUseBlitOperation) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.CopyHostPtrOnCpu.set(0);
|
||||
auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(bcsMockContext->bcsCsr.get());
|
||||
|
||||
static_cast<MockMemoryManager *>(device->getExecutionEnvironment()->memoryManager.get())->enable64kbpages[0] = true;
|
||||
|
@ -113,7 +115,9 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithInitializationDataAndBcsCsrWhe
|
|||
EXPECT_EQ(1u, bcsCsr->blitBufferCalled);
|
||||
}
|
||||
|
||||
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithNotDefaultRootDeviceIndexAndBcsCsrWhenCreatingThenUseBlitOperation) {
|
||||
HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferWithNotDefaultRootDeviceIndexAndBcsCsrAndCpuCopyDisabledWhenCreatingThenUseBlitOperation) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.CopyHostPtrOnCpu.set(0);
|
||||
auto rootDeviceIndex = 1u;
|
||||
auto hwInfo = *defaultHwInfo;
|
||||
hwInfo.capabilityTable.blitterOperationsSupported = true;
|
||||
|
|
|
@ -586,7 +586,9 @@ TEST(Buffer, givenZeroFlagsNoSharedContextAndCompressedBuffersDisabledWhenAlloca
|
|||
EXPECT_EQ(AllocationType::BUFFER, type);
|
||||
}
|
||||
|
||||
TEST(Buffer, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInSystemMemoryPoolThenAllocationIsWrittenByEnqueueWriteBuffer) {
|
||||
TEST(Buffer, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInSystemMemoryPoolAndCopyOnCpuDisabledThenAllocationIsWrittenByEnqueueWriteBuffer) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.CopyHostPtrOnCpu.set(0);
|
||||
ExecutionEnvironment *executionEnvironment = MockClDevice::prepareExecutionEnvironment(defaultHwInfo.get(), 0u);
|
||||
|
||||
auto *memoryManager = new MockMemoryManagerFailFirstAllocation(*executionEnvironment);
|
||||
|
@ -1804,6 +1806,134 @@ HWTEST_F(BufferHwFromDeviceTests, givenMultiGraphicsAllocationWhenCreateBufferHw
|
|||
alignedFree(ptr);
|
||||
}
|
||||
|
||||
// Exercises Buffer::create with CL_MEM_COPY_HOST_PTR and checks, case by case, which path
// copies the host data into the buffer:
//  - the locked-pointer CPU copy (memoryManager->lockResourceCalled increases), or
//  - the write-buffer path of the context's special queue (commandQueue->writeBufferCounter increases).
// Each case snapshots both counters before creating the buffer and compares them afterwards.
TEST(BufferCreateTests, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInSystemMemoryPoolAndCopyOnCpuEnabledThenAllocationIsWrittenUsingLockedPointerIfAllowed) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast<int32_t>(LocalMemoryAccessMode::CpuAccessAllowed));
|
||||
|
||||
auto executionEnvironment = new MockExecutionEnvironment(defaultHwInfo.get());
|
||||
auto memoryManager = new MockMemoryManager(true, *executionEnvironment);
|
||||
executionEnvironment->memoryManager.reset(memoryManager);
|
||||
|
||||
MockClDevice device(new MockDevice(executionEnvironment, mockRootDeviceIndex));
|
||||
ASSERT_TRUE(device.createEngines());
|
||||
MockContext context(&device, true);
|
||||
auto commandQueue = new MockCommandQueue(context);
|
||||
context.setSpecialQueue(commandQueue, mockRootDeviceIndex);
|
||||
// smallBufferSize sits exactly on the CPU-copy size limit, bigBufferSize is one byte above it.
constexpr size_t smallBufferSize = Buffer::maxBufferSizeForCopyOnCpu;
|
||||
constexpr size_t bigBufferSize = smallBufferSize + 1;
|
||||
char memory[smallBufferSize];
|
||||
char bigMemory[bigBufferSize];
|
||||
|
||||
{
|
||||
// cpu copy allowed
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled + 1);
|
||||
}
|
||||
{
|
||||
// buffer size over threshold -> cpu copy disallowed
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(bigMemory), bigMemory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter + 1);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled);
|
||||
}
|
||||
{
|
||||
// uses implicit scaling -> cpu copy disallowed
// The small buffer is used on purpose: its size is within Buffer::maxBufferSizeForCopyOnCpu,
// so the size threshold cannot be the reason the locked-pointer copy is skipped in this case.
|
||||
DebugManagerStateRestore subTestRestorer;
|
||||
DebugManager.flags.EnableWalkerPartition.set(1);
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter + 1);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled);
|
||||
}
|
||||
{
|
||||
// debug flag disabled -> cpu copy disallowed
|
||||
DebugManagerStateRestore subTestRestorer;
|
||||
DebugManager.flags.CopyHostPtrOnCpu.set(0);
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter + 1);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled);
|
||||
}
|
||||
{
|
||||
// debug flag enabled -> cpu copy forced
// The over-threshold buffer is used here to show that the flag overrides the size limit.
|
||||
DebugManagerStateRestore subTestRestorer;
|
||||
DebugManager.flags.CopyHostPtrOnCpu.set(1);
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(bigMemory), bigMemory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled + 1);
|
||||
}
|
||||
{
|
||||
// local memory cpu access disallowed -> cpu copy disallowed
|
||||
DebugManagerStateRestore subTestRestorer;
|
||||
DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast<int32_t>(LocalMemoryAccessMode::CpuAccessDisallowed));
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter + 1);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled);
|
||||
}
|
||||
// The remaining cases run with local memory support switched off in the mock memory manager.
memoryManager->localMemorySupported[mockRootDeviceIndex] = false;
|
||||
{
|
||||
// buffer not in local memory -> locked pointer not used
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled);
|
||||
}
|
||||
{
|
||||
// compressed buffer, not in local memory -> locked pointer not used
|
||||
DebugManagerStateRestore subTestRestorer;
|
||||
DebugManager.flags.RenderCompressedBuffersEnabled.set(1);
|
||||
DebugManager.flags.OverrideBufferSuitableForRenderCompression.set(1);
|
||||
cl_int retVal;
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
|
||||
auto writeBufferCounter = commandQueue->writeBufferCounter;
|
||||
size_t lockResourceCalled = memoryManager->lockResourceCalled;
|
||||
|
||||
std::unique_ptr<Buffer> buffer(Buffer::create(&context, flags, sizeof(memory), memory, retVal));
|
||||
ASSERT_NE(nullptr, buffer.get());
|
||||
EXPECT_EQ(commandQueue->writeBufferCounter, writeBufferCounter + 1);
|
||||
EXPECT_EQ(memoryManager->lockResourceCalled, lockResourceCalled);
|
||||
}
|
||||
}
|
||||
|
||||
class BufferL3CacheTests : public ::testing::TestWithParam<uint64_t> {
|
||||
public:
|
||||
void SetUp() override {
|
||||
|
|
|
@ -220,6 +220,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideL1CachePolicyInSurfaceStateAndStateless,
|
|||
DECLARE_DEBUG_VARIABLE(int32_t, PlaformSupportEvictIfNecessaryFlag, -1, "-1: default - platform specific, 0: disable, 1: enable")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceEvictOnlyIfNecessaryFlag, -1, "-1: default - driver selects when to use, 0: force never use this flag, 1: force always use this flag")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessMocsEncryptionBit, -1, "-1: default - 1: set encryption bit")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, CopyHostPtrOnCpu, -1, "-1: default, 0: disable, 1:enable, In clCreateBuffer with CL_MEM_COPY_HOST_PTR, copy memory using locked ptr on cpu")
|
||||
|
||||
/*LOGGING FLAGS*/
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level")
|
||||
|
|
|
@ -463,3 +463,4 @@ ForceStatelessMocsEncryptionBit = -1
|
|||
ExperimentalCopyThroughLock = -1
|
||||
ExperimentalH2DCpuCopyThreshold = -1
|
||||
ExperimentalD2HCpuCopyThreshold = -1
|
||||
CopyHostPtrOnCpu = -1
|
||||
|
|
Loading…
Reference in New Issue