From e151bc6e2dcbb487009f7d6d6787d5eefe74892f Mon Sep 17 00:00:00 2001
From: Dominik Dabek
Date: Tue, 11 Oct 2022 13:16:40 +0000
Subject: [PATCH] [OCL] Flag for allocating small buffers from pool

Improves performance in workloads that create small OpenCL buffers.

To enable, set the env var ExperimentalSmallBufferPoolAllocator=1.

Known issues (will be addressed in further commits):
- a subBuffer cannot be created from such a buffer
- the pool's backing buffer allocation is not yet reused

Related-To: NEO-7332

Signed-off-by: Dominik Dabek
---
 opencl/source/context/context.cpp             |  65 ++++
 opencl/source/context/context.h               |  47 +++
 opencl/source/mem_obj/buffer.cpp              | 144 +++++--
 opencl/source/mem_obj/mem_obj.cpp             |   6 +-
 opencl/test/unit_test/mem_obj/CMakeLists.txt  |   3 +-
 .../mem_obj/buffer_pool_alloc_tests.cpp       | 358 ++++++++++++++++++
 .../test/unit_test/mem_obj/buffer_tests.cpp   |   2 +-
 opencl/test/unit_test/mocks/mock_buffer.h     |   1 +
 opencl/test/unit_test/mocks/mock_context.h    |   8 +
 .../debug_settings/debug_variables_base.inl   |   1 +
 .../test/common/mocks/mock_memory_manager.h   |   8 +-
 shared/test/common/test_files/igdrcl.config   |   3 +-
 12 files changed, 598 insertions(+), 48 deletions(-)
 create mode 100644 opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp

diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp
index ecc4a4e1b0..0ed299d24c 100644
--- a/opencl/source/context/context.cpp
+++ b/opencl/source/context/context.cpp
@@ -45,6 +45,10 @@ Context::Context(
 Context::~Context() {
     gtpinNotifyContextDestroy((cl_context)this);
+    if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled()) {
+        smallBufferPoolAllocator.releaseSmallBufferPool();
+    }
+
     delete[] properties;
 
     for (auto rootDeviceIndex = 0u; rootDeviceIndex < specialQueues.size(); rootDeviceIndex++) {
@@ -467,4 +471,65 @@ Platform *Context::getPlatformFromProperties(const cl_context_properties *proper
 bool Context::isSingleDeviceContext() {
     return devices[0]->getNumGenericSubDevices() == 0 && getNumDevices() == 1;
 }
+
+void Context::BufferPoolAllocator::initAggregatedSmallBuffers(Context *context) {
+    static constexpr cl_mem_flags flags{};
+    [[maybe_unused]] cl_int errcodeRet{};
+    this->mainStorage = Buffer::create(context,
+                                       flags,
+                                       BufferPoolAllocator::aggregatedSmallBuffersPoolSize,
+                                       nullptr,
+                                       errcodeRet);
+    if (this->mainStorage) {
+        this->chunkAllocator.reset(new HeapAllocator(BufferPoolAllocator::startingOffset,
+                                                     BufferPoolAllocator::aggregatedSmallBuffersPoolSize,
+                                                     BufferPoolAllocator::chunkAlignment));
+        context->decRefInternal();
+    }
+}
+
+Buffer *Context::BufferPoolAllocator::allocateBufferFromPool(const MemoryProperties &memoryProperties,
+                                                             cl_mem_flags flags,
+                                                             cl_mem_flags_intel flagsIntel,
+                                                             size_t size,
+                                                             void *hostPtr,
+                                                             cl_int &errcodeRet) {
+    errcodeRet = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    if (this->isAggregatedSmallBuffersEnabled() &&
+        this->isSizeWithinThreshold(size) &&
+        this->mainStorage) {
+        auto lock = std::unique_lock(this->mutex);
+        cl_buffer_region bufferRegion{};
+        bufferRegion.origin = static_cast<size_t>(this->chunkAllocator->allocate(size));
+        if (bufferRegion.origin == 0) {
+            return nullptr;
+        }
+        bufferRegion.origin -= BufferPoolAllocator::startingOffset;
+        bufferRegion.size = size;
+        auto bufferFromPool = this->mainStorage->createSubBuffer(flags, flagsIntel, &bufferRegion, errcodeRet);
+        bufferFromPool->createFunction = this->mainStorage->createFunction;
+        return bufferFromPool;
+    }
+    return nullptr;
+}
+
+bool Context::BufferPoolAllocator::isPoolBuffer(const MemObj *buffer) const {
+    return this->mainStorage == buffer;
+}
+
+void Context::BufferPoolAllocator::tryFreeFromPoolBuffer(MemObj *possiblePoolBuffer, size_t offset, size_t size) {
+    if (this->isPoolBuffer(possiblePoolBuffer)) {
+        auto lock = std::unique_lock(this->mutex);
+        DEBUG_BREAK_IF(!this->mainStorage);
+        auto internalBufferAddress = offset + BufferPoolAllocator::startingOffset;
+        this->chunkAllocator->free(internalBufferAddress, size);
+    }
+}
+
+void Context::BufferPoolAllocator::releaseSmallBufferPool() {
+    DEBUG_BREAK_IF(!this->mainStorage);
+    delete this->mainStorage;
+    this->mainStorage = nullptr;
+}
+
 } // namespace NEO
diff --git a/opencl/source/context/context.h b/opencl/source/context/context.h
index 84625e3608..942b5d3401 100644
--- a/opencl/source/context/context.h
+++ b/opencl/source/context/context.h
@@ -10,6 +10,7 @@
 #include "shared/source/helpers/common_types.h"
 #include "shared/source/helpers/string.h"
 #include "shared/source/unified_memory/unified_memory.h"
+#include "shared/source/utilities/heap_allocator.h"
 
 #include "opencl/source/cl_device/cl_device_vector.h"
 #include "opencl/source/context/context_type.h"
@@ -17,6 +18,7 @@
 #include "opencl/source/gtpin/gtpin_notify.h"
 #include "opencl/source/helpers/base_object.h"
 #include "opencl/source/helpers/destructor_callbacks.h"
+#include "opencl/source/mem_obj/buffer.h"
 #include "opencl/source/mem_obj/map_operations_handler.h"
 
 #include <map>
@@ -40,6 +42,42 @@ struct OpenCLObjectMapper<_cl_context> {
 
 class Context : public BaseObject<_cl_context> {
   public:
+    class BufferPoolAllocator {
+      public:
+        static constexpr auto aggregatedSmallBuffersPoolSize = 64 * KB;
+        static constexpr auto smallBufferThreshold = 4 * KB;
+        static constexpr auto chunkAlignment = 256u;
+        static constexpr auto startingOffset = chunkAlignment;
+
+        static_assert(aggregatedSmallBuffersPoolSize > smallBufferThreshold, "Largest allowed buffer needs to fit in pool");
+
+        Buffer *allocateBufferFromPool(const MemoryProperties &memoryProperties,
+                                       cl_mem_flags flags,
+                                       cl_mem_flags_intel flagsIntel,
+                                       size_t size,
+                                       void *hostPtr,
+                                       cl_int &errcodeRet);
+        void tryFreeFromPoolBuffer(MemObj *possiblePoolBuffer, size_t offset, size_t size);
+        void releaseSmallBufferPool();
+
+        inline bool isAggregatedSmallBuffersEnabled() const {
+            constexpr bool enable = false;
+            if (DebugManager.flags.ExperimentalSmallBufferPoolAllocator.get() != -1) {
+                return !!DebugManager.flags.ExperimentalSmallBufferPoolAllocator.get();
+            }
+            return enable;
+        }
+        void initAggregatedSmallBuffers(Context *context);
+
+        bool isPoolBuffer(const MemObj *buffer) const;
+
+      protected:
+        inline bool isSizeWithinThreshold(size_t size) const {
+            return BufferPoolAllocator::smallBufferThreshold >= size;
+        }
+
+        Buffer *mainStorage{nullptr};
+        std::unique_ptr<HeapAllocator> chunkAllocator;
+        std::mutex mutex;
+    };
+
     static const cl_ulong objectMagic = 0xA4234321DC002130LL;
 
     bool createImpl(const cl_context_properties *properties,
@@ -58,6 +96,11 @@
         if (!pContext->createImpl(properties, devices, funcNotify, data, errcodeRet)) {
             delete pContext;
             pContext = nullptr;
+        } else {
+            auto &bufferPoolAllocator = pContext->getBufferPoolAllocator();
+            if (bufferPoolAllocator.isAggregatedSmallBuffersEnabled()) {
+                bufferPoolAllocator.initAggregatedSmallBuffers(pContext);
+            }
         }
         gtpinNotifyContextCreate(pContext);
         return pContext;
@@ -176,6 +219,9 @@
     const std::map<uint32_t, DeviceBitfield> &getDeviceBitfields() const { return deviceBitfields; };
 
     static Platform *getPlatformFromProperties(const cl_context_properties *properties, cl_int &errcode);
+    BufferPoolAllocator &getBufferPoolAllocator() {
+        return this->smallBufferPoolAllocator;
+    }
 
   protected:
     struct BuiltInKernel {
@@ -211,6 +257,7 @@
     MapOperationsStorage mapOperationsStorage = {};
     StackVec<CommandQueue *, 1> specialQueues;
     DriverDiagnostics *driverDiagnostics = nullptr;
+    BufferPoolAllocator smallBufferPoolAllocator;
     uint32_t maxRootDeviceIndex = std::numeric_limits<uint32_t>::max();
     cl_bool preferD3dSharedResources = 0u;
diff --git a/opencl/source/mem_obj/buffer.cpp b/opencl/source/mem_obj/buffer.cpp
index 2d4d3e54fc..4157d6afae 100644
--- a/opencl/source/mem_obj/buffer.cpp
+++ b/opencl/source/mem_obj/buffer.cpp
@@ -175,6 +175,58 @@ Buffer *Buffer::create(Context *context,
                   flags, 0, size, hostPtr, errcodeRet);
 }
 
+bool inline copyHostPointer(Buffer *buffer,
+                            size_t size,
+                            void *hostPtr,
+                            GraphicsAllocation *memory,
+                            GraphicsAllocation *mapAllocation,
+                            uint32_t rootDeviceIndex,
+                            bool isCompressionEnabled,
+                            bool implicitScalingEnabled,
+                            cl_int &errcodeRet) {
+    const bool isLocalMemory = !MemoryPoolHelper::isSystemMemoryPool(memory->getMemoryPool());
+    const bool gpuCopyRequired = isCompressionEnabled || isLocalMemory;
+    if (gpuCopyRequired) {
+        auto context = buffer->getContext();
+        auto &device = context->getDevice(0u)->getDevice();
+        auto &hwInfo = device.getHardwareInfo();
+        auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
+        bool copyOnCpuAllowed = implicitScalingEnabled == false &&
+                                size <= Buffer::maxBufferSizeForCopyOnCpu &&
+                                isCompressionEnabled == false &&
+                                hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed &&
+                                memory->storageInfo.isLockable;
+        if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
+            copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
+        }
+        if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(memory) : nullptr) {
+            memcpy_s(ptrOffset(lockedPointer, buffer->getOffset()), size, hostPtr, size);
+            memory->setAubWritable(true, GraphicsAllocation::defaultBank);
+            memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
+            return true;
+        } else {
+            auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
+
+            if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
+                blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, memory, buffer->getOffset(), hostPtr, {size, 1, 1});
+            }
+
+            if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
+                auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
+                if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(buffer, CL_TRUE, buffer->getOffset(), size, hostPtr, mapAllocation, 0, nullptr, nullptr)) {
+                    errcodeRet = CL_OUT_OF_RESOURCES;
+                    return false;
+                }
+            }
+            return true;
+        }
+    } else {
+        memcpy_s(ptrOffset(memory->getUnderlyingBuffer(), buffer->getOffset()), size, hostPtr, size);
+        return true;
+    }
+    return false;
+}
+
 Buffer *Buffer::create(Context *context,
                        const MemoryProperties &memoryProperties,
                        cl_mem_flags flags,
@@ -184,6 +236,47 @@ Buffer *Buffer::create(Context *context,
                        cl_int &errcodeRet) {
     errcodeRet = CL_SUCCESS;
 
+    Context::BufferPoolAllocator &bufferPoolAllocator = context->getBufferPoolAllocator();
+    const bool implicitScalingEnabled = ImplicitScalingHelper::isImplicitScalingEnabled(context->getDevice(0u)->getDeviceBitfield(), true);
+    const bool useHostPtr = memoryProperties.flags.useHostPtr;
+    const bool copyHostPtr = memoryProperties.flags.copyHostPtr;
+    if (implicitScalingEnabled == false &&
+        useHostPtr == false &&
+        memoryProperties.flags.forceHostMemory == false) {
+        cl_int poolAllocRet = CL_SUCCESS;
+        auto bufferFromPool = bufferPoolAllocator.allocateBufferFromPool(memoryProperties,
+                                                                         flags,
+                                                                         flagsIntel,
+                                                                         size,
+                                                                         hostPtr,
+                                                                         poolAllocRet);
+        if (CL_SUCCESS == poolAllocRet) {
+            const bool needsCopy = copyHostPtr;
+            if (needsCopy) {
+                for (auto &rootDeviceIndex : context->getRootDeviceIndices()) {
+                    auto graphicsAllocation = bufferFromPool->getGraphicsAllocation(rootDeviceIndex);
+                    auto mapAllocation = bufferFromPool->getMapAllocation(rootDeviceIndex);
+                    bool isCompressionEnabled = graphicsAllocation->isCompressionEnabled();
+                    if (copyHostPointer(bufferFromPool,
+                                        size,
+                                        hostPtr,
+                                        graphicsAllocation,
+                                        mapAllocation,
+                                        rootDeviceIndex,
+                                        isCompressionEnabled,
+                                        implicitScalingEnabled,
+                                        poolAllocRet)) {
+                        break;
+                    }
+                }
+            }
+            if (!needsCopy || poolAllocRet == CL_SUCCESS) {
+                return bufferFromPool;
+            } else {
+                clReleaseMemObject(bufferFromPool);
+            }
+        }
+    }
+
     MemoryManager *memoryManager = context->getMemoryManager();
     UNRECOVERABLE_IF(!memoryManager);
 
@@ -194,9 +287,6 @@
     AllocationInfoType allocationInfos;
     allocationInfos.resize(maxRootDeviceIndex + 1ull);
 
-    const bool useHostPtr = memoryProperties.flags.useHostPtr;
-    const bool copyHostPtr = memoryProperties.flags.copyHostPtr;
-
     void *allocationCpuPtr = nullptr;
     bool forceCopyHostPtr = false;
 
@@ -404,45 +494,15 @@
         pBuffer->setHostPtrMinSize(size);
 
         if (allocationInfo.copyMemoryFromHostPtr && !copyExecuted) {
-            auto isLocalMemory = !MemoryPoolHelper::isSystemMemoryPool(allocationInfo.memory->getMemoryPool());
-            bool gpuCopyRequired = isCompressionEnabled || isLocalMemory;
-
-            if (gpuCopyRequired) {
-                auto &device = pBuffer->getContext()->getDevice(0u)->getDevice();
-                auto &hwInfo = device.getHardwareInfo();
-                auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
-                bool copyOnCpuAllowed = false == ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) &&
-                                        size <= Buffer::maxBufferSizeForCopyOnCpu &&
-                                        !isCompressionEnabled &&
-                                        hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed &&
-                                        allocationInfo.memory->storageInfo.isLockable;
-                if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
-                    copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
-                }
-                if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(allocationInfo.memory) : nullptr) {
-                    memcpy_s(ptrOffset(lockedPointer, pBuffer->getOffset()), size, hostPtr, size);
-                    allocationInfo.memory->setAubWritable(true, GraphicsAllocation::defaultBank);
-                    allocationInfo.memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
-                    copyExecuted = true;
-                } else {
-                    auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
-
-                    if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
-                        blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
-                    }
-
-                    if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
-                        auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
-                        if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
-                            errcodeRet = CL_OUT_OF_RESOURCES;
-                        }
-                    }
-                    copyExecuted = true;
-                }
-            } else {
-                memcpy_s(allocationInfo.memory->getUnderlyingBuffer(), size, hostPtr, size);
-                copyExecuted = true;
-            }
+            copyExecuted = copyHostPointer(pBuffer,
+                                           size,
+                                           hostPtr,
+                                           allocationInfo.memory,
+                                           allocationInfo.mapAllocation,
+                                           rootDeviceIndex,
+                                           isCompressionEnabled,
+                                           implicitScalingEnabled,
+                                           errcodeRet);
         }
     }
 
diff --git a/opencl/source/mem_obj/mem_obj.cpp b/opencl/source/mem_obj/mem_obj.cpp
index bae4619ccb..e31c75d2a8 100644
--- a/opencl/source/mem_obj/mem_obj.cpp
+++ b/opencl/source/mem_obj/mem_obj.cpp
@@ -104,6 +104,7 @@ MemObj::~MemObj() {
     }
     if (associatedMemObject) {
         associatedMemObject->decRefInternal();
+        context->getBufferPoolAllocator().tryFreeFromPoolBuffer(associatedMemObject, this->offset, this->size);
     }
     if (!associatedMemObject) {
         releaseAllocatedMapPtr();
@@ -112,7 +113,10 @@ MemObj::~MemObj() {
 
     destructorCallbacks.invoke(this);
 
-    context->decRefInternal();
+    const bool needDecrementContextRefCount = !context->getBufferPoolAllocator().isPoolBuffer(this);
+    if (needDecrementContextRefCount) {
+        context->decRefInternal();
+    }
 }
 
 cl_int MemObj::getMemObjectInfo(cl_mem_info paramName,
diff --git a/opencl/test/unit_test/mem_obj/CMakeLists.txt b/opencl/test/unit_test/mem_obj/CMakeLists.txt
index 086e9ae607..64b8b83a5e 100644
--- a/opencl/test/unit_test/mem_obj/CMakeLists.txt
+++ b/opencl/test/unit_test/mem_obj/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2021 Intel Corporation
+# Copyright (C) 2018-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -7,6 +7,7 @@
 set(IGDRCL_SRCS_tests_mem_obj
     ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
     ${CMAKE_CURRENT_SOURCE_DIR}/buffer_pin_tests.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/buffer_pool_alloc_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/buffer_set_arg_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/buffer_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/buffer_bcs_tests.cpp
diff --git a/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp
new file mode 100644
index 0000000000..da5f13489c
--- /dev/null
+++ b/opencl/test/unit_test/mem_obj/buffer_pool_alloc_tests.cpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/helpers/hw_helper.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/mocks/mock_memory_manager.h"
+#include "shared/test/common/test_macros/test.h"
+
+#include "opencl/test/unit_test/mocks/mock_buffer.h"
+#include "opencl/test/unit_test/mocks/mock_cl_device.h"
+#include "opencl/test/unit_test/mocks/mock_command_queue.h"
+
+using namespace NEO;
+namespace Ult {
+using PoolAllocator = Context::BufferPoolAllocator;
+using MockBufferPoolAllocator = MockContext::MockBufferPoolAllocator;
+
+template <int32_t poolBufferFlag, bool failMainStorageAllocation = false>
+class aggregatedSmallBuffersTestTemplate : public ::testing::Test {
+    void SetUp() override {
+        this->SetUpImpl();
+    }
+
+    void SetUpImpl() {
+        DebugManager.flags.ExperimentalSmallBufferPoolAllocator.set(poolBufferFlag);
+        this->deviceFactory = std::make_unique<UltClDeviceFactory>(1, 0);
+        this->device = deviceFactory->rootDevices[0];
+        this->mockMemoryManager = static_cast<MockMemoryManager *>(device->getMemoryManager());
+        this->mockMemoryManager->localMemorySupported[mockRootDeviceIndex] = true;
+        this->setAllocationToFail(failMainStorageAllocation);
+        cl_device_id devices[] = {device};
+        this->context.reset(Context::create<MockContext>(nullptr, ClDeviceVector(devices, 1), nullptr, nullptr, retVal));
+        ASSERT_EQ(retVal, CL_SUCCESS);
+        this->setAllocationToFail(false);
+        this->poolAllocator = static_cast<MockBufferPoolAllocator *>(&context->smallBufferPoolAllocator);
+    }
+
+    void TearDown() override {
+        if (this->context->getBufferPoolAllocator().isAggregatedSmallBuffersEnabled()) {
+            this->context->getBufferPoolAllocator().releaseSmallBufferPool();
+        }
+    }
+
+    void setAllocationToFail(bool shouldFail) {
+        this->mockMemoryManager->failInDevicePoolWithError = shouldFail;
+    }
+
+  public:
+    std::unique_ptr<UltClDeviceFactory> deviceFactory;
+    MockClDevice *device;
+    std::unique_ptr<MockContext> context;
+    MockBufferPoolAllocator *poolAllocator;
+    MockMemoryManager *mockMemoryManager;
+
+    cl_mem_flags flags{};
+    size_t size = PoolAllocator::smallBufferThreshold;
+    void *hostPtr = nullptr;
+    cl_int retVal = CL_SUCCESS;
+
+    DebugManagerStateRestore restore;
+};
+
+using aggregatedSmallBuffersDefaultTest = aggregatedSmallBuffersTestTemplate<-1>;
+
+TEST_F(aggregatedSmallBuffersDefaultTest, givenAggregatedSmallBuffersDefaultWhenCheckIfEnabledThenReturnFalse) {
+    EXPECT_FALSE(poolAllocator->isAggregatedSmallBuffersEnabled());
+}
+
+using aggregatedSmallBuffersDisabledTest = aggregatedSmallBuffersTestTemplate<0>;
+
+TEST_F(aggregatedSmallBuffersDisabledTest, givenAggregatedSmallBuffersDisabledWhenBufferCreateCalledThenDoNotUsePool) {
+    ASSERT_FALSE(poolAllocator->isAggregatedSmallBuffersEnabled());
+    ASSERT_EQ(poolAllocator->mainStorage, nullptr);
+    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+    EXPECT_NE(buffer, nullptr);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_EQ(poolAllocator->mainStorage, nullptr);
+}
+
+using aggregatedSmallBuffersEnabledTest = aggregatedSmallBuffersTestTemplate<1>;
+
+TEST_F(aggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndSizeLargerThanThresholdWhenBufferCreateCalledThenDoNotUsePool) {
+    ASSERT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled());
+    ASSERT_NE(poolAllocator->mainStorage, nullptr);
+    size = PoolAllocator::smallBufferThreshold + 1;
+    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+    EXPECT_NE(buffer, nullptr);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_NE(poolAllocator->mainStorage, nullptr);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndSizeEqualToThresholdWhenBufferCreateCalledThenUsePool) {
+    ASSERT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled());
+    ASSERT_NE(poolAllocator->mainStorage, nullptr);
+    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+
+    EXPECT_NE(buffer, nullptr);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_NE(poolAllocator->mainStorage, nullptr);
+    auto mockBuffer = static_cast<MockBuffer *>(buffer.get());
+    EXPECT_GE(mockBuffer->getSize(), size);
+    EXPECT_GE(mockBuffer->getOffset(), 0u);
+    EXPECT_LE(mockBuffer->getOffset(), PoolAllocator::aggregatedSmallBuffersPoolSize - size);
+    EXPECT_TRUE(mockBuffer->isSubBuffer());
+    EXPECT_EQ(poolAllocator->mainStorage, mockBuffer->associatedMemObject);
+
+    retVal = clReleaseMemObject(buffer.release());
+    EXPECT_EQ(retVal, CL_SUCCESS);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledTest, givenCopyHostPointerWhenCreatingBufferButCopyFailedThenDoNotUsePool) {
+    class MockCommandQueueFailFirstEnqueueWrite : public MockCommandQueue {
+      public:
+        cl_int enqueueWriteBuffer(Buffer *buffer, cl_bool blockingWrite, size_t offset, size_t size, const void *ptr,
+                                  GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                                  cl_event *event) override {
+            if (writeBufferCounter == 0) {
+                ++writeBufferCounter;
+                return CL_OUT_OF_RESOURCES;
+            }
+            return MockCommandQueue::enqueueWriteBuffer(buffer, blockingWrite, offset, size, ptr, mapAllocation, numEventsInWaitList, eventWaitList, event);
+        }
+    };
+    DebugManager.flags.CopyHostPtrOnCpu.set(0);
+
+    auto commandQueue = new MockCommandQueueFailFirstEnqueueWrite();
+    context->getSpecialQueue(mockRootDeviceIndex)->decRefInternal();
+    context->setSpecialQueue(commandQueue, mockRootDeviceIndex);
+
+    flags = CL_MEM_COPY_HOST_PTR;
+    unsigned char dataToCopy[PoolAllocator::smallBufferThreshold];
+    hostPtr = dataToCopy;
+
+    ASSERT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled());
+    ASSERT_NE(poolAllocator->mainStorage, nullptr);
+    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+    if (commandQueue->writeBufferCounter == 0) {
+        GTEST_SKIP();
+    }
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(buffer, nullptr);
+
+    auto mockBuffer = static_cast<MockBuffer *>(buffer.get());
+    EXPECT_FALSE(mockBuffer->isSubBuffer());
+    retVal = clReleaseMemObject(buffer.release());
+    EXPECT_EQ(retVal, CL_SUCCESS);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndSizeEqualToThresholdWhenBufferCreateCalledMultipleTimesThenUsePool) {
+    ASSERT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled());
+    ASSERT_NE(poolAllocator->mainStorage, nullptr);
+
+    constexpr auto buffersToCreate = PoolAllocator::aggregatedSmallBuffersPoolSize / PoolAllocator::smallBufferThreshold;
+    std::vector<std::unique_ptr<Buffer>> buffers(buffersToCreate);
+    for (auto i = 0u; i < buffersToCreate; i++) {
+        buffers[i].reset(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+        EXPECT_EQ(retVal, CL_SUCCESS);
+    }
+    EXPECT_NE(poolAllocator->mainStorage, nullptr);
+    std::unique_ptr<Buffer> bufferAfterPoolIsFull(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    EXPECT_NE(bufferAfterPoolIsFull, nullptr);
+    EXPECT_FALSE(bufferAfterPoolIsFull->isSubBuffer());
+
+    using Bounds = struct {
+        size_t left;
+        size_t right;
+    };
+
+    std::vector<Bounds> subBuffersBounds(buffersToCreate);
+
+    for (auto i = 0u; i < buffersToCreate; i++) {
+        // subbuffers are within pool buffer
+        EXPECT_NE(buffers[i], nullptr);
+        EXPECT_TRUE(buffers[i]->isSubBuffer());
+        auto mockBuffer = static_cast<MockBuffer *>(buffers[i].get());
+        EXPECT_EQ(poolAllocator->mainStorage, mockBuffer->associatedMemObject);
+        EXPECT_GE(mockBuffer->getSize(), size);
+        EXPECT_GE(mockBuffer->getOffset(), 0u);
+        EXPECT_LE(mockBuffer->getOffset(), PoolAllocator::aggregatedSmallBuffersPoolSize - size);
+
+        subBuffersBounds[i] = Bounds{mockBuffer->getOffset(), mockBuffer->getOffset() + mockBuffer->getSize()};
+    }
+
+    for (auto i = 0u; i < buffersToCreate; i++) {
+        for (auto j = i + 1; j < buffersToCreate; j++) {
+            // subbuffers do not overlap
+            EXPECT_TRUE(subBuffersBounds[i].right <= subBuffersBounds[j].left ||
+                        subBuffersBounds[j].right <= subBuffersBounds[i].left);
+        }
+    }
+
+    // freeing subbuffer frees space in pool
+    ASSERT_LT(poolAllocator->chunkAllocator->getLeftSize(), size);
+    clReleaseMemObject(buffers[0].release());
+    EXPECT_GE(poolAllocator->chunkAllocator->getLeftSize(), size);
+    std::unique_ptr<Buffer> bufferAfterPoolHasSpaceAgain(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(bufferAfterPoolHasSpaceAgain, nullptr);
+    EXPECT_TRUE(bufferAfterPoolHasSpaceAgain->isSubBuffer());
+
+    // subbuffer after free does not overlap
+    subBuffersBounds[0] = Bounds{bufferAfterPoolHasSpaceAgain->getOffset(), bufferAfterPoolHasSpaceAgain->getOffset() + bufferAfterPoolHasSpaceAgain->getSize()};
+    for (auto i = 0u; i < buffersToCreate; i++) {
+        for (auto j = i + 1; j < buffersToCreate; j++) {
+            EXPECT_TRUE(subBuffersBounds[i].right <= subBuffersBounds[j].left ||
+                        subBuffersBounds[j].right <= subBuffersBounds[i].left);
+        }
+    }
+}
+
+using aggregatedSmallBuffersEnabledTestDoNotRunSetup = aggregatedSmallBuffersTestTemplate<1, true>;
+
+TEST_F(aggregatedSmallBuffersEnabledTestDoNotRunSetup, givenAggregatedSmallBuffersEnabledAndSizeEqualToThresholdWhenBufferCreateCalledButPoolCreateFailedThenDoNotUsePool) {
+    ASSERT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled());
+    ASSERT_EQ(poolAllocator->mainStorage, nullptr);
+    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
+
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    EXPECT_NE(buffer.get(), nullptr);
+    EXPECT_EQ(poolAllocator->mainStorage, nullptr);
+}
+
+template <int32_t poolBufferFlag>
+class aggregatedSmallBuffersApiTestTemplate : public ::testing::Test {
+    void SetUp() override {
+        DebugManager.flags.ExperimentalSmallBufferPoolAllocator.set(poolBufferFlag);
+        this->deviceFactory = std::make_unique<UltClDeviceFactory>(1, 0);
+        auto device = deviceFactory->rootDevices[0];
+        cl_device_id devices[] = {device};
+        clContext = clCreateContext(nullptr, 1, devices, nullptr, nullptr, &retVal);
+        ASSERT_EQ(retVal, CL_SUCCESS);
+        context = castToObject<Context>(clContext);
+    }
+
+  public:
+    std::unique_ptr<UltClDeviceFactory> deviceFactory;
+
+    cl_mem_flags flags = CL_MEM_READ_WRITE;
+    size_t size = PoolAllocator::smallBufferThreshold;
+    cl_int retVal = CL_SUCCESS;
+    void *hostPtr{nullptr};
+    cl_context clContext{nullptr};
+    Context *context{nullptr};
+
+    DebugManagerStateRestore restore;
+};
+
+using aggregatedSmallBuffersDefaultApiTest = aggregatedSmallBuffersApiTestTemplate<-1>;
+
+TEST_F(aggregatedSmallBuffersDefaultApiTest, givenNoBufferCreatedWhenReleasingContextThenDoNotLeakMemory) {
+    EXPECT_EQ(clReleaseContext(context), CL_SUCCESS);
+}
+
+using aggregatedSmallBuffersEnabledApiTest = aggregatedSmallBuffersApiTestTemplate<1>;
+
+TEST_F(aggregatedSmallBuffersEnabledApiTest, givenNoBufferCreatedWhenReleasingContextThenDoNotLeakMemory) {
+    EXPECT_EQ(clReleaseContext(context), CL_SUCCESS);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledApiTest, givenNotSmallBufferWhenCreatingBufferThenDoNotUsePool) {
+    size = PoolAllocator::smallBufferThreshold + 1;
+    cl_mem buffer = clCreateBuffer(clContext, flags, size, hostPtr, &retVal);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(buffer, nullptr);
+
+    MockBuffer *asBuffer = static_cast<MockBuffer *>(buffer);
+    EXPECT_FALSE(asBuffer->isSubBuffer());
+
+    retVal = clReleaseMemObject(buffer);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_EQ(clReleaseContext(context), CL_SUCCESS);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledApiTest, givenSmallBufferWhenCreatingBufferThenUsePool) {
+    auto contextRefCountBefore = context->getRefInternalCount();
+    cl_mem smallBuffer = clCreateBuffer(clContext, flags, size, hostPtr, &retVal);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(smallBuffer, nullptr);
+
+    MockBuffer *asBuffer = static_cast<MockBuffer *>(smallBuffer);
+    EXPECT_TRUE(asBuffer->isSubBuffer());
+    Buffer *parentBuffer = static_cast<Buffer *>(asBuffer->associatedMemObject);
+    EXPECT_EQ(2, parentBuffer->getRefInternalCount());
+    MockBufferPoolAllocator *mockBufferPoolAllocator = static_cast<MockBufferPoolAllocator *>(&context->getBufferPoolAllocator());
+    EXPECT_EQ(parentBuffer, mockBufferPoolAllocator->mainStorage);
+
+    retVal = clReleaseMemObject(smallBuffer);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_EQ(context->getRefInternalCount(), contextRefCountBefore);
+
+    EXPECT_EQ(clReleaseContext(context), CL_SUCCESS);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledApiTest, givenSubBufferNotFromPoolAndAggregatedSmallBuffersEnabledWhenReleaseMemObjectCalledThenItSucceeds) {
+    DebugManagerStateRestore restore;
+    DebugManager.flags.ExperimentalSmallBufferPoolAllocator.set(0);
+    size_t size = PoolAllocator::smallBufferThreshold + 1;
+
+    cl_mem largeBuffer = clCreateBuffer(clContext, flags, size, hostPtr, &retVal);
+    ASSERT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(largeBuffer, nullptr);
+
+    cl_buffer_region region{};
+    region.size = 1;
+    cl_mem subBuffer = clCreateSubBuffer(largeBuffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &region, &retVal);
+    ASSERT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(subBuffer, nullptr);
+
+    DebugManager.flags.ExperimentalSmallBufferPoolAllocator.set(1);
+    retVal = clReleaseMemObject(subBuffer);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    retVal = clReleaseMemObject(largeBuffer);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_EQ(clReleaseContext(context), CL_SUCCESS);
+}
+
+TEST_F(aggregatedSmallBuffersEnabledApiTest, givenCopyHostPointerWhenCreatingBufferThenUsePoolAndCopyHostPointer) {
+    DebugManagerStateRestore restore;
+    DebugManager.flags.ExperimentalSmallBufferPoolAllocator.set(1);
+    flags |= CL_MEM_COPY_HOST_PTR;
+    unsigned char dataToCopy[PoolAllocator::smallBufferThreshold];
+    dataToCopy[0] = 123;
+    hostPtr = dataToCopy;
+    auto contextRefCountBefore = context->getRefInternalCount();
+    cl_mem smallBuffer = clCreateBuffer(clContext, flags, size, hostPtr, &retVal);
+    EXPECT_EQ(context->getRefInternalCount(), contextRefCountBefore + 1);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+    ASSERT_NE(smallBuffer, nullptr);
+
+    MockBuffer *asBuffer = static_cast<MockBuffer *>(smallBuffer);
+    EXPECT_TRUE(asBuffer->isSubBuffer());
+    Buffer *parentBuffer = static_cast<Buffer *>(asBuffer->associatedMemObject);
+    EXPECT_EQ(2, parentBuffer->getRefInternalCount());
+    MockBufferPoolAllocator *mockBufferPoolAllocator = static_cast<MockBufferPoolAllocator *>(&context->getBufferPoolAllocator());
+    EXPECT_EQ(parentBuffer, mockBufferPoolAllocator->mainStorage);
+
+    // check that data has been copied
+    auto address = asBuffer->getCpuAddress();
+    EXPECT_EQ(0, memcmp(hostPtr, address, size));
+
+    retVal = clReleaseMemObject(smallBuffer);
+    EXPECT_EQ(retVal, CL_SUCCESS);
+
+    EXPECT_EQ(context->getRefInternalCount(), contextRefCountBefore);
+
+    EXPECT_EQ(clReleaseContext(context), CL_SUCCESS);
+}
+} // namespace Ult
\ No newline at end of file
diff --git a/opencl/test/unit_test/mem_obj/buffer_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_tests.cpp
index 0573e31d30..029df0b0e3 100644
--- a/opencl/test/unit_test/mem_obj/buffer_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_tests.cpp
@@ -1040,7 +1040,7 @@ TEST_P(ValidHostPtr, WhenBufferIsCreatedThenItIsNotResident) {
     EXPECT_FALSE(buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex())->isResident(pDevice->getDefaultEngine().osContext->getContextId()));
 }
 
-TEST_P(ValidHostPtr, WhenBufferIsCreatedThenAddressMatechesOnlyForHostPtr) {
+TEST_P(ValidHostPtr, WhenBufferIsCreatedThenAddressMatchesOnlyForHostPtr) {
     buffer = createBuffer();
     ASSERT_NE(nullptr, buffer);
 
diff --git a/opencl/test/unit_test/mocks/mock_buffer.h b/opencl/test/unit_test/mocks/mock_buffer.h
index e42aaeb07d..e1d7ebf718 100644
--- a/opencl/test/unit_test/mocks/mock_buffer.h
+++ b/opencl/test/unit_test/mocks/mock_buffer.h
@@ -47,6 +47,7 @@ class MockBuffer : public MockBufferStorage, public Buffer {
     using Buffer::magic;
     using Buffer::offset;
     using Buffer::size;
+    using MemObj::associatedMemObject;
    using MemObj::context;
     using MemObj::isZeroCopy;
     using MemObj::memObjectType;
diff --git a/opencl/test/unit_test/mocks/mock_context.h b/opencl/test/unit_test/mocks/mock_context.h
index 50e8b3ce0c..3613682509 100644
--- a/opencl/test/unit_test/mocks/mock_context.h
+++ b/opencl/test/unit_test/mocks/mock_context.h
@@ -33,6 +33,7 @@ class MockContext : public Context {
     using Context::rootDeviceIndices;
     using Context::setupContextType;
     using Context::sharingFunctions;
+    using Context::smallBufferPoolAllocator;
     using Context::specialQueues;
     using Context::svmAllocsManager;
 
@@ -52,6 +53,13 @@
     std::unique_ptr<AsyncEventsHandler> &getAsyncEventsHandlerUniquePtr();
     void initializeWithDevices(const ClDeviceVector &devices, bool noSpecialQueue);
 
+    class MockBufferPoolAllocator : public BufferPoolAllocator {
+      public:
+        using BufferPoolAllocator::chunkAllocator;
+        using BufferPoolAllocator::isAggregatedSmallBuffersEnabled;
+        using BufferPoolAllocator::mainStorage;
+    };
+
   private:
     ClDevice *pDevice = nullptr;
 };
diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl
index 726ca60a49..8f48547ead 100644
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -436,6 +436,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableDeviceAllocationCache, -1, "Ex
 DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalH2DCpuCopyThreshold, -1, "Override default treshold (in bytes) for H2D CPU copy.")
 DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalD2HCpuCopyThreshold, -1, "Override default treshold (in bytes) for D2H CPU copy.")
 DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalCopyThroughLock, -1, "Experimentally copy memory through locked ptr. -1: default 0: disable 1: enable ")
+DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSmallBufferPoolAllocator, -1, "Experimentally enable pool allocator for clCreateBuffer calls of 4KB or smaller.")
 DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableSourceLevelDebugger, false, "Experimentally enable source level debugger.")
 DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableL0DebuggerForOpenCL, false, "Experimentally enable debugging OCL with L0 Debug API.")
 DECLARE_DEBUG_VARIABLE(bool, ExperimentalEnableTileAttach, true, "Experimentally enable attaching to tiles (subdevices).")
diff --git a/shared/test/common/mocks/mock_memory_manager.h b/shared/test/common/mocks/mock_memory_manager.h
index 6e5f6c9ce3..358f5356d7 100644
--- a/shared/test/common/mocks/mock_memory_manager.h
+++ b/shared/test/common/mocks/mock_memory_manager.h
@@ -409,14 +409,18 @@ class MockMemoryManagerWithDebuggableOsContext : public MockMemoryManager {
 class MockMemoryManagerWithCapacity : public MockMemoryManager {
   public:
     MockMemoryManagerWithCapacity(NEO::ExecutionEnvironment &executionEnvironment) : MockMemoryManager(executionEnvironment) {}
-    GraphicsAllocation *allocateGraphicsMemoryWithProperties(const AllocationProperties &properties) override {
+    GraphicsAllocation *allocateGraphicsMemoryWithProperties(const AllocationProperties &properties, const void *ptr) override {
         if (this->capacity >= properties.size) {
             this->capacity -= properties.size;
-            return MockMemoryManager::allocateGraphicsMemoryWithProperties(properties);
+            return MockMemoryManager::allocateGraphicsMemoryWithProperties(properties, ptr);
         }
         return nullptr;
     }
 
+    GraphicsAllocation *allocateGraphicsMemoryWithProperties(const AllocationProperties &properties) override {
+        return this->allocateGraphicsMemoryWithProperties(properties, nullptr);
+    }
+
     void freeGraphicsMemoryImpl(GraphicsAllocation *gfxAllocation) override {
         this->capacity += gfxAllocation->getUnderlyingBufferSize();
         MockMemoryManager::freeGraphicsMemoryImpl(gfxAllocation);
diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config
index 4f74cf2572..3d7cfbc2e1 100644
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -471,4 +471,5 @@ ExperimentalH2DCpuCopyThreshold = -1
 ExperimentalD2HCpuCopyThreshold = -1
 CopyHostPtrOnCpu = -1
 PrintCompletionFenceUsage = 0
-SetAmountOfReusableAllocations = -1
\ No newline at end of file
+SetAmountOfReusableAllocations = -1
+ExperimentalSmallBufferPoolAllocator = -1
\ No newline at end of file
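
Usage sketch (illustrative, not part of the diff): the flag is read when the driver
initializes, so it must be in the environment before the first OpenCL call. A minimal
host-side sketch, assuming POSIX setenv and a single GPU device, with error handling
omitted and buffer sizes chosen arbitrarily:

    #include <CL/cl.h>
    #include <stdlib.h>

    int main(void) {
        /* must be set before the runtime reads its debug variables */
        setenv("ExperimentalSmallBufferPoolAllocator", "1", 1);

        cl_platform_id platform;
        cl_device_id device;
        cl_int err;
        clGetPlatformIDs(1, &platform, NULL);
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
        cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

        /* at most smallBufferThreshold (4 KB) and no CL_MEM_USE_HOST_PTR:
           returned as a sub-buffer carved from the context's 64 KB pool */
        cl_mem small = clCreateBuffer(context, CL_MEM_READ_WRITE, 4 * 1024, NULL, &err);

        /* above the threshold: takes the regular allocation path */
        cl_mem large = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * 1024, NULL, &err);

        clReleaseMemObject(small); /* returns the chunk to the pool's heap allocator */
        clReleaseMemObject(large);
        clReleaseContext(context); /* ~Context() releases the pool's backing buffer */
        return 0;
    }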
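
Offset bookkeeping sketch (illustrative, not part of the diff): HeapAllocator signals
failure by returning offset 0, so a chunk that legitimately starts at byte 0 of the pool
must not be representable. allocateBufferFromPool therefore seeds the heap at
startingOffset (== chunkAlignment) and subtracts that bias before filling
cl_buffer_region, and tryFreeFromPoolBuffer adds it back when freeing. A self-contained
model of that arithmetic, where toyAllocate is a hypothetical stand-in for
HeapAllocator::allocate:

    #include <cassert>
    #include <cstdint>

    constexpr uint64_t chunkAlignment = 256;
    constexpr uint64_t startingOffset = chunkAlignment; // heap seeded here, never at 0

    // hypothetical stand-in: hands out the first chunk of the seeded range
    uint64_t toyAllocate() { return startingOffset; }

    int main() {
        uint64_t raw = toyAllocate();
        assert(raw != 0);                               // 0 is reserved for "failed"
        uint64_t origin = raw - startingOffset;         // bias removed: sub-buffer origin
        assert(origin == 0);                            // first chunk is byte 0 of the pool
        uint64_t freeAddress = origin + startingOffset; // bias restored when freeing
        assert(freeAddress == raw);
        return 0;
    }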