fix(ocl): track buffer pool count per device

Track the number of buffer pools created per device. Do not allocate extra
pools once the limit is reached. New contexts will have pooling disabled if
the limit has already been reached on the device.

Related-To: NEO-13461

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2024-12-16 10:19:12 +00:00
committed by Compute-Runtime-Automation
parent b6fc2b5861
commit e61d04a881
12 changed files with 140 additions and 23 deletions

View File

@ -35,6 +35,7 @@ ClDevice::ClDevice(Device &device, ClDevice &rootClDevice, Platform *platform) :
driverInfo.reset(DriverInfo::create(&device.getHardwareInfo(), osInterface));
initGTPinHelper();
initializeCaps();
initializeMaxPoolCount();
OpenClCFeaturesContainer emptyOpenClCFeatures;
compilerExtensions = convertEnabledExtensionsToCompilerInternalOptions(deviceInfo.deviceExtensions, emptyOpenClCFeatures);

View File

@ -147,6 +147,7 @@ class ClDevice : public BaseObject<_cl_device_id> {
void initializeOpenclCAllVersions();
void initializeILsWithVersion();
void initializeOsSpecificCaps();
void initializeMaxPoolCount();
void initGTPinHelper();
void setupFp64Flags();
const std::string getClDeviceName() const;

View File

@ -18,6 +18,7 @@
#include "shared/source/os_interface/driver_info.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/context/context.h"
#include "opencl/source/gtpin/gtpin_gfx_core_helper.h"
#include "opencl/source/helpers/cl_gfx_core_helper.h"
#include "opencl/source/sharings/sharing_factory.h"
@ -472,6 +473,14 @@ void ClDevice::initializeILsWithVersion() {
}
}
void ClDevice::initializeMaxPoolCount() {
auto &device = getDevice();
const auto bitfield = device.getDeviceBitfield();
const auto deviceMemory = device.getGlobalMemorySize(static_cast<uint32_t>(bitfield.to_ulong()));
const auto maxPoolCount = Context::BufferPoolAllocator::calculateMaxPoolCount(deviceMemory, 2);
device.updateMaxPoolCount(maxPoolCount);
}
// Returns the device name exposed through the OpenCL device-info query.
const std::string ClDevice::getClDeviceName() const {
    return getDevice().getDeviceInfo().name;
}

View File

@ -62,6 +62,8 @@ Context::~Context() {
}
if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled(this)) {
auto &device = this->getDevice(0)->getDevice();
device.recordPoolsFreed(smallBufferPoolAllocator.getPoolsCount());
smallBufferPoolAllocator.releasePools();
}
@ -628,11 +630,10 @@ Buffer *Context::BufferPool::allocate(const MemoryProperties &memoryProperties,
// Initialize small-buffer pooling for a new context. The span as captured
// here contained both the pre- and post-change lines of the diff (duplicate
// declaration of `device` plus the removed per-context maxPoolCount
// computation); only the coherent post-image is kept.
//
// A context starts with a single pool, but only if the owning device still
// has pool budget left (tracked device-wide via Device::requestPoolCreate).
// When the per-device limit is already reached, no pool is added and
// pooling is effectively disabled for this context.
void Context::BufferPoolAllocator::initAggregatedSmallBuffers(Context *context) {
    this->context = context;
    auto &device = context->getDevice(0)->getDevice();
    if (device.requestPoolCreate(1u)) {
        this->addNewBufferPool(Context::BufferPool{this->context});
    }
}
Buffer *Context::BufferPoolAllocator::allocateBufferFromPool(const MemoryProperties &memoryProperties,
@ -671,7 +672,8 @@ Buffer *Context::BufferPoolAllocator::allocateBufferFromPool(const MemoryPropert
return bufferFromPool;
}
if (this->bufferPools.size() < this->maxPoolCount) {
auto &device = context->getDevice(0)->getDevice();
if (device.requestPoolCreate(1u)) {
this->addNewBufferPool(BufferPool{this->context});
return this->allocateFromPools(memoryProperties, flags, flagsIntel, requestedSize, hostPtr, errcodeRet);
}

View File

@ -79,6 +79,10 @@ class Context : public BaseObject<_cl_context> {
void *hostPtr,
cl_int &errcodeRet);
bool flagsAllowBufferFromPool(const cl_mem_flags &flags, const cl_mem_flags_intel &flagsIntel) const;
// Compute how many small-buffer pools fit into the given percentage of
// device memory; at least one pool is always permitted.
static inline uint32_t calculateMaxPoolCount(uint64_t totalMemory, size_t percentOfMemory) {
    const double memoryBudget = totalMemory * (percentOfMemory / 100.0);
    const auto poolCount = static_cast<uint32_t>(memoryBudget / BufferPoolAllocator::aggregatedSmallBuffersPoolSize);
    return poolCount == 0u ? 1u : poolCount;
}
protected:
Buffer *allocateFromPools(const MemoryProperties &memoryProperties,
@ -87,13 +91,7 @@ class Context : public BaseObject<_cl_context> {
size_t requestedSize,
void *hostPtr,
cl_int &errcodeRet);
static inline size_t calculateMaxPoolCount(uint64_t totalMemory, size_t percentOfMemory) {
const auto maxPoolCount = static_cast<size_t>(totalMemory * (percentOfMemory / 100.0) / BufferPoolAllocator::aggregatedSmallBuffersPoolSize);
return maxPoolCount ? maxPoolCount : 1u;
}
Context *context{nullptr};
size_t maxPoolCount{1u};
};
static const cl_ulong objectMagic = 0xA4234321DC002130LL;

View File

@ -40,6 +40,7 @@ class AggregatedSmallBuffersTestTemplate : public ::testing::Test {
std::unique_ptr<UltClDeviceFactory> deviceFactory;
MockClDevice *device;
MockDevice *mockNeoDevice;
std::unique_ptr<MockContext> context;
MockBufferPoolAllocator *poolAllocator;
MockMemoryManager *mockMemoryManager;
@ -59,6 +60,11 @@ class AggregatedSmallBuffersTestTemplate : public ::testing::Test {
debugManager.flags.RenderCompressedBuffersEnabled.set(1);
this->deviceFactory = std::make_unique<UltClDeviceFactory>(2, 0);
this->device = deviceFactory->rootDevices[rootDeviceIndex];
this->mockNeoDevice = static_cast<MockDevice *>(&this->device->getDevice());
const auto bitfield = mockNeoDevice->getDeviceBitfield();
const auto deviceMemory = mockNeoDevice->getGlobalMemorySize(static_cast<uint32_t>(bitfield.to_ulong()));
const auto expectedMaxPoolCount = Context::BufferPoolAllocator::calculateMaxPoolCount(deviceMemory, 2);
EXPECT_EQ(expectedMaxPoolCount, mockNeoDevice->maxBufferPoolCount);
this->mockMemoryManager = static_cast<MockMemoryManager *>(device->getMemoryManager());
this->mockMemoryManager->localMemorySupported[rootDeviceIndex] = true;
this->setAllocationToFail(failMainStorageAllocation);
@ -69,7 +75,7 @@ class AggregatedSmallBuffersTestTemplate : public ::testing::Test {
EXPECT_EQ(retVal, CL_SUCCESS);
this->setAllocationToFail(false);
this->poolAllocator = static_cast<MockBufferPoolAllocator *>(&context->smallBufferPoolAllocator);
this->poolAllocator->maxPoolCount = 1u;
this->mockNeoDevice->updateMaxPoolCount(1u);
}
};
@ -301,7 +307,7 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndB
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndBufferPoolIsExhaustedAndAllocationsAreNotInUseAndNoBuffersFreedThenNewPoolIsCreated) {
this->poolAllocator->maxPoolCount = 2u;
mockNeoDevice->updateMaxPoolCount(2u);
EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get()));
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get());
@ -326,7 +332,7 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndB
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndBufferPoolIsExhaustedAndAllocationsAreInUseThenNewPoolIsCreated) {
this->poolAllocator->maxPoolCount = 2u;
mockNeoDevice->updateMaxPoolCount(2u);
EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get()));
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get());
@ -351,19 +357,19 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndB
}
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndBufferPoolIsExhaustedAndAllocationsAreInUseAndPoolLimitIsReachedThenNewPoolIsNotCreated) {
this->poolAllocator->maxPoolCount = 2u;
mockNeoDevice->updateMaxPoolCount(2u);
EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get()));
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get());
const std::vector<std::unique_ptr<Buffer>>::size_type buffersToCreate = (PoolAllocator::aggregatedSmallBuffersPoolSize / PoolAllocator::smallBufferThreshold) * poolAllocator->maxPoolCount;
const std::vector<std::unique_ptr<Buffer>>::size_type buffersToCreate = (PoolAllocator::aggregatedSmallBuffersPoolSize / PoolAllocator::smallBufferThreshold) * mockNeoDevice->maxBufferPoolCount;
std::vector<std::unique_ptr<Buffer>> buffers(buffersToCreate);
for (auto i = 0u; i < buffersToCreate; ++i) {
buffers[i].reset(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_EQ(retVal, CL_SUCCESS);
}
EXPECT_EQ(poolAllocator->maxPoolCount, poolAllocator->bufferPools.size());
for (auto i = 0u; i < poolAllocator->maxPoolCount; ++i) {
EXPECT_EQ(mockNeoDevice->maxBufferPoolCount, poolAllocator->bufferPools.size());
for (auto i = 0u; i < mockNeoDevice->maxBufferPoolCount; ++i) {
EXPECT_EQ(PoolAllocator::aggregatedSmallBuffersPoolSize, poolAllocator->bufferPools[i].chunkAllocator->getUsedSize());
}
EXPECT_EQ(1u, mockMemoryManager->allocInUseCalled);
@ -373,7 +379,7 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndB
std::unique_ptr<Buffer> bufferAfterExhaustMustFail(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_EQ(nullptr, bufferAfterExhaustMustFail.get());
EXPECT_NE(retVal, CL_SUCCESS);
EXPECT_EQ(poolAllocator->maxPoolCount, poolAllocator->bufferPools.size());
EXPECT_EQ(mockNeoDevice->maxBufferPoolCount, poolAllocator->bufferPools.size());
EXPECT_EQ(3u, mockMemoryManager->allocInUseCalled);
}
@ -460,6 +466,53 @@ TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndS
}
}
// Verifies that the buffer-pool budget is tracked device-wide rather than
// per context: pools created by one context count against the shared limit,
// a context created after the limit is reached gets no pool, and destroying
// a context returns its pools to the budget.
TEST_F(AggregatedSmallBuffersEnabledTest, givenAggregatedSmallBuffersEnabledAndMultipleContextsThenPoolLimitIsTrackedAcrossContexts) {
mockNeoDevice->updateMaxPoolCount(2u);
EXPECT_TRUE(poolAllocator->isAggregatedSmallBuffersEnabled(context.get()));
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_NE(nullptr, poolAllocator->bufferPools[0].mainStorage.get());
EXPECT_EQ(1u, mockNeoDevice->bufferPoolCount.load());
// A second context consumes the remaining pool budget on the device.
std::unique_ptr<MockContext> secondContext;
cl_device_id devices[] = {device};
secondContext.reset(Context::create<MockContext>(nullptr, ClDeviceVector(devices, 1), nullptr, nullptr, retVal));
EXPECT_EQ(retVal, CL_SUCCESS);
this->setAllocationToFail(false);
EXPECT_EQ(2u, mockNeoDevice->bufferPoolCount.load());
// Exhaust the first context's pool. With the device limit reached, no new
// pool may be created, so the next allocation must succeed outside the pool
// (the pool count and used size stay unchanged).
constexpr auto buffersToCreate = PoolAllocator::aggregatedSmallBuffersPoolSize / PoolAllocator::smallBufferThreshold;
std::vector<std::unique_ptr<Buffer>> buffers(buffersToCreate);
for (auto i = 0u; i < buffersToCreate; i++) {
buffers[i].reset(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_EQ(retVal, CL_SUCCESS);
}
std::unique_ptr<Buffer> bufferAfterExhaustMustSucceed(Buffer::create(context.get(), flags, size, hostPtr, retVal));
EXPECT_EQ(retVal, CL_SUCCESS);
EXPECT_EQ(1u, poolAllocator->bufferPools.size());
EXPECT_EQ(size * buffersToCreate, poolAllocator->bufferPools[0].chunkAllocator->getUsedSize());
EXPECT_FALSE(bufferAfterExhaustMustSucceed->isSubBuffer());
// Fake a global memory size chosen so calculateMaxPoolCount(mem, 2) == 2,
// then create a third context: budget is already fully used, so it gets
// no pool at all.
mockNeoDevice->callBaseGetGlobalMemorySize = false;
mockNeoDevice->getGlobalMemorySizeReturn = static_cast<uint64_t>(2 * 2 * MemoryConstants::megaByte / 0.02);
const auto bitfield = mockNeoDevice->getDeviceBitfield();
const auto deviceMemory = mockNeoDevice->getGlobalMemorySize(static_cast<uint32_t>(bitfield.to_ulong()));
EXPECT_EQ(2u, MockBufferPoolAllocator::calculateMaxPoolCount(deviceMemory, 2));
std::unique_ptr<MockContext> thirdContext;
thirdContext.reset(Context::create<MockContext>(nullptr, ClDeviceVector(devices, 1), nullptr, nullptr, retVal));
EXPECT_EQ(retVal, CL_SUCCESS);
MockBufferPoolAllocator *thirdPoolAllocator = static_cast<MockBufferPoolAllocator *>(&thirdContext->smallBufferPoolAllocator);
EXPECT_EQ(0u, thirdPoolAllocator->bufferPools.size());
EXPECT_EQ(2u, mockNeoDevice->bufferPoolCount.load());
// Destroying contexts releases their pools back to the device budget.
secondContext.reset(nullptr);
EXPECT_EQ(1u, mockNeoDevice->bufferPoolCount.load());
buffers.clear();
bufferAfterExhaustMustSucceed.reset(nullptr);
context.reset(nullptr);
EXPECT_EQ(0u, mockNeoDevice->bufferPoolCount.load());
}
TEST_F(AggregatedSmallBuffersKernelTest, givenBufferFromPoolWhenOffsetSubbufferIsPassedToSetKernelArgThenCorrectGpuVAIsPatched) {
std::unique_ptr<Buffer> unusedBuffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));
std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), flags, size, hostPtr, retVal));

View File

@ -59,7 +59,6 @@ class MockContext : public Context {
using BufferPoolAllocator::bufferPools;
using BufferPoolAllocator::calculateMaxPoolCount;
using BufferPoolAllocator::isAggregatedSmallBuffersEnabled;
using BufferPoolAllocator::maxPoolCount;
};
private:

View File

@ -181,7 +181,7 @@ class Device : public ReferenceTrackedObject<Device> {
void initializeRayTracing(uint32_t maxBvhLevels);
void allocateRTDispatchGlobals(uint32_t maxBvhLevels);
uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const;
MOCKABLE_VIRTUAL uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const;
const std::vector<SubDevice *> &getSubDevices() const { return subdevices; }
bool getUuid(std::array<uint8_t, ProductHelper::uuidSize> &uuid);
void generateUuid(std::array<uint8_t, ProductHelper::uuidSize> &uuid);
@ -237,6 +237,23 @@ class Device : public ReferenceTrackedObject<Device> {
return microsecondResolution;
}
// Sets the device-wide cap on how many small-buffer pools may exist.
// NOTE(review): plain (non-atomic) write; presumably only called during
// device/ClDevice initialization before pools are requested - TODO confirm.
void updateMaxPoolCount(uint32_t maxPoolCount) {
maxBufferPoolCount = maxPoolCount;
}
// Atomically reserve budget for `count` new buffer pools. Returns true when
// the reservation fits under maxBufferPoolCount; otherwise the reservation
// is rolled back and false is returned (the counter may transiently exceed
// the cap between the fetch_add and the rollback).
bool requestPoolCreate(uint32_t count) {
    const auto previousCount = bufferPoolCount.fetch_add(count);
    if (previousCount + count > maxBufferPoolCount) {
        bufferPoolCount -= count;
        return false;
    }
    return true;
}
// Returns `size` previously reserved pools to the device-wide budget.
// Caller must not report more pools freed than it successfully reserved
// via requestPoolCreate, or the unsigned counter underflows.
void recordPoolsFreed(uint32_t size) {
bufferPoolCount -= size;
}
protected:
Device() = delete;
Device(ExecutionEnvironment *executionEnvironment, const uint32_t rootDeviceIndex);
@ -314,8 +331,10 @@ class Device : public ReferenceTrackedObject<Device> {
std::unique_ptr<UsmMemAllocPoolsManager> deviceUsmMemAllocPoolsManager;
size_t allocationsSavedForReuseSize = 0u;
uint32_t microsecondResolution = 1000u;
std::atomic_uint32_t bufferPoolCount = 0u;
uint32_t maxBufferPoolCount = 0u;
mutable std::mutex allocationsReuseMtx;
uint32_t microsecondResolution = 1000u;
struct {
bool isValid = false;

View File

@ -84,6 +84,7 @@ class AbstractBuffersAllocator : public SmallBuffersParams<BuffersPoolType> {
void releasePools() { this->bufferPools.clear(); }
bool isPoolBuffer(const BufferParentType *buffer) const;
void tryFreeFromPoolBuffer(BufferParentType *possiblePoolBuffer, size_t offset, size_t size);
// Number of pools currently owned by this allocator. Const-qualified: the
// accessor only reads bufferPools, and callers holding a const allocator
// should be able to query it.
uint32_t getPoolsCount() const { return static_cast<uint32_t>(this->bufferPools.size()); }
protected:
inline bool isSizeWithinThreshold(size_t size) const { return smallBufferThreshold >= size; }

View File

@ -55,6 +55,7 @@ class MockDevice : public RootDevice {
using Device::addEngineToEngineGroup;
using Device::allEngines;
using Device::allocateDebugSurface;
using Device::bufferPoolCount;
using Device::commandStreamReceivers;
using Device::createDeviceInternals;
using Device::createEngine;
@ -65,6 +66,7 @@ class MockDevice : public RootDevice {
using Device::generateUuidFromPciBusInfo;
using Device::getGlobalMemorySize;
using Device::initializeCaps;
using Device::maxBufferPoolCount;
using Device::microsecondResolution;
using Device::preemptionMode;
using Device::regularEngineGroups;
@ -169,6 +171,14 @@ class MockDevice : public RootDevice {
stopDirectSubmissionCalled = true;
Device::stopDirectSubmissionAndWaitForCompletion();
}
// Test hook: returns the canned value set via getGlobalMemorySizeReturn
// unless callBaseGetGlobalMemorySize asks for the real Device computation.
uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const override {
    if (!callBaseGetGlobalMemorySize) {
        return getGlobalMemorySizeReturn;
    }
    return Device::getGlobalMemorySize(deviceBitfield);
}
static ExecutionEnvironment *prepareExecutionEnvironment(const HardwareInfo *pHwInfo);
static decltype(&createCommandStream) createCommandStreamReceiverFunc;
@ -180,6 +190,8 @@ class MockDevice : public RootDevice {
bool stopDirectSubmissionCalled = false;
ReleaseHelper *mockReleaseHelper = nullptr;
AILConfiguration *mockAilConfigurationHelper = nullptr;
uint64_t getGlobalMemorySizeReturn = 0u;
bool callBaseGetGlobalMemorySize = true;
};
template <>

View File

@ -349,6 +349,24 @@ TEST_F(DeviceTest, GivenDeviceWhenGenerateUuidFromPciBusInfoThenValidValuesAreSe
EXPECT_EQ(memcmp(&uuid, &expectedUuid, ProductHelper::uuidSize), 0);
}
// Exercises Device's pool-budget bookkeeping directly: over-budget requests
// are rejected without changing the counter, successful requests add to it,
// and recordPoolsFreed subtracts from it.
TEST_F(DeviceTest, givenDeviceWhenUsingBufferPoolsTrackingThenCountIsUpdated) {
pDevice->updateMaxPoolCount(3u);
EXPECT_EQ(3u, pDevice->maxBufferPoolCount);
EXPECT_EQ(0u, pDevice->bufferPoolCount.load());
// Requesting more than the cap fails and leaves the count untouched.
EXPECT_FALSE(pDevice->requestPoolCreate(4u));
EXPECT_EQ(0u, pDevice->bufferPoolCount.load());
EXPECT_TRUE(pDevice->requestPoolCreate(3u));
EXPECT_EQ(3u, pDevice->bufferPoolCount.load());
// Cap reached: further requests are rejected.
EXPECT_FALSE(pDevice->requestPoolCreate(1u));
EXPECT_EQ(3u, pDevice->bufferPoolCount.load());
pDevice->recordPoolsFreed(2u);
EXPECT_EQ(1u, pDevice->bufferPoolCount.load());
}
using DeviceGetCapsTest = Test<DeviceFixture>;
TEST_F(DeviceGetCapsTest, givenMockCompilerInterfaceWhenInitializeCapsIsCalledThenMaxParameterSizeIsSetCorrectly) {

View File

@ -108,6 +108,7 @@ TEST_F(AbstractSmallBuffersTest, givenBuffersAllocatorWhenPoolWithoutMainStorage
buffersAllocator.addNewBufferPool(std::move(pool));
EXPECT_EQ(buffersAllocator.bufferPools.size(), 0u);
EXPECT_EQ(buffersAllocator.getPoolsCount(), 0u);
}
TEST_F(AbstractSmallBuffersTest, givenBuffersAllocatorWhenNullptrTriedToBeFreedThenItIsNotConsideredValidBuffer) {
@ -164,8 +165,11 @@ TEST_F(AbstractSmallBuffersTest, givenBuffersAllocatorWhenChunkOfMainStorageTrie
auto poolStorage2 = pool2.mainStorage.get();
auto buffersAllocator = DummyBuffersAllocator{};
EXPECT_EQ(0u, buffersAllocator.getPoolsCount());
buffersAllocator.addNewBufferPool(std::move(pool1));
EXPECT_EQ(1u, buffersAllocator.getPoolsCount());
buffersAllocator.addNewBufferPool(std::move(pool2));
EXPECT_EQ(2u, buffersAllocator.getPoolsCount());
auto &chunksToFree1 = buffersAllocator.bufferPools[0].chunksToFree;
auto &chunksToFree2 = buffersAllocator.bufferPools[1].chunksToFree;