feature: add pooling of USM global/constant surface

Related-To: NEO-12287
Signed-off-by: Fabian Zwoliński <fabian.zwolinski@intel.com>
Author: Fabian Zwoliński
Date: 2025-09-19 14:53:48 +00:00
Committed by: Compute-Runtime-Automation
Parent: 0b6b0e3954
Commit: a1c5fa1a13
15 changed files with 635 additions and 42 deletions
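
The change in a nutshell, as a minimal self-contained sketch (every name below is invented for illustration; this is not NEO's UsmMemAllocPool API): instead of creating a dedicated USM device allocation for every module's global or constant surface, the device keeps one small pool per surface type and sub-allocates chunks from a single shared backing allocation, so each surface is described by a base allocation plus an offset and a size.

    #include <cstddef>
    #include <optional>

    // Minimal model of surface pooling: one shared backing buffer per surface type,
    // from which each program's globals/constants get a (base, offset, size) chunk.
    // The real pool additionally tracks freed chunks, serviced-size limits and locking.
    struct SurfaceChunk {
        void *poolBase; // the shared backing allocation
        size_t offset;  // where this surface starts inside the pool
        size_t size;    // bytes reserved for this surface
    };

    class SurfacePoolSketch {
      public:
        void initialize(void *backingPtr, size_t backingSize) {
            base = backingPtr;
            capacity = backingSize;
            used = 0u;
        }
        bool isInitialized() const { return base != nullptr; }

        // Returns a chunk inside the pool, or nothing if the request does not fit;
        // the caller then falls back to a dedicated allocation.
        std::optional<SurfaceChunk> allocate(size_t size, size_t alignment) {
            const size_t alignedOffset = (used + alignment - 1) & ~(alignment - 1); // alignment must be a power of two
            if (!isInitialized() || alignedOffset + size > capacity) {
                return std::nullopt;
            }
            used = alignedOffset + size;
            return SurfaceChunk{base, alignedOffset, size};
        }

      private:
        void *base = nullptr;
        size_t capacity = 0u;
        size_t used = 0u;
    };

The diffs below wire this shape into NEO: Device owns one pool for constant surfaces and one for global surfaces, UsmMemAllocPool performs the actual chunking, and allocateGlobalsSurface hands back a SharedPoolAllocation carrying the (allocation, offset, size) triple.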

View File

@@ -77,6 +77,13 @@ Device::~Device() {
if (deviceUsmMemAllocPoolsManager) {
deviceUsmMemAllocPoolsManager->cleanup();
}
+ if (usmConstantSurfaceAllocPool) {
+ usmConstantSurfaceAllocPool->cleanup();
+ }
+ if (usmGlobalSurfaceAllocPool) {
+ usmGlobalSurfaceAllocPool->cleanup();
+ }
secondaryCsrs.clear();
executionEnvironment->memoryManager->releaseSecondaryOsContexts(this->getRootDeviceIndex());
commandStreamReceivers.clear();
@@ -222,6 +229,10 @@ bool Device::initializeCommonResources() {
deviceBitfields.emplace(getRootDeviceIndex(), getDeviceBitfield());
deviceUsmMemAllocPoolsManager.reset(new UsmMemAllocPoolsManager(getMemoryManager(), rootDeviceIndices, deviceBitfields, this, InternalMemoryType::deviceUnifiedMemory));
}
+ this->resetUsmConstantSurfaceAllocPool(new UsmMemAllocPool);
+ this->resetUsmGlobalSurfaceAllocPool(new UsmMemAllocPool);
return true;
}
@@ -267,6 +278,14 @@ void Device::cleanupUsmAllocationPool() {
}
}
+ void Device::resetUsmConstantSurfaceAllocPool(UsmMemAllocPool *usmMemAllocPool) {
+ this->usmConstantSurfaceAllocPool.reset(usmMemAllocPool);
+ }
+ void Device::resetUsmGlobalSurfaceAllocPool(UsmMemAllocPool *usmMemAllocPool) {
+ this->usmGlobalSurfaceAllocPool.reset(usmMemAllocPool);
+ }
bool Device::initDeviceFully() {
if (!getRootDeviceEnvironment().isExposeSingleDeviceMode()) {

View File

@@ -213,6 +213,12 @@ class Device : public ReferenceTrackedObject<Device>, NEO::NonCopyableAndNonMova
UsmMemAllocPool *getUsmMemAllocPool() {
return usmMemAllocPool.get();
}
+ UsmMemAllocPool *getUsmConstantSurfaceAllocPool() {
+ return usmConstantSurfaceAllocPool.get();
+ }
+ UsmMemAllocPool *getUsmGlobalSurfaceAllocPool() {
+ return usmGlobalSurfaceAllocPool.get();
+ }
MOCKABLE_VIRTUAL void stopDirectSubmissionAndWaitForCompletion();
MOCKABLE_VIRTUAL void pollForCompletion();
bool isAnyDirectSubmissionEnabled() const;
@@ -262,6 +268,9 @@ class Device : public ReferenceTrackedObject<Device>, NEO::NonCopyableAndNonMova
void resetUsmAllocationPool(UsmMemAllocPool *usmMemAllocPool);
void cleanupUsmAllocationPool();
+ void resetUsmConstantSurfaceAllocPool(UsmMemAllocPool *usmMemAllocPool);
+ void resetUsmGlobalSurfaceAllocPool(UsmMemAllocPool *usmMemAllocPool);
std::unordered_map<uint32_t, bool> crossAccessEnabledDevices;
bool canAccessPeer(QueryPeerAccessFunc queryPeerAccess, Device *peerDevice, bool &canAccess);
static void initializePeerAccessForDevices(QueryPeerAccessFunc queryPeerAccess, const std::vector<NEO::Device *> &devices);
@@ -353,6 +362,8 @@ class Device : public ReferenceTrackedObject<Device>, NEO::NonCopyableAndNonMova
TimestampPoolAllocator deviceTimestampPoolAllocator;
std::unique_ptr<UsmMemAllocPoolsManager> deviceUsmMemAllocPoolsManager;
std::unique_ptr<UsmMemAllocPool> usmMemAllocPool;
+ std::unique_ptr<UsmMemAllocPool> usmConstantSurfaceAllocPool;
+ std::unique_ptr<UsmMemAllocPool> usmGlobalSurfaceAllocPool;
std::atomic_uint32_t bufferPoolCount = 0u;
uint32_t maxBufferPoolCount = 0u;
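
The two getters above exist so the globals/constants allocator can pick the matching pool per allocation type; condensed, the selection logic added later in program_initialization.cpp amounts to the following (the helper name is illustrative, the called methods are the ones declared in this header):

    // Illustrative helper, not part of the commit: choose the device-owned pool
    // that matches the surface being allocated.
    NEO::UsmMemAllocPool *selectSurfacePool(NEO::Device &device, NEO::AllocationType allocationType) {
        if (allocationType == NEO::AllocationType::constantSurface) {
            return device.getUsmConstantSurfaceAllocPool();
        }
        return device.getUsmGlobalSurfaceAllocPool();
    }

The resetUsm*SurfaceAllocPool members mirror the existing resetUsmAllocationPool hook; presumably they are there so tests and derived device classes can swap in a different pool implementation.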

View File

@@ -27,7 +27,7 @@ class UsmMemAllocPool {
UsmMemAllocPool() = default;
virtual ~UsmMemAllocPool() = default;
- bool initialize(SVMAllocsManager *svmMemoryManager, const UnifiedMemoryProperties &memoryProperties, size_t poolSize, size_t minServicedSize, size_t maxServicedSize);
+ MOCKABLE_VIRTUAL bool initialize(SVMAllocsManager *svmMemoryManager, const UnifiedMemoryProperties &memoryProperties, size_t poolSize, size_t minServicedSize, size_t maxServicedSize);
bool initialize(SVMAllocsManager *svmMemoryManager, void *ptr, SvmAllocationData *svmData, size_t minServicedSize, size_t maxServicedSize);
bool isInitialized() const;
size_t getPoolSize() const;
@@ -37,14 +37,15 @@ class UsmMemAllocPool {
static double getPercentOfFreeMemoryForRecycling(InternalMemoryType memoryType);
bool sizeIsAllowed(size_t size);
bool canBePooled(size_t size, const UnifiedMemoryProperties &memoryProperties);
- void *createUnifiedMemoryAllocation(size_t size, const UnifiedMemoryProperties &memoryProperties);
+ MOCKABLE_VIRTUAL void *createUnifiedMemoryAllocation(size_t size, const UnifiedMemoryProperties &memoryProperties);
bool isInPool(const void *ptr) const;
bool isEmpty();
- bool freeSVMAlloc(const void *ptr, bool blocking);
+ MOCKABLE_VIRTUAL bool freeSVMAlloc(const void *ptr, bool blocking);
size_t getPooledAllocationSize(const void *ptr);
void *getPooledAllocationBasePtr(const void *ptr);
size_t getOffsetInPool(const void *ptr) const;
uint64_t getPoolAddress() const;
+ std::mutex &getMutex() noexcept { return mtx; }
static constexpr auto chunkAlignment = 512u;
static constexpr auto poolAlignment = MemoryConstants::pageSize2M;
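
Two things change in this header: initialize, createUnifiedMemoryAllocation and freeSVMAlloc become MOCKABLE_VIRTUAL so unit tests can intercept the pooling path, and getMutex() exposes the pool's lock so code that touches the shared backing allocation (the TBX writeMemory path in program_initialization.cpp below) can serialize with concurrent pool operations. A rough sketch of how a test double might use the mockable hooks; the mock class name is invented here, and it assumes a test build in which MOCKABLE_VIRTUAL expands to virtual:

    // Hypothetical test double (assumes MOCKABLE_VIRTUAL == virtual in test builds):
    // forces the pooled path to fail so allocateGlobalsSurface falls back to a
    // dedicated USM allocation, which a test can then assert on.
    class MockFailingUsmMemAllocPool : public NEO::UsmMemAllocPool {
      public:
        void *createUnifiedMemoryAllocation(size_t, const UnifiedMemoryProperties &) override {
            return nullptr; // simulate an exhausted or broken pool
        }
    };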

View File

@@ -7,6 +7,7 @@
#include "program_initialization.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/compiler_interface/linker.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/blit_commands_helper.h"
@@ -18,6 +19,8 @@
#include "shared/source/memory_manager/unified_memory_pooling.h"
#include "shared/source/program/program_info.h"
+ #include <mutex>
namespace NEO {
SharedPoolAllocation *allocateGlobalsSurface(NEO::SVMAllocsManager *const svmAllocManager, NEO::Device &device, size_t totalSize, size_t zeroInitSize, bool constant,
@@ -26,6 +29,8 @@ SharedPoolAllocation *allocateGlobalsSurface(NEO::SVMAllocsManager *const svmAll
size_t allocatedSize{0u};
bool globalsAreExported = false;
GraphicsAllocation *gpuAllocation = nullptr;
+ bool isAllocatedFromPool = false;
+ std::mutex *usmAllocPoolMutex = nullptr;
const auto rootDeviceIndex = device.getRootDeviceIndex();
const auto deviceBitfield = device.getDeviceBitfield();
@@ -42,27 +47,69 @@ SharedPoolAllocation *allocateGlobalsSurface(NEO::SVMAllocsManager *const svmAll
unifiedMemoryProperties.device = &device;
unifiedMemoryProperties.requestedAllocationType = allocationType;
unifiedMemoryProperties.isInternalAllocation = true;
- auto ptr = svmAllocManager->createUnifiedMemoryAllocation(totalSize, unifiedMemoryProperties);
- DEBUG_BREAK_IF(ptr == nullptr);
- if (ptr == nullptr) {
- return nullptr;
+ UsmMemAllocPool *allocPool = nullptr;
+ if (allocationType == AllocationType::constantSurface) {
+ allocPool = device.getUsmConstantSurfaceAllocPool();
+ } else {
+ allocPool = device.getUsmGlobalSurfaceAllocPool();
}
+ if (allocPool && device.getProductHelper().is2MBLocalMemAlignmentEnabled()) {
+ if (!allocPool->isInitialized()) {
+ constexpr size_t alignment = MemoryConstants::pageSize2M;
+ constexpr size_t poolSize = MemoryConstants::pageSize2M;
+ constexpr size_t minServicedSize = 0u;
+ constexpr size_t maxServicedSize = 2 * MemoryConstants::megaByte;
+ NEO::SVMAllocsManager::UnifiedMemoryProperties poolMemoryProperties(InternalMemoryType::deviceUnifiedMemory, alignment, rootDeviceIndices, subDeviceBitfields);
+ poolMemoryProperties.device = &device;
+ poolMemoryProperties.requestedAllocationType = allocationType;
+ poolMemoryProperties.isInternalAllocation = true;
+ allocPool->initialize(svmAllocManager, poolMemoryProperties, poolSize, minServicedSize, maxServicedSize);
+ }
+ if (allocPool->isInitialized()) {
+ unifiedMemoryProperties.alignment = MemoryConstants::pageSize;
+ auto pooledPtr = allocPool->createUnifiedMemoryAllocation(totalSize, unifiedMemoryProperties);
+ if (pooledPtr) {
+ allocationOffset = allocPool->getOffsetInPool(pooledPtr);
+ allocatedSize = allocPool->getPooledAllocationSize(pooledPtr);
+ auto usmAlloc = svmAllocManager->getSVMAlloc(reinterpret_cast<void *>(allocPool->getPoolAddress()));
+ UNRECOVERABLE_IF(usmAlloc == nullptr);
+ gpuAllocation = usmAlloc->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+ usmAllocPoolMutex = &allocPool->getMutex();
+ isAllocatedFromPool = true;
+ }
+ }
+ }
+ if (!gpuAllocation) {
+ auto ptr = svmAllocManager->createUnifiedMemoryAllocation(totalSize, unifiedMemoryProperties);
+ DEBUG_BREAK_IF(ptr == nullptr);
+ if (ptr == nullptr) {
+ return nullptr;
+ }
+ auto usmAlloc = svmAllocManager->getSVMAlloc(ptr);
+ UNRECOVERABLE_IF(usmAlloc == nullptr);
+ gpuAllocation = usmAlloc->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+ allocationOffset = 0u;
+ allocatedSize = gpuAllocation->getUnderlyingBufferSize();
+ }
- auto usmAlloc = svmAllocManager->getSVMAlloc(ptr);
- UNRECOVERABLE_IF(usmAlloc == nullptr);
- gpuAllocation = usmAlloc->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
} else {
gpuAllocation = device.getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex,
true, // allocateMemory
totalSize, allocationType,
false, // isMultiStorageAllocation
deviceBitfield});
+ if (nullptr == gpuAllocation) {
+ return nullptr;
+ }
+ allocationOffset = 0u;
+ allocatedSize = gpuAllocation->getUnderlyingBufferSize();
}
- if (!gpuAllocation) {
- return nullptr;
- }
- allocatedSize = gpuAllocation->getUnderlyingBufferSize();
auto &rootDeviceEnvironment = device.getRootDeviceEnvironment();
auto &productHelper = device.getProductHelper();
@@ -72,6 +119,28 @@ SharedPoolAllocation *allocateGlobalsSurface(NEO::SVMAllocsManager *const svmAll
auto success = MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *gpuAllocation),
device, gpuAllocation, allocationOffset, initData, initSize);
UNRECOVERABLE_IF(!success);
+ if (auto csr = device.getDefaultEngine().commandStreamReceiver;
+ isAllocatedFromPool && csr->getType() != NEO::CommandStreamReceiverType::hardware) {
+ auto writeMemoryOperation = [&]() {
+ constexpr uint32_t allBanks = std::numeric_limits<uint32_t>::max();
+ if (gpuAllocation->isTbxWritable(allBanks)) {
+ // initialize full page tables for the first time
+ csr->writeMemory(*gpuAllocation, false, 0, 0);
+ }
+ gpuAllocation->setTbxWritable(true, allBanks);
+ [[maybe_unused]] const auto writeMemoryStatus = csr->writeMemory(*gpuAllocation, true, allocationOffset, allocatedSize);
+ DEBUG_BREAK_IF(!writeMemoryStatus);
+ gpuAllocation->setTbxWritable(false, allBanks);
+ };
+ if (usmAllocPoolMutex) {
+ std::lock_guard<std::mutex> lock(*usmAllocPoolMutex);
+ writeMemoryOperation();
+ } else {
+ writeMemoryOperation();
+ }
+ }
}
return new SharedPoolAllocation(gpuAllocation, allocationOffset, allocatedSize, nullptr);
}
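
Because several modules' surfaces can now live inside one GraphicsAllocation, the new TBX/AUB block above writes only the [allocationOffset, allocationOffset + allocatedSize) range and takes the pool mutex around writeMemory so it cannot race with another thread sub-allocating from the same pool, and the function keeps returning a SharedPoolAllocation carrying the (allocation, offset, size) triple. What a consumer does with that triple is outside this diff; the sketch below only spells out the implied address arithmetic (getGpuAddress() is GraphicsAllocation's existing accessor, the free function name is invented):

    // Illustrative only: the GPU virtual address of a pooled surface is the pool
    // allocation's base address plus the sub-allocation offset that
    // UsmMemAllocPool::getOffsetInPool() reported for it.
    uint64_t pooledSurfaceGpuAddress(NEO::GraphicsAllocation &poolAllocation, size_t offsetInPool) {
        return poolAllocation.getGpuAddress() + offsetInPool;
    }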