Files
compute-runtime/shared/source/memory_manager/unified_memory_manager.cpp
Szymon Morek bb15deca7e fix: do not wait on tag when releasing in-place
Related-To: GSD-11990

If the user releases a USM allocation without the defer policy, do not check
whether the allocation is in use when trying to reuse it.

Also, when the context is being destroyed, use the blocking policy to
make sure that the tag is updated and the driver notices that the
resource is no longer being used by the GPU during cleanup.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
2025-11-25 11:12:43 +01:00
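
As a rough illustration of the policy selection described above, here is a minimal, hypothetical sketch. The enum values follow the CompletionCheckPolicy names visible in the file below, but the helper function policyForFree is invented for illustration; the real logic lives in freeSVMAlloc and freeSVMAllocDefer.

#include <cstdio>

// Hypothetical mirror of how a free request maps to a completion-check policy.
enum class CompletionCheckPolicy {
    notRequired, // in-place (non-defer, non-blocking) free: skip the in-use check on reuse
    waitOnFree,  // blocking free: wait for engine completion before caching the allocation
    deferred     // deferred free: check the tag later, when the cached entry is reused or trimmed
};

CompletionCheckPolicy policyForFree(bool blocking, bool deferred) {
    if (deferred) {
        return CompletionCheckPolicy::deferred;
    }
    return blocking ? CompletionCheckPolicy::waitOnFree : CompletionCheckPolicy::notRequired;
}

int main() {
    // In-place release: no in-use check is required when the allocation is reused.
    bool checkRequired = policyForFree(false, false) == CompletionCheckPolicy::deferred;
    std::printf("in-place free -> in-use check required: %s\n", checkRequired ? "yes" : "no");
    return 0;
}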

1266 lines
60 KiB
C++

/*
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/device/sub_device.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/memory_properties_helpers.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/compression_selector.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_properties.h"
#include "shared/source/memory_manager/unified_memory_reuse_cleaner.h"
#include "shared/source/os_interface/device_factory.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "shared/source/release_helper/release_helper.h"
namespace NEO {
uint32_t UnifiedMemoryProperties::getRootDeviceIndex() const {
if (device) {
return device->getRootDeviceIndex();
}
UNRECOVERABLE_IF(rootDeviceIndices.begin() == nullptr);
return *rootDeviceIndices.begin();
}
void SVMAllocsManager::MapBasedAllocationTracker::insert(const SvmAllocationData &allocationsPair) {
allocations.insert(std::make_pair(reinterpret_cast<void *>(allocationsPair.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()), allocationsPair));
}
void SVMAllocsManager::MapBasedAllocationTracker::remove(const SvmAllocationData &allocationsPair) {
SvmAllocationContainer::iterator iter;
iter = allocations.find(reinterpret_cast<void *>(allocationsPair.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()));
allocations.erase(iter);
}
void SVMAllocsManager::MapBasedAllocationTracker::freeAllocations(NEO::MemoryManager &memoryManager) {
std::unique_lock<NEO::SpinLock> lock(mutex);
for (auto &allocation : allocations) {
for (auto &gpuAllocation : allocation.second.gpuAllocations.getGraphicsAllocations()) {
memoryManager.freeGraphicsMemory(gpuAllocation);
}
}
}
SVMAllocsManager::SvmAllocationCache::SvmAllocationCache() {
this->enablePerformanceLogging = NEO::debugManager.flags.LogUsmReuse.get();
}
bool SVMAllocsManager::SvmAllocationCache::insert(size_t size, void *ptr, SvmAllocationData *svmData, CompletionCheckPolicy completionPolicy) {
if (false == sizeAllowed(size) ||
svmData->isInternalAllocation ||
svmData->isImportedAllocation) {
return false;
}
std::unique_lock<std::mutex> lock(this->mtx);
if (svmData->device ? svmData->device->shouldLimitAllocationsReuse() : memoryManager->shouldLimitAllocationsReuse()) {
return false;
}
if (svmData->isSavedForReuse) {
return true;
}
bool isSuccess = true;
if (auto device = svmData->device) {
auto lock = device->usmReuseInfo.obtainAllocationsReuseLock();
if (size + device->usmReuseInfo.getAllocationsSavedForReuseSize() > device->usmReuseInfo.getMaxAllocationsSavedForReuseSize()) {
isSuccess = false;
} else {
device->usmReuseInfo.recordAllocationSaveForReuse(size);
}
} else {
auto lock = memoryManager->usmReuseInfo.obtainAllocationsReuseLock();
if (size + memoryManager->usmReuseInfo.getAllocationsSavedForReuseSize() > memoryManager->usmReuseInfo.getMaxAllocationsSavedForReuseSize()) {
isSuccess = false;
} else {
memoryManager->usmReuseInfo.recordAllocationSaveForReuse(size);
}
}
if (isSuccess) {
if (completionPolicy == CompletionCheckPolicy::waitOnFree) {
svmAllocsManager->waitForEnginesCompletion(svmData);
}
if (requireUpdatingAllocsForIndirectAccess) {
svmAllocsManager->removeFromAllocsForIndirectAccess(*svmData);
}
svmData->isSavedForReuse = true;
allocations.emplace(std::lower_bound(allocations.begin(), allocations.end(), size), size, ptr, svmData, completionPolicy == CompletionCheckPolicy::deferred);
empty = false;
if (auto usmReuseCleaner = this->memoryManager->peekExecutionEnvironment().unifiedMemoryReuseCleaner.get()) {
lock.unlock();
usmReuseCleaner->startThread();
usmReuseCleaner->notifySvmAllocationsCacheUpdate();
}
}
if (enablePerformanceLogging) {
logCacheOperation({.allocationSize = size,
.timePoint = std::chrono::high_resolution_clock::now(),
.allocationType = svmData->memoryType,
.operationType = CacheOperationType::insert,
.isSuccess = isSuccess});
}
return isSuccess;
}
bool SVMAllocsManager::SvmAllocationCache::allocUtilizationAllows(size_t requestedSize, size_t reuseCandidateSize) {
if (reuseCandidateSize >= SvmAllocationCache::minimalSizeToCheckUtilization) {
const auto allocUtilization = static_cast<double>(requestedSize) / reuseCandidateSize;
return allocUtilization >= SvmAllocationCache::minimalAllocUtilization;
}
return true;
}
bool SVMAllocsManager::SvmAllocationCache::alignmentAllows(void *ptr, size_t alignment) {
return 0u == alignment || isAligned(castToUint64(ptr), alignment);
}
bool SVMAllocsManager::SvmAllocationCache::isInUse(SvmCacheAllocationInfo &cacheAllocInfo) {
if (!cacheAllocInfo.isInUseCheckRequired) {
return false;
}
if (cacheAllocInfo.svmData->cpuAllocation && memoryManager->allocInUse(*cacheAllocInfo.svmData->cpuAllocation)) {
return true;
}
for (auto &gpuAllocation : cacheAllocInfo.svmData->gpuAllocations.getGraphicsAllocations()) {
if (gpuAllocation && memoryManager->allocInUse(*gpuAllocation)) {
return true;
}
}
return false;
}
void *SVMAllocsManager::SvmAllocationCache::get(size_t size, const UnifiedMemoryProperties &unifiedMemoryProperties) {
if (false == sizeAllowed(size)) {
return nullptr;
}
std::lock_guard<std::mutex> lock(this->mtx);
for (auto allocationIter = std::lower_bound(allocations.begin(), allocations.end(), size);
allocationIter != allocations.end();
++allocationIter) {
if (false == allocUtilizationAllows(size, allocationIter->allocationSize)) {
break;
}
void *allocationPtr = allocationIter->allocation;
DEBUG_BREAK_IF(nullptr == allocationIter->svmData);
if (allocationIter->svmData->device == unifiedMemoryProperties.device &&
allocationIter->svmData->allocationFlagsProperty.allFlags == unifiedMemoryProperties.allocationFlags.allFlags &&
allocationIter->svmData->allocationFlagsProperty.allAllocFlags == unifiedMemoryProperties.allocationFlags.allAllocFlags &&
alignmentAllows(allocationIter->allocation, unifiedMemoryProperties.alignment) &&
false == isInUse(*allocationIter)) {
if (allocationIter->svmData->device) {
auto lock = allocationIter->svmData->device->usmReuseInfo.obtainAllocationsReuseLock();
allocationIter->svmData->device->usmReuseInfo.recordAllocationGetFromReuse(allocationIter->allocationSize);
} else {
auto lock = memoryManager->usmReuseInfo.obtainAllocationsReuseLock();
memoryManager->usmReuseInfo.recordAllocationGetFromReuse(allocationIter->allocationSize);
}
if (enablePerformanceLogging) {
logCacheOperation({.allocationSize = allocationIter->allocationSize,
.timePoint = std::chrono::high_resolution_clock::now(),
.allocationType = allocationIter->svmData->memoryType,
.operationType = CacheOperationType::get,
.isSuccess = true});
}
allocationIter->svmData->size = size;
allocationIter->svmData->isSavedForReuse = false;
allocationIter->svmData->gpuAllocations.getDefaultGraphicsAllocation()->setAubWritable(true, std::numeric_limits<uint32_t>::max());
allocationIter->svmData->gpuAllocations.getDefaultGraphicsAllocation()->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
if (requireUpdatingAllocsForIndirectAccess) {
allocationIter->svmData->setAllocId(++svmAllocsManager->allocationsCounter);
svmAllocsManager->reinsertToAllocsForIndirectAccess(*allocationIter->svmData);
}
allocations.erase(allocationIter);
empty = allocations.empty();
return allocationPtr;
}
}
if (enablePerformanceLogging) {
logCacheOperation({.allocationSize = size,
.timePoint = std::chrono::high_resolution_clock::now(),
.allocationType = unifiedMemoryProperties.memoryType,
.operationType = CacheOperationType::get,
.isSuccess = false});
}
return nullptr;
}
void SVMAllocsManager::SvmAllocationCache::trim() {
std::lock_guard<std::mutex> lock(this->mtx);
for (auto &cachedAllocationInfo : this->allocations) {
DEBUG_BREAK_IF(nullptr == cachedAllocationInfo.svmData);
if (cachedAllocationInfo.svmData->device) {
auto lock = cachedAllocationInfo.svmData->device->usmReuseInfo.obtainAllocationsReuseLock();
cachedAllocationInfo.svmData->device->usmReuseInfo.recordAllocationGetFromReuse(cachedAllocationInfo.allocationSize);
} else {
auto lock = memoryManager->usmReuseInfo.obtainAllocationsReuseLock();
memoryManager->usmReuseInfo.recordAllocationGetFromReuse(cachedAllocationInfo.allocationSize);
}
if (enablePerformanceLogging) {
logCacheOperation({.allocationSize = cachedAllocationInfo.allocationSize,
.timePoint = std::chrono::high_resolution_clock::now(),
.allocationType = cachedAllocationInfo.svmData->memoryType,
.operationType = CacheOperationType::trim,
.isSuccess = true});
}
svmAllocsManager->freeSVMAllocImpl(cachedAllocationInfo.allocation, FreePolicyType::blocking, cachedAllocationInfo.svmData);
}
this->allocations.clear();
empty = true;
}
void SVMAllocsManager::SvmAllocationCache::cleanup() {
DEBUG_BREAK_IF(nullptr == this->memoryManager);
if (auto usmReuseCleaner = this->memoryManager->peekExecutionEnvironment().unifiedMemoryReuseCleaner.get()) {
usmReuseCleaner->unregisterSvmAllocationCache(this);
}
this->trim();
}
void SVMAllocsManager::SvmAllocationCache::logCacheOperation(const SvmAllocationCachePerfInfo &cachePerfEvent) const {
std::string allocationTypeString, operationTypeString, isSuccessString;
switch (cachePerfEvent.allocationType) {
case InternalMemoryType::deviceUnifiedMemory:
allocationTypeString = "device";
break;
case InternalMemoryType::hostUnifiedMemory:
allocationTypeString = "host";
break;
default:
allocationTypeString = "unknown";
break;
}
switch (cachePerfEvent.operationType) {
case CacheOperationType::get:
operationTypeString = "get";
break;
case CacheOperationType::insert:
operationTypeString = "insert";
break;
case CacheOperationType::trim:
operationTypeString = "trim";
break;
case CacheOperationType::trimOld:
operationTypeString = "trim_old";
break;
default:
operationTypeString = "unknown";
break;
}
isSuccessString = cachePerfEvent.isSuccess ? "TRUE" : "FALSE";
NEO::usmReusePerfLoggerInstance().log(true, ",",
cachePerfEvent.timePoint.time_since_epoch().count(), ",",
allocationTypeString, ",",
operationTypeString, ",",
cachePerfEvent.allocationSize, ",",
isSuccessString);
}
void SVMAllocsManager::SvmAllocationCache::trimOldAllocs(std::chrono::high_resolution_clock::time_point trimTimePoint, bool trimAll) {
std::lock_guard<std::mutex> lock(this->mtx);
auto allocCleanCandidateIndex = allocations.size();
while (0u != allocCleanCandidateIndex) {
auto &allocCleanCandidate = allocations[--allocCleanCandidateIndex];
if (allocCleanCandidate.saveTime > trimTimePoint) {
continue;
}
DEBUG_BREAK_IF(nullptr == allocCleanCandidate.svmData);
if (allocCleanCandidate.svmData->device) {
auto lock = allocCleanCandidate.svmData->device->usmReuseInfo.obtainAllocationsReuseLock();
allocCleanCandidate.svmData->device->usmReuseInfo.recordAllocationGetFromReuse(allocCleanCandidate.allocationSize);
} else {
auto lock = memoryManager->usmReuseInfo.obtainAllocationsReuseLock();
memoryManager->usmReuseInfo.recordAllocationGetFromReuse(allocCleanCandidate.allocationSize);
}
if (enablePerformanceLogging) {
logCacheOperation({.allocationSize = allocCleanCandidate.allocationSize,
.timePoint = std::chrono::high_resolution_clock::now(),
.allocationType = allocCleanCandidate.svmData->memoryType,
.operationType = CacheOperationType::trimOld,
.isSuccess = true});
}
svmAllocsManager->freeSVMAllocImpl(allocCleanCandidate.allocation, FreePolicyType::defer, allocCleanCandidate.svmData);
if (trimAll) {
allocCleanCandidate.markForDelete();
} else {
allocations.erase(allocations.begin() + allocCleanCandidateIndex);
break;
}
}
if (trimAll) {
std::erase_if(allocations, SvmCacheAllocationInfo::isMarkedForDelete);
}
empty = allocations.empty();
}
SvmAllocationData *SVMAllocsManager::MapBasedAllocationTracker::get(const void *ptr) {
if (allocations.size() == 0) {
return nullptr;
}
if (!ptr) {
return nullptr;
}
SvmAllocationContainer::iterator iter;
const SvmAllocationContainer::iterator end = allocations.end();
SvmAllocationData *svmAllocData;
// try the faster direct map lookup first if the pointer is page-aligned
if (isAligned<MemoryConstants::pageSize>(ptr)) {
iter = allocations.find(ptr);
if (iter != end) {
return &iter->second;
}
}
// otherwise do an additional lower_bound check, since the pointer may be offset into an allocation
iter = allocations.lower_bound(ptr);
if (((iter != end) && (iter->first != ptr)) ||
(iter == end)) {
if (iter == allocations.begin()) {
iter = end;
} else {
iter--;
}
}
if (iter != end) {
svmAllocData = &iter->second;
char *charPtr = reinterpret_cast<char *>(svmAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress());
if (ptr < (charPtr + svmAllocData->size)) {
return svmAllocData;
}
}
return nullptr;
}
void SVMAllocsManager::MapOperationsTracker::insert(SvmMapOperation mapOperation) {
operations.insert(std::make_pair(mapOperation.regionSvmPtr, mapOperation));
}
void SVMAllocsManager::MapOperationsTracker::remove(const void *regionPtr) {
SvmMapOperationsContainer::iterator iter;
iter = operations.find(regionPtr);
operations.erase(iter);
}
SvmMapOperation *SVMAllocsManager::MapOperationsTracker::get(const void *regionPtr) {
SvmMapOperationsContainer::iterator iter;
iter = operations.find(regionPtr);
if (iter == operations.end()) {
return nullptr;
}
return &iter->second;
}
void SVMAllocsManager::addInternalAllocationsToResidencyContainer(uint32_t rootDeviceIndex,
ResidencyContainer &residencyContainer,
uint32_t requestedTypesMask) {
ContainerReadLockType lock(mtx);
for (auto &allocation : this->svmAllocs.allocations) {
if (rootDeviceIndex >= allocation.second->gpuAllocations.getGraphicsAllocations().size()) {
continue;
}
if (!(static_cast<uint32_t>(allocation.second->memoryType) & requestedTypesMask) ||
(nullptr == allocation.second->gpuAllocations.getGraphicsAllocation(rootDeviceIndex))) {
continue;
}
auto alloc = allocation.second->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
residencyContainer.push_back(alloc);
}
}
void SVMAllocsManager::makeInternalAllocationsResident(CommandStreamReceiver &commandStreamReceiver, uint32_t requestedTypesMask) {
ContainerReadLockType lock(mtx);
for (auto &allocation : this->svmAllocs.allocations) {
if (static_cast<uint32_t>(allocation.second->memoryType) & requestedTypesMask) {
auto gpuAllocation = allocation.second->gpuAllocations.getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex());
if (gpuAllocation == nullptr) {
continue;
}
commandStreamReceiver.makeResident(*gpuAllocation);
}
}
}
SVMAllocsManager::SVMAllocsManager(MemoryManager *memoryManager)
: memoryManager(memoryManager) {
}
SVMAllocsManager::~SVMAllocsManager() = default;
void *SVMAllocsManager::createSVMAlloc(size_t size, const SvmAllocationProperties svmProperties,
const RootDeviceIndicesContainer &rootDeviceIndices,
const std::map<uint32_t, DeviceBitfield> &subdeviceBitfields) {
if (size == 0) {
return nullptr;
}
if (rootDeviceIndices.size() > 1) {
return createZeroCopySvmAllocation(size, svmProperties, rootDeviceIndices, subdeviceBitfields);
}
if (!memoryManager->isLocalMemorySupported(*rootDeviceIndices.begin())) {
return createZeroCopySvmAllocation(size, svmProperties, rootDeviceIndices, subdeviceBitfields);
} else {
UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::notSpecified, 1, rootDeviceIndices, subdeviceBitfields);
return createUnifiedAllocationWithDeviceStorage(size, svmProperties, unifiedMemoryProperties);
}
}
void *SVMAllocsManager::createHostUnifiedMemoryAllocation(size_t size,
const UnifiedMemoryProperties &memoryProperties) {
bool isDiscrete = false;
if (size >= MemoryConstants::pageSize2M && !debugManager.flags.NEO_CAL_ENABLED.get()) {
for (const auto rootDeviceIndex : memoryProperties.rootDeviceIndices) {
isDiscrete |= !this->memoryManager->peekExecutionEnvironment().rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo()->capabilityTable.isIntegratedDevice;
if (isDiscrete) {
break;
}
}
}
const size_t pageSizeForAlignment = isDiscrete ? MemoryConstants::pageSize2M : MemoryConstants::pageSize;
const size_t alignedSize = alignUp<size_t>(size, pageSizeForAlignment);
bool compressionEnabled = false;
AllocationType allocationType = getGraphicsAllocationTypeAndCompressionPreference(memoryProperties, compressionEnabled);
RootDeviceIndicesContainer rootDeviceIndicesVector(memoryProperties.rootDeviceIndices);
uint32_t rootDeviceIndex = rootDeviceIndicesVector.at(0);
auto &deviceBitfield = memoryProperties.subdeviceBitfields.at(rootDeviceIndex);
AllocationProperties unifiedMemoryProperties{rootDeviceIndex,
true,
alignedSize,
allocationType,
false,
(deviceBitfield.count() > 1),
deviceBitfield};
unifiedMemoryProperties.alignment = alignUpNonZero<size_t>(memoryProperties.alignment, pageSizeForAlignment);
unifiedMemoryProperties.flags.uncacheable = memoryProperties.allocationFlags.flags.locallyUncachedResource;
unifiedMemoryProperties.flags.preferCompressed = compressionEnabled;
unifiedMemoryProperties.flags.shareable = memoryProperties.allocationFlags.flags.shareable;
unifiedMemoryProperties.flags.isUSMHostAllocation = true;
unifiedMemoryProperties.flags.isHostInaccessibleAllocation = false;
unifiedMemoryProperties.cacheRegion = MemoryPropertiesHelper::getCacheRegion(memoryProperties.allocationFlags);
if (this->usmHostAllocationsCache) {
void *allocationFromCache = this->usmHostAllocationsCache->get(size, memoryProperties);
if (allocationFromCache) {
return allocationFromCache;
}
}
auto maxRootDeviceIndex = *std::max_element(rootDeviceIndicesVector.begin(), rootDeviceIndicesVector.end(), std::less<uint32_t const>());
SvmAllocationData allocData(maxRootDeviceIndex);
void *externalHostPointer = reinterpret_cast<void *>(memoryProperties.allocationFlags.hostptr);
void *usmPtr = memoryManager->createMultiGraphicsAllocationInSystemMemoryPool(rootDeviceIndicesVector, unifiedMemoryProperties, allocData.gpuAllocations, externalHostPointer);
if (!usmPtr) {
if (this->usmHostAllocationsCache) {
this->trimUSMHostAllocCache();
usmPtr = memoryManager->createMultiGraphicsAllocationInSystemMemoryPool(rootDeviceIndicesVector, unifiedMemoryProperties, allocData.gpuAllocations, externalHostPointer);
}
if (!usmPtr) {
return nullptr;
}
}
allocData.cpuAllocation = nullptr;
allocData.size = size;
allocData.memoryType = memoryProperties.memoryType;
allocData.allocationFlagsProperty = memoryProperties.allocationFlags;
allocData.device = nullptr;
allocData.pageSizeForAlignment = pageSizeForAlignment;
allocData.setAllocId(++this->allocationsCounter);
insertSVMAlloc(usmPtr, allocData);
return usmPtr;
}
void *SVMAllocsManager::createUnifiedMemoryAllocation(size_t size,
const UnifiedMemoryProperties &memoryProperties) {
auto rootDeviceIndex = memoryProperties.getRootDeviceIndex();
DeviceBitfield deviceBitfield = memoryProperties.subdeviceBitfields.at(rootDeviceIndex);
auto externalPtr = reinterpret_cast<void *>(memoryProperties.allocationFlags.hostptr);
bool useExternalHostPtrForCpu = externalPtr != nullptr;
const size_t pageSizeForAlignment = useExternalHostPtrForCpu ? MemoryConstants::pageSize : MemoryConstants::pageSize64k;
const size_t alignedSize = alignUp<size_t>(size, pageSizeForAlignment);
bool compressionEnabled = false;
AllocationType allocationType = getGraphicsAllocationTypeAndCompressionPreference(memoryProperties, compressionEnabled);
if (memoryProperties.device) {
auto *releaseHelper = memoryProperties.device->getReleaseHelper();
const bool peerAccess = memoryProperties.device->hasAnyPeerAccess().value_or(false);
if (peerAccess && !releaseHelper->isUsmCompressionSupportedOnPeerAccess()) {
compressionEnabled = false;
}
}
bool multiStorageAllocation = (deviceBitfield.count() > 1);
AllocationProperties unifiedMemoryProperties{rootDeviceIndex,
!useExternalHostPtrForCpu, // allocateMemory
alignedSize,
allocationType,
false,
multiStorageAllocation,
deviceBitfield};
unifiedMemoryProperties.alignment = alignUpNonZero<size_t>(memoryProperties.alignment, pageSizeForAlignment);
unifiedMemoryProperties.flags.isHostInaccessibleAllocation = false;
unifiedMemoryProperties.flags.shareable = memoryProperties.allocationFlags.flags.shareable;
unifiedMemoryProperties.flags.shareableWithoutNTHandle = memoryProperties.allocationFlags.flags.shareableWithoutNTHandle;
unifiedMemoryProperties.cacheRegion = MemoryPropertiesHelper::getCacheRegion(memoryProperties.allocationFlags);
unifiedMemoryProperties.flags.uncacheable = memoryProperties.allocationFlags.flags.locallyUncachedResource;
unifiedMemoryProperties.flags.preferCompressed = compressionEnabled || memoryProperties.allocationFlags.flags.compressedHint;
unifiedMemoryProperties.flags.preferCompressed &= memoryManager->isCompressionSupportedForShareable(memoryProperties.allocationFlags.flags.shareable);
unifiedMemoryProperties.flags.resource48Bit = memoryProperties.allocationFlags.flags.resource48Bit;
if (memoryProperties.memoryType == InternalMemoryType::deviceUnifiedMemory) {
unifiedMemoryProperties.flags.isHostInaccessibleAllocation = true;
if (this->usmDeviceAllocationsCache &&
false == memoryProperties.isInternalAllocation) {
void *allocationFromCache = this->usmDeviceAllocationsCache->get(size, memoryProperties);
if (allocationFromCache) {
return allocationFromCache;
}
}
} else if (memoryProperties.memoryType == InternalMemoryType::hostUnifiedMemory) {
unifiedMemoryProperties.flags.isUSMHostAllocation = true;
} else {
unifiedMemoryProperties.flags.isUSMHostAllocation = useExternalHostPtrForCpu;
}
GraphicsAllocation *unifiedMemoryAllocation = memoryManager->allocateGraphicsMemoryWithProperties(unifiedMemoryProperties, externalPtr);
if (!unifiedMemoryAllocation) {
if (memoryProperties.memoryType == InternalMemoryType::deviceUnifiedMemory &&
this->usmDeviceAllocationsCache) {
this->trimUSMDeviceAllocCache();
unifiedMemoryAllocation = memoryManager->allocateGraphicsMemoryWithProperties(unifiedMemoryProperties, externalPtr);
}
if (!unifiedMemoryAllocation) {
return nullptr;
}
}
setUnifiedAllocationProperties(unifiedMemoryAllocation, {});
SvmAllocationData allocData(rootDeviceIndex);
allocData.gpuAllocations.addAllocation(unifiedMemoryAllocation);
allocData.cpuAllocation = nullptr;
allocData.size = size;
allocData.pageSizeForAlignment = pageSizeForAlignment;
allocData.memoryType = memoryProperties.memoryType;
allocData.allocationFlagsProperty = memoryProperties.allocationFlags;
allocData.device = memoryProperties.device;
allocData.setAllocId(++this->allocationsCounter);
allocData.isInternalAllocation = memoryProperties.isInternalAllocation;
auto retPtr = reinterpret_cast<void *>(unifiedMemoryAllocation->getGpuAddress());
insertSVMAlloc(retPtr, allocData);
UNRECOVERABLE_IF(useExternalHostPtrForCpu && (externalPtr != retPtr));
return retPtr;
}
void *SVMAllocsManager::createSharedUnifiedMemoryAllocation(size_t size,
const UnifiedMemoryProperties &memoryProperties,
void *cmdQ) {
if (memoryProperties.rootDeviceIndices.size() > 1 && memoryProperties.device == nullptr) {
return createHostUnifiedMemoryAllocation(size, memoryProperties);
}
auto rootDeviceIndex = memoryProperties.getRootDeviceIndex();
auto supportDualStorageSharedMemory = memoryManager->isLocalMemorySupported(rootDeviceIndex);
if (debugManager.flags.AllocateSharedAllocationsWithCpuAndGpuStorage.get() != -1) {
supportDualStorageSharedMemory = !!debugManager.flags.AllocateSharedAllocationsWithCpuAndGpuStorage.get();
}
if (supportDualStorageSharedMemory) {
bool useKmdMigration = memoryManager->isKmdMigrationAvailable(rootDeviceIndex);
void *unifiedMemoryPointer = nullptr;
if (useKmdMigration) {
unifiedMemoryPointer = createUnifiedKmdMigratedAllocation(size, {}, memoryProperties);
if (!unifiedMemoryPointer) {
return nullptr;
}
} else {
unifiedMemoryPointer = createUnifiedAllocationWithDeviceStorage(size, {}, memoryProperties);
if (!unifiedMemoryPointer) {
return nullptr;
}
UNRECOVERABLE_IF(cmdQ == nullptr);
auto pageFaultManager = this->memoryManager->getPageFaultManager();
pageFaultManager->insertAllocation(unifiedMemoryPointer, size, this, cmdQ, memoryProperties.allocationFlags);
}
auto unifiedMemoryAllocation = this->getSVMAlloc(unifiedMemoryPointer);
unifiedMemoryAllocation->memoryType = memoryProperties.memoryType;
unifiedMemoryAllocation->allocationFlagsProperty = memoryProperties.allocationFlags;
return unifiedMemoryPointer;
}
return createUnifiedMemoryAllocation(size, memoryProperties);
}
void *SVMAllocsManager::createUnifiedKmdMigratedAllocation(size_t size, const SvmAllocationProperties &svmProperties, const UnifiedMemoryProperties &unifiedMemoryProperties) {
auto rootDeviceIndex = unifiedMemoryProperties.getRootDeviceIndex();
auto &deviceBitfield = unifiedMemoryProperties.subdeviceBitfields.at(rootDeviceIndex);
constexpr size_t pageSizeForAlignment = MemoryConstants::pageSize2M;
const size_t alignedSize = alignUp<size_t>(size, pageSizeForAlignment);
AllocationProperties gpuProperties{rootDeviceIndex,
true,
alignedSize,
AllocationType::unifiedSharedMemory,
false,
false,
deviceBitfield};
gpuProperties.alignment = alignUpNonZero<size_t>(unifiedMemoryProperties.alignment, pageSizeForAlignment);
gpuProperties.flags.resource48Bit = unifiedMemoryProperties.allocationFlags.flags.resource48Bit;
auto cacheRegion = MemoryPropertiesHelper::getCacheRegion(unifiedMemoryProperties.allocationFlags);
MemoryPropertiesHelper::fillCachePolicyInProperties(gpuProperties, false, svmProperties.readOnly, false, cacheRegion);
auto initialPlacement = MemoryPropertiesHelper::getUSMInitialPlacement(unifiedMemoryProperties.allocationFlags);
MemoryPropertiesHelper::setUSMInitialPlacement(gpuProperties, initialPlacement);
GraphicsAllocation *allocationGpu = memoryManager->allocateGraphicsMemoryWithProperties(gpuProperties);
if (!allocationGpu) {
return nullptr;
}
setUnifiedAllocationProperties(allocationGpu, svmProperties);
SvmAllocationData allocData(rootDeviceIndex);
allocData.gpuAllocations.addAllocation(allocationGpu);
allocData.cpuAllocation = nullptr;
allocData.device = unifiedMemoryProperties.device;
allocData.size = size;
allocData.pageSizeForAlignment = pageSizeForAlignment;
allocData.setAllocId(++this->allocationsCounter);
auto retPtr = allocationGpu->getUnderlyingBuffer();
insertSVMAlloc(retPtr, allocData);
return retPtr;
}
void SVMAllocsManager::setUnifiedAllocationProperties(GraphicsAllocation *allocation, const SvmAllocationProperties &svmProperties) {
allocation->setMemObjectsAllocationWithWritableFlags(!svmProperties.readOnly && !svmProperties.hostPtrReadOnly);
allocation->setCoherent(svmProperties.coherent);
}
void SVMAllocsManager::reinsertToAllocsForIndirectAccess(SvmAllocationData &svmData) {
std::unique_lock<std::mutex> lockForIndirect(mtxForIndirectAccess);
ContainerReadWriteLockType lock(mtx);
for (auto alloc : svmData.gpuAllocations.getGraphicsAllocations()) {
OPTIONAL_UNRECOVERABLE_IF(nullptr == alloc);
internalAllocationsMap.insert({svmData.getAllocId(), alloc});
}
}
void SVMAllocsManager::removeFromAllocsForIndirectAccess(SvmAllocationData &svmData) {
std::unique_lock<std::mutex> lockForIndirect(mtxForIndirectAccess);
ContainerReadWriteLockType lock(mtx);
internalAllocationsMap.erase(svmData.getAllocId());
}
void SVMAllocsManager::insertSVMAlloc(const SvmAllocationData &svmAllocData) {
insertSVMAlloc(reinterpret_cast<void *>(svmAllocData.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()), svmAllocData);
}
void SVMAllocsManager::removeSVMAlloc(const SvmAllocationData &svmAllocData) {
ContainerReadWriteLockType lock(mtx);
internalAllocationsMap.erase(svmAllocData.getAllocId());
svmAllocs.remove(reinterpret_cast<void *>(svmAllocData.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()));
}
bool SVMAllocsManager::freeSVMAlloc(void *ptr, bool blocking) {
if (svmDeferFreeAllocs.allocations.size() > 0) {
this->freeSVMAllocDeferImpl();
}
SvmAllocationData *svmData = getSVMAlloc(ptr);
if (svmData) {
auto completionCheckPolicy = blocking ? SvmAllocationCache::CompletionCheckPolicy::waitOnFree : SvmAllocationCache::CompletionCheckPolicy::notRequired;
if (InternalMemoryType::deviceUnifiedMemory == svmData->memoryType &&
this->usmDeviceAllocationsCache) {
if (this->usmDeviceAllocationsCache->insert(svmData->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBufferSize(), ptr, svmData, completionCheckPolicy)) {
return true;
}
}
if (InternalMemoryType::hostUnifiedMemory == svmData->memoryType &&
this->usmHostAllocationsCache) {
if (this->usmHostAllocationsCache->insert(svmData->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBufferSize(), ptr, svmData, completionCheckPolicy)) {
return true;
}
}
if (blocking) {
this->freeSVMAllocImpl(ptr, FreePolicyType::blocking, svmData);
} else {
this->freeSVMAllocImpl(ptr, FreePolicyType::none, svmData);
}
return true;
}
return false;
}
bool SVMAllocsManager::freeSVMAllocDefer(void *ptr) {
if (svmDeferFreeAllocs.allocations.size() > 0) {
this->freeSVMAllocDeferImpl();
}
SvmAllocationData *svmData = getSVMAlloc(ptr);
if (svmData) {
constexpr auto completionCheckPolicy = SvmAllocationCache::CompletionCheckPolicy::deferred;
if (InternalMemoryType::deviceUnifiedMemory == svmData->memoryType &&
this->usmDeviceAllocationsCache) {
if (this->usmDeviceAllocationsCache->insert(svmData->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBufferSize(), ptr, svmData, completionCheckPolicy)) {
return true;
}
}
if (InternalMemoryType::hostUnifiedMemory == svmData->memoryType &&
this->usmHostAllocationsCache) {
if (this->usmHostAllocationsCache->insert(svmData->gpuAllocations.getDefaultGraphicsAllocation()->getUnderlyingBufferSize(), ptr, svmData, completionCheckPolicy)) {
return true;
}
}
this->freeSVMAllocImpl(ptr, FreePolicyType::defer, svmData);
return true;
}
return false;
}
void SVMAllocsManager::waitForEnginesCompletion(SvmAllocationData *allocationData) {
if (allocationData->cpuAllocation) {
this->memoryManager->waitForEnginesCompletion(*allocationData->cpuAllocation);
}
for (auto &gpuAllocation : allocationData->gpuAllocations.getGraphicsAllocations()) {
if (gpuAllocation) {
this->memoryManager->waitForEnginesCompletion(*gpuAllocation);
}
}
}
void SVMAllocsManager::freeSVMAllocImpl(void *ptr, FreePolicyType policy, SvmAllocationData *svmData) {
auto allowNonBlockingFree = policy == FreePolicyType::none;
this->prepareIndirectAllocationForDestruction(svmData, allowNonBlockingFree);
if (policy == FreePolicyType::blocking) {
this->waitForEnginesCompletion(svmData);
} else if (policy == FreePolicyType::defer) {
if (svmData->cpuAllocation) {
if (this->memoryManager->allocInUse(*svmData->cpuAllocation)) {
std::lock_guard<ContainerMutexType> lock(mtx);
if (svmDeferFreeAllocs.get(ptr) == nullptr) {
this->svmDeferFreeAllocs.insert(*svmData);
}
return;
}
}
for (auto &gpuAllocation : svmData->gpuAllocations.getGraphicsAllocations()) {
if (gpuAllocation) {
if (this->memoryManager->allocInUse(*gpuAllocation)) {
std::lock_guard<ContainerMutexType> lock(mtx);
if (svmDeferFreeAllocs.get(ptr) == nullptr) {
this->svmDeferFreeAllocs.insert(*svmData);
}
return;
}
}
}
}
auto pageFaultManager = this->memoryManager->getPageFaultManager();
if (svmData->cpuAllocation && pageFaultManager) {
pageFaultManager->removeAllocation(svmData->cpuAllocation->getUnderlyingBuffer());
}
if (svmData->gpuAllocations.getAllocationType() == AllocationType::svmZeroCopy) {
freeZeroCopySvmAllocation(svmData);
} else {
freeSvmAllocationWithDeviceStorage(svmData);
}
}
void SVMAllocsManager::freeSVMAllocDeferImpl(FreePolicyType policy) {
std::vector<void *> freedPtr;
for (auto iter = svmDeferFreeAllocs.allocations.begin(); iter != svmDeferFreeAllocs.allocations.end(); ++iter) {
void *ptr = reinterpret_cast<void *>(iter->second.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress());
this->freeSVMAllocImpl(ptr, policy, this->getSVMAlloc(ptr));
if (this->getSVMAlloc(ptr) == nullptr) {
freedPtr.push_back(ptr);
}
}
for (uint32_t i = 0; i < freedPtr.size(); ++i) {
svmDeferFreeAllocs.allocations.erase(freedPtr[i]);
}
}
void SVMAllocsManager::cleanupUSMAllocCaches() {
if (this->usmDeviceAllocationsCache) {
this->usmDeviceAllocationsCache->cleanup();
this->usmDeviceAllocationsCache.reset(nullptr);
}
if (this->usmHostAllocationsCache) {
this->usmHostAllocationsCache->cleanup();
this->usmHostAllocationsCache.reset(nullptr);
}
}
void SVMAllocsManager::trimUSMDeviceAllocCache() {
this->usmDeviceAllocationsCache->trim();
}
void SVMAllocsManager::trimUSMHostAllocCache() {
this->usmHostAllocationsCache->trim();
}
void *SVMAllocsManager::createZeroCopySvmAllocation(size_t size, const SvmAllocationProperties &svmProperties,
const RootDeviceIndicesContainer &rootDeviceIndices,
const std::map<uint32_t, DeviceBitfield> &subdeviceBitfields) {
auto rootDeviceIndex = *rootDeviceIndices.begin();
auto &deviceBitfield = subdeviceBitfields.at(rootDeviceIndex);
AllocationProperties properties{rootDeviceIndex,
true, // allocateMemory
size,
AllocationType::svmZeroCopy,
false, // isMultiStorageAllocation
deviceBitfield};
MemoryPropertiesHelper::fillCachePolicyInProperties(properties, false, svmProperties.readOnly, false, properties.cacheRegion);
RootDeviceIndicesContainer rootDeviceIndicesVector(rootDeviceIndices);
auto maxRootDeviceIndex = *std::max_element(rootDeviceIndices.begin(), rootDeviceIndices.end(), std::less<uint32_t const>());
SvmAllocationData allocData(maxRootDeviceIndex);
void *usmPtr = memoryManager->createMultiGraphicsAllocationInSystemMemoryPool(rootDeviceIndicesVector, properties, allocData.gpuAllocations);
if (!usmPtr) {
return nullptr;
}
for (const auto &rootDeviceIndex : rootDeviceIndices) {
auto allocation = allocData.gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
allocation->setMemObjectsAllocationWithWritableFlags(!svmProperties.readOnly && !svmProperties.hostPtrReadOnly);
allocation->setCoherent(svmProperties.coherent);
}
allocData.size = size;
allocData.setAllocId(++this->allocationsCounter);
insertSVMAlloc(usmPtr, allocData);
return usmPtr;
}
void *SVMAllocsManager::createUnifiedAllocationWithDeviceStorage(size_t size, const SvmAllocationProperties &svmProperties, const UnifiedMemoryProperties &unifiedMemoryProperties) {
memoryManager->initPageFaultManager();
auto rootDeviceIndex = unifiedMemoryProperties.getRootDeviceIndex();
auto externalPtr = reinterpret_cast<void *>(unifiedMemoryProperties.allocationFlags.hostptr);
bool useExternalHostPtrForCpu = externalPtr != nullptr;
const size_t svmCpuAlignment = memoryManager->peekExecutionEnvironment().rootDeviceEnvironments[rootDeviceIndex]->getProductHelper().getSvmCpuAlignment();
auto minCpuAlignment = (debugManager.flags.AlignLocalMemoryVaTo2MB.get() == 1) ? MemoryConstants::pageSize2M : MemoryConstants::pageSize64k;
const size_t effectiveSvmCpuAlignment = std::max(minCpuAlignment, svmCpuAlignment);
const size_t alignment = alignUpNonZero<size_t>(unifiedMemoryProperties.alignment, effectiveSvmCpuAlignment);
const size_t alignedCpuSize = alignUp<size_t>(size, alignment);
DeviceBitfield subDevices = unifiedMemoryProperties.subdeviceBitfields.at(rootDeviceIndex);
AllocationProperties cpuProperties{rootDeviceIndex,
!useExternalHostPtrForCpu, // allocateMemory
alignedCpuSize, AllocationType::svmCpu,
false, // isMultiStorageAllocation
subDevices};
cpuProperties.alignment = alignment;
cpuProperties.flags.isUSMHostAllocation = useExternalHostPtrForCpu;
cpuProperties.forceKMDAllocation = true;
cpuProperties.makeGPUVaDifferentThanCPUPtr = true;
auto cacheRegion = MemoryPropertiesHelper::getCacheRegion(unifiedMemoryProperties.allocationFlags);
MemoryPropertiesHelper::fillCachePolicyInProperties(cpuProperties, false, svmProperties.readOnly, false, cacheRegion);
GraphicsAllocation *allocationCpu = memoryManager->allocateGraphicsMemoryWithProperties(cpuProperties, externalPtr);
if (!allocationCpu) {
return nullptr;
}
setUnifiedAllocationProperties(allocationCpu, svmProperties);
void *svmPtr = allocationCpu->getUnderlyingBuffer();
UNRECOVERABLE_IF(useExternalHostPtrForCpu && (externalPtr != svmPtr));
const size_t alignedGpuSize = alignUp<size_t>(size, MemoryConstants::pageSize64k);
AllocationProperties gpuProperties{rootDeviceIndex,
false,
alignedGpuSize,
AllocationType::svmGpu,
false,
subDevices.count() > 1,
subDevices};
gpuProperties.alignment = alignment;
auto compressionSupported = false;
if (unifiedMemoryProperties.device) {
auto &gfxCoreHelper = unifiedMemoryProperties.device->getGfxCoreHelper();
auto &hwInfo = unifiedMemoryProperties.device->getHardwareInfo();
compressionSupported = gfxCoreHelper.usmCompressionSupported(hwInfo);
compressionSupported &= memoryManager->isCompressionSupportedForShareable(unifiedMemoryProperties.allocationFlags.flags.shareable);
}
gpuProperties.flags.preferCompressed = compressionSupported;
MemoryPropertiesHelper::fillCachePolicyInProperties(gpuProperties, false, svmProperties.readOnly, false, cacheRegion);
GraphicsAllocation *allocationGpu = memoryManager->allocateGraphicsMemoryWithProperties(gpuProperties, svmPtr);
if (!allocationGpu) {
memoryManager->freeGraphicsMemory(allocationCpu);
return nullptr;
}
setUnifiedAllocationProperties(allocationGpu, svmProperties);
SvmAllocationData allocData(rootDeviceIndex);
allocData.gpuAllocations.addAllocation(allocationGpu);
allocData.cpuAllocation = allocationCpu;
allocData.device = unifiedMemoryProperties.device;
allocData.pageSizeForAlignment = effectiveSvmCpuAlignment;
allocData.size = size;
allocData.setAllocId(++this->allocationsCounter);
insertSVMAlloc(svmPtr, allocData);
return svmPtr;
}
void SVMAllocsManager::freeSVMData(SvmAllocationData *svmData) {
std::unique_lock<std::mutex> lockForIndirect(mtxForIndirectAccess);
ContainerReadWriteLockType lock(mtx);
internalAllocationsMap.erase(svmData->getAllocId());
svmAllocs.remove(reinterpret_cast<void *>(svmData->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()));
}
void SVMAllocsManager::freeZeroCopySvmAllocation(SvmAllocationData *svmData) {
auto gpuAllocations = svmData->gpuAllocations;
freeSVMData(svmData);
for (const auto &graphicsAllocation : gpuAllocations.getGraphicsAllocations()) {
memoryManager->freeGraphicsMemory(graphicsAllocation);
}
}
void SVMAllocsManager::initUsmDeviceAllocationsCache(Device &device) {
this->usmDeviceAllocationsCache.reset(new SvmAllocationCache);
this->usmDeviceAllocationsCache->allocations.reserve(128u);
this->usmDeviceAllocationsCache->svmAllocsManager = this;
this->usmDeviceAllocationsCache->memoryManager = memoryManager;
if (auto usmReuseCleaner = device.getExecutionEnvironment()->unifiedMemoryReuseCleaner.get()) {
usmReuseCleaner->registerSvmAllocationCache(this->usmDeviceAllocationsCache.get());
}
}
void SVMAllocsManager::initUsmHostAllocationsCache() {
this->usmHostAllocationsCache.reset(new SvmAllocationCache);
this->usmHostAllocationsCache->allocations.reserve(128u);
this->usmHostAllocationsCache->svmAllocsManager = this;
this->usmHostAllocationsCache->memoryManager = memoryManager;
if (auto usmReuseCleaner = this->memoryManager->peekExecutionEnvironment().unifiedMemoryReuseCleaner.get()) {
usmReuseCleaner->registerSvmAllocationCache(this->usmHostAllocationsCache.get());
}
}
void SVMAllocsManager::initUsmAllocationsCaches(Device &device) {
const bool debuggerEnabled = nullptr != device.getDebugger();
const bool isHwModeSelected = NEO::DeviceFactory::isHwModeSelected();
bool usmDeviceAllocationsCacheEnabled = NEO::ApiSpecificConfig::isDeviceAllocationCacheEnabled() &&
device.getProductHelper().isDeviceUsmAllocationReuseSupported() &&
!debuggerEnabled &&
isHwModeSelected;
if (debugManager.flags.ExperimentalEnableDeviceAllocationCache.get() != -1) {
usmDeviceAllocationsCacheEnabled = !!debugManager.flags.ExperimentalEnableDeviceAllocationCache.get();
}
if (usmDeviceAllocationsCacheEnabled && device.usmReuseInfo.getMaxAllocationsSavedForReuseSize() > 0u) {
device.getExecutionEnvironment()->initializeUnifiedMemoryReuseCleaner(device.isAnyDirectSubmissionLightEnabled());
this->initUsmDeviceAllocationsCache(device);
if (debugManager.flags.SetCommandStreamReceiver.get() > 0) {
this->usmDeviceAllocationsCache->requireUpdatingAllocsForIndirectAccess = true;
}
}
bool usmHostAllocationsCacheEnabled = NEO::ApiSpecificConfig::isHostAllocationCacheEnabled() &&
device.getProductHelper().isHostUsmAllocationReuseSupported() &&
!debuggerEnabled &&
isHwModeSelected;
if (debugManager.flags.ExperimentalEnableHostAllocationCache.get() != -1) {
usmHostAllocationsCacheEnabled = !!debugManager.flags.ExperimentalEnableHostAllocationCache.get();
}
if (usmHostAllocationsCacheEnabled && this->memoryManager->usmReuseInfo.getMaxAllocationsSavedForReuseSize() > 0u) {
device.getExecutionEnvironment()->initializeUnifiedMemoryReuseCleaner(device.isAnyDirectSubmissionLightEnabled());
this->initUsmHostAllocationsCache();
if (debugManager.flags.SetCommandStreamReceiver.get() > 0) {
this->usmHostAllocationsCache->requireUpdatingAllocsForIndirectAccess = true;
}
}
}
void SVMAllocsManager::freeSvmAllocationWithDeviceStorage(SvmAllocationData *svmData) {
auto graphicsAllocations = svmData->gpuAllocations.getGraphicsAllocations();
GraphicsAllocation *cpuAllocation = svmData->cpuAllocation;
bool isImportedAllocation = svmData->isImportedAllocation;
freeSVMData(svmData);
for (auto gpuAllocation : graphicsAllocations) {
memoryManager->freeGraphicsMemory(gpuAllocation, isImportedAllocation);
}
memoryManager->freeGraphicsMemory(cpuAllocation, isImportedAllocation);
}
bool SVMAllocsManager::hasHostAllocations() {
ContainerReadLockType lock(mtx);
for (auto &allocation : this->svmAllocs.allocations) {
if (allocation.second->memoryType == InternalMemoryType::hostUnifiedMemory) {
return true;
}
}
return false;
}
void SVMAllocsManager::makeIndirectAllocationsResident(CommandStreamReceiver &commandStreamReceiver, TaskCountType taskCount) {
ContainerReadWriteLockType lock(mtx);
bool parseAllAllocations = false;
auto entry = indirectAllocationsResidency.find(&commandStreamReceiver);
TaskCountType previousCounter = 0;
if (entry == indirectAllocationsResidency.end()) {
parseAllAllocations = true;
InternalAllocationsTracker tracker = {};
tracker.latestResidentObjectId = this->allocationsCounter;
tracker.latestSentTaskCount = taskCount;
this->indirectAllocationsResidency.insert(std::make_pair(&commandStreamReceiver, tracker));
} else {
if (this->allocationsCounter > entry->second.latestResidentObjectId) {
parseAllAllocations = true;
previousCounter = entry->second.latestResidentObjectId;
entry->second.latestResidentObjectId = this->allocationsCounter;
}
entry->second.latestSentTaskCount = taskCount;
}
if (parseAllAllocations) {
auto currentCounter = this->allocationsCounter.load();
for (auto allocationId = static_cast<uint32_t>(previousCounter + 1); allocationId <= currentCounter; allocationId++) {
makeResidentForAllocationsWithId(allocationId, commandStreamReceiver);
}
}
}
void SVMAllocsManager::prepareIndirectAllocationForDestruction(SvmAllocationData *allocationData, bool isNonBlockingFree) {
ContainerReadWriteLockType lock(mtx);
if (this->indirectAllocationsResidency.size() > 0u) {
for (auto &internalAllocationsHandling : this->indirectAllocationsResidency) {
auto commandStreamReceiver = internalAllocationsHandling.first;
auto gpuAllocation = allocationData->gpuAllocations.getGraphicsAllocation(commandStreamReceiver->getRootDeviceIndex());
if (gpuAllocation == nullptr) {
continue;
}
// If this is a non-blocking free, wait only for the latest known usage of this allocation.
// However, if this is a blocking free, we must select the "safest" task count to wait for.
TaskCountType desiredTaskCount = std::max(internalAllocationsHandling.second.latestSentTaskCount, gpuAllocation->getTaskCount(commandStreamReceiver->getOsContext().getContextId()));
if (isNonBlockingFree) {
desiredTaskCount = gpuAllocation->getTaskCount(commandStreamReceiver->getOsContext().getContextId());
}
if (gpuAllocation->isAlwaysResident(commandStreamReceiver->getOsContext().getContextId())) {
gpuAllocation->updateResidencyTaskCount(GraphicsAllocation::objectNotResident, commandStreamReceiver->getOsContext().getContextId());
gpuAllocation->updateResidencyTaskCount(desiredTaskCount, commandStreamReceiver->getOsContext().getContextId());
gpuAllocation->updateTaskCount(desiredTaskCount, commandStreamReceiver->getOsContext().getContextId());
}
}
}
}
SvmMapOperation *SVMAllocsManager::getSvmMapOperation(const void *ptr) {
ContainerReadLockType lock(mtx);
return svmMapOperations.get(ptr);
}
void SVMAllocsManager::insertSvmMapOperation(void *regionSvmPtr, size_t regionSize, void *baseSvmPtr, size_t offset, bool readOnlyMap) {
SvmMapOperation svmMapOperation;
svmMapOperation.regionSvmPtr = regionSvmPtr;
svmMapOperation.baseSvmPtr = baseSvmPtr;
svmMapOperation.offset = offset;
svmMapOperation.regionSize = regionSize;
svmMapOperation.readOnlyMap = readOnlyMap;
ContainerReadWriteLockType lock(mtx);
svmMapOperations.insert(svmMapOperation);
}
void SVMAllocsManager::removeSvmMapOperation(const void *regionSvmPtr) {
ContainerReadWriteLockType lock(mtx);
svmMapOperations.remove(regionSvmPtr);
}
AllocationType SVMAllocsManager::getGraphicsAllocationTypeAndCompressionPreference(const UnifiedMemoryProperties &unifiedMemoryProperties, bool &compressionEnabled) const {
compressionEnabled = false;
AllocationType allocationType = AllocationType::bufferHostMemory;
if (unifiedMemoryProperties.memoryType == InternalMemoryType::deviceUnifiedMemory) {
if (unifiedMemoryProperties.allocationFlags.allocFlags.allocWriteCombined) {
allocationType = AllocationType::writeCombined;
} else {
UNRECOVERABLE_IF(nullptr == unifiedMemoryProperties.device);
auto &gfxCoreHelper = unifiedMemoryProperties.device->getGfxCoreHelper();
auto &hwInfo = unifiedMemoryProperties.device->getHardwareInfo();
if (CompressionSelector::allowStatelessCompression() || gfxCoreHelper.usmCompressionSupported(hwInfo)) {
compressionEnabled = true;
}
if (unifiedMemoryProperties.requestedAllocationType != AllocationType::unknown) {
allocationType = unifiedMemoryProperties.requestedAllocationType;
} else {
allocationType = AllocationType::buffer;
}
}
}
return allocationType;
}
static uint32_t getSubDeviceId(Device &device) {
if (!device.isSubDevice()) {
uint32_t deviceBitField = static_cast<uint32_t>(device.getDeviceBitfield().to_ulong());
if (device.getDeviceBitfield().count() > 1) {
deviceBitField &= ~deviceBitField + 1;
}
return Math::log2(deviceBitField);
}
return static_cast<NEO::SubDevice *>(&device)->getSubDeviceIndex();
};
static NEO::SubDeviceIdsVec getSubDeviceIds(CommandStreamReceiver &csr) {
SubDeviceIdsVec subDeviceIds;
for (auto subDeviceId = 0u; subDeviceId < csr.getOsContext().getDeviceBitfield().size(); subDeviceId++) {
if (csr.getOsContext().getDeviceBitfield().test(subDeviceId)) {
subDeviceIds.push_back(subDeviceId);
}
}
return subDeviceIds;
};
void SVMAllocsManager::sharedSystemMemAdvise(Device &device, MemAdvise memAdviseOp, const void *ptr, const size_t size) {
// All vm_ids on a single device for shared system USM allocation
auto subDeviceIds = NEO::SubDevice::getSubDeviceIdsFromDevice(device);
memoryManager->setSharedSystemMemAdvise(ptr, size, memAdviseOp, subDeviceIds, device.getRootDeviceIndex());
}
void SVMAllocsManager::prefetchMemory(Device &device, CommandStreamReceiver &commandStreamReceiver, const void *ptr, const size_t size) {
auto svmData = getSVMAlloc(ptr);
if (!svmData) {
if (device.areSharedSystemAllocationsAllowed()) {
// Single vm_id for shared system USM allocation
auto subDeviceIds = SubDeviceIdsVec{getSubDeviceId(device)};
memoryManager->prefetchSharedSystemAlloc(ptr, size, subDeviceIds, device.getRootDeviceIndex());
}
return;
}
if ((memoryManager->isKmdMigrationAvailable(device.getRootDeviceIndex()) &&
(svmData->memoryType == InternalMemoryType::sharedUnifiedMemory))) {
auto gfxAllocation = svmData->gpuAllocations.getGraphicsAllocation(device.getRootDeviceIndex());
auto subDeviceIds = commandStreamReceiver.getActivePartitions() > 1 ? getSubDeviceIds(commandStreamReceiver) : SubDeviceIdsVec{getSubDeviceId(device)};
memoryManager->setMemPrefetch(gfxAllocation, subDeviceIds, device.getRootDeviceIndex());
}
}
void SVMAllocsManager::prefetchSVMAllocs(Device &device, CommandStreamReceiver &commandStreamReceiver) {
ContainerReadLockType lock(mtx);
auto subDeviceIds = commandStreamReceiver.getActivePartitions() > 1 ? getSubDeviceIds(commandStreamReceiver) : SubDeviceIdsVec{getSubDeviceId(device)};
if (memoryManager->isKmdMigrationAvailable(device.getRootDeviceIndex())) {
for (auto &allocation : this->svmAllocs.allocations) {
NEO::SvmAllocationData svmData = *allocation.second;
if (svmData.memoryType == InternalMemoryType::sharedUnifiedMemory) {
auto gfxAllocation = svmData.gpuAllocations.getGraphicsAllocation(device.getRootDeviceIndex());
memoryManager->setMemPrefetch(gfxAllocation, subDeviceIds, device.getRootDeviceIndex());
}
}
}
}
void SVMAllocsManager::sharedSystemAtomicAccess(Device &device, AtomicAccessMode mode, const void *ptr, const size_t size) {
// All vm_ids on a single device for shared system USM allocation
auto subDeviceIds = NEO::SubDevice::getSubDeviceIdsFromDevice(device);
memoryManager->setSharedSystemAtomicAccess(ptr, size, mode, subDeviceIds, device.getRootDeviceIndex());
}
AtomicAccessMode SVMAllocsManager::getSharedSystemAtomicAccess(Device &device, const void *ptr, const size_t size) {
// All vm_ids on a single device for shared system USM allocation
auto subDeviceIds = NEO::SubDevice::getSubDeviceIdsFromDevice(device);
return memoryManager->getSharedSystemAtomicAccess(ptr, size, subDeviceIds, device.getRootDeviceIndex());
}
std::unique_lock<std::mutex> SVMAllocsManager::obtainOwnership() {
return std::unique_lock<std::mutex>(mtxForIndirectAccess);
}
SVMAllocsManager::ContainerReadLockTypeRAIIHelper SVMAllocsManager::obtainReadContainerLock() {
return ContainerReadLockTypeRAIIHelper(*this);
}
void SVMAllocsManager::insertSVMAlloc(void *svmPtr, const SvmAllocationData &allocData) {
ContainerReadWriteLockType lock(mtx);
this->svmAllocs.insert(svmPtr, allocData);
UNRECOVERABLE_IF(internalAllocationsMap.count(allocData.getAllocId()) > 0);
for (auto alloc : allocData.gpuAllocations.getGraphicsAllocations()) {
if (alloc != nullptr) {
internalAllocationsMap.insert({allocData.getAllocId(), alloc});
}
}
}
/**
* @brief This method calls makeResident for the allocation with the given allocId.
* Since a single allocation id might be shared by different allocations in a multi-GPU scenario,
* this method iterates over all of these allocations and selects the correct one based on the device index.
*
* @param[in] allocationId id of the allocation which should be resident
* @param[in] csr command stream receiver which will make allocation resident
*/
void SVMAllocsManager::makeResidentForAllocationsWithId(uint32_t allocationId, CommandStreamReceiver &csr) {
for (auto [iter, rangeEnd] = internalAllocationsMap.equal_range(allocationId); iter != rangeEnd; ++iter) {
auto gpuAllocation = iter->second;
if (gpuAllocation->getRootDeviceIndex() != csr.getRootDeviceIndex()) {
continue;
}
csr.makeResident(*gpuAllocation);
gpuAllocation->updateResidencyTaskCount(GraphicsAllocation::objectAlwaysResident, csr.getOsContext().getContextId());
gpuAllocation->setEvictable(false);
}
}
bool SVMAllocsManager::submitIndirectAllocationsAsPack(CommandStreamReceiver &csr) {
auto submitAsPack = memoryManager->allowIndirectAllocationsAsPack(csr.getRootDeviceIndex());
if (debugManager.flags.MakeIndirectAllocationsResidentAsPack.get() != -1) {
submitAsPack = !!NEO::debugManager.flags.MakeIndirectAllocationsResidentAsPack.get();
}
if (submitAsPack) {
makeIndirectAllocationsResident(csr, csr.peekTaskCount() + 1u);
}
return submitAsPack;
}
} // namespace NEO