From ac8c00048e42d9c9f9b120afd6ebd6dc64c3d273 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 23 Nov 2023 09:01:31 +0000 Subject: [PATCH] performance: optimize svm allocation tracking Signed-off-by: Lukasz Jobczyk --- .../fixtures/memory_ipc_fixture.cpp | 27 +++++--- opencl/source/kernel/kernel.cpp | 2 +- .../memory_manager/unified_memory_manager.cpp | 65 +++++++++++++++--- .../memory_manager/unified_memory_manager.h | 18 ++++- .../unified_memory_manager_cache_tests.cpp | 68 ++++++++++++++++++- 5 files changed, 158 insertions(+), 22 deletions(-) diff --git a/level_zero/core/test/unit_tests/fixtures/memory_ipc_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/memory_ipc_fixture.cpp index 6d6fe813f2..a6effe429b 100644 --- a/level_zero/core/test/unit_tests/fixtures/memory_ipc_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/memory_ipc_fixture.cpp @@ -313,7 +313,8 @@ NEO::GraphicsAllocation *MemoryManagerOpenIpcMock::allocateGraphicsMemoryWithPro } NEO::GraphicsAllocation *MemoryManagerOpenIpcMock::allocateGraphicsMemoryWithProperties(const AllocationProperties &properties, const void *externalPtr) { - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(properties.rootDeviceIndex, @@ -332,7 +333,8 @@ NEO::GraphicsAllocation *MemoryManagerOpenIpcMock::createGraphicsAllocationFromS if (failOnCreateGraphicsAllocationFromSharedHandle) { return nullptr; } - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(properties.rootDeviceIndex, @@ -350,7 +352,8 @@ NEO::GraphicsAllocation *MemoryManagerOpenIpcMock::createGraphicsAllocationFromM if (failOnCreateGraphicsAllocationFromSharedHandle) { return nullptr; } - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(properties.rootDeviceIndex, @@ -365,7 +368,8 @@ NEO::GraphicsAllocation *MemoryManagerOpenIpcMock::createGraphicsAllocationFromM return alloc; } NEO::GraphicsAllocation *MemoryManagerOpenIpcMock::createGraphicsAllocationFromNTHandle(void *handle, uint32_t rootDeviceIndex, AllocationType allocType) { - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += 0x1000; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(0u, @@ -424,7 +428,8 @@ void MemoryOpenIpcHandleTest::TearDown() { } NEO::GraphicsAllocation *MemoryManagerIpcImplicitScalingMock::allocateGraphicsMemoryInPreferredPool(const AllocationProperties &properties, const void *hostPtr) { - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(0u, @@ -440,7 +445,8 @@ NEO::GraphicsAllocation *MemoryManagerIpcImplicitScalingMock::allocateGraphicsMe } NEO::GraphicsAllocation *MemoryManagerIpcImplicitScalingMock::allocateGraphicsMemoryWithProperties(const AllocationProperties &properties) { - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(0u, @@ -459,7 +465,8 @@ NEO::GraphicsAllocation *MemoryManagerIpcImplicitScalingMock::createGraphicsAllo if (failOnCreateGraphicsAllocationFromSharedHandle) { return nullptr; } - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(0u, @@ -479,7 +486,8 @@ NEO::GraphicsAllocation *MemoryManagerIpcImplicitScalingMock::createGraphicsAllo return nullptr; } - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += 0x1000; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(0u, @@ -497,7 +505,8 @@ NEO::GraphicsAllocation *MemoryManagerIpcImplicitScalingMock::createGraphicsAllo if (failOnCreateGraphicsAllocationFromSharedHandle) { return nullptr; } - auto ptr = reinterpret_cast(sharedHandleAddress++); + auto ptr = reinterpret_cast(sharedHandleAddress); + sharedHandleAddress += properties.size; auto gmmHelper = getGmmHelper(0); auto canonizedGpuAddress = gmmHelper->canonize(castToUint64(ptr)); auto alloc = new IpcImplicitScalingMockGraphicsAllocation(0u, diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 742e9401f3..4ee834cf3e 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1965,7 +1965,7 @@ std::unique_ptr Kernel::fillWithKernelObjsForAuxTra } if (getContext().getSVMAllocsManager()) { for (auto &allocation : getContext().getSVMAllocsManager()->getSVMAllocs()->allocations) { - auto gfxAllocation = allocation.second.gpuAllocations.getDefaultGraphicsAllocation(); + auto gfxAllocation = allocation.second->gpuAllocations.getDefaultGraphicsAllocation(); if (gfxAllocation->isCompressionEnabled()) { kernelObjsForAuxTranslation->insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation}); auto &context = this->program->getContext(); diff --git a/shared/source/memory_manager/unified_memory_manager.cpp b/shared/source/memory_manager/unified_memory_manager.cpp index 9da7a53484..2e1e1cc428 100644 --- a/shared/source/memory_manager/unified_memory_manager.cpp +++ b/shared/source/memory_manager/unified_memory_manager.cpp @@ -42,6 +42,25 @@ void SVMAllocsManager::MapBasedAllocationTracker::remove(const SvmAllocationData allocations.erase(iter); } +void SVMAllocsManager::SortedVectorBasedAllocationTracker::insert(const SvmAllocationData &allocationsPair) { + allocations.push_back(std::make_pair(reinterpret_cast(allocationsPair.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()), std::make_unique(allocationsPair))); + for (size_t i = allocations.size() - 1; i > 0; --i) { + if (allocations[i].first < allocations[i - 1].first) { + std::iter_swap(allocations.begin() + i, allocations.begin() + i - 1); + } else { + break; + } + } +} + +void SVMAllocsManager::SortedVectorBasedAllocationTracker::remove(const SvmAllocationData &allocationsPair) { + auto gpuAddress = reinterpret_cast(allocationsPair.gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()); + auto removeIt = std::remove_if(allocations.begin(), allocations.end(), [&gpuAddress](const auto &other) { + return gpuAddress == other.first; + }); + allocations.erase(removeIt); +} + void SVMAllocsManager::SvmAllocationCache::insert(size_t size, void *ptr) { std::lock_guard lock(this->mtx); allocations.emplace(std::lower_bound(allocations.begin(), allocations.end(), size), size, ptr); @@ -113,6 +132,34 @@ SvmAllocationData *SVMAllocsManager::MapBasedAllocationTracker::get(const void * return nullptr; } +SvmAllocationData *SVMAllocsManager::SortedVectorBasedAllocationTracker::get(const void *ptr) { + if (allocations.size() == 0) { + return nullptr; + } + if (!ptr) { + return nullptr; + } + + int begin = 0; + int end = static_cast(allocations.size() - 1); + while (end >= begin) { + int currentPos = (begin + end) / 2; + const auto &allocation = allocations[currentPos]; + if (allocation.first == ptr || (allocation.first < ptr && + (reinterpret_cast(ptr) < (reinterpret_cast(allocation.first) + allocation.second->size)))) { + return allocation.second.get(); + } else if (ptr < allocation.first) { + end = currentPos - 1; + continue; + } else { + begin = currentPos + 1; + continue; + } + } + + return nullptr; +} + void SVMAllocsManager::MapOperationsTracker::insert(SvmMapOperation mapOperation) { operations.insert(std::make_pair(mapOperation.regionSvmPtr, mapOperation)); } @@ -137,16 +184,16 @@ void SVMAllocsManager::addInternalAllocationsToResidencyContainer(uint32_t rootD uint32_t requestedTypesMask) { std::shared_lock lock(mtx); for (auto &allocation : this->svmAllocs.allocations) { - if (rootDeviceIndex >= allocation.second.gpuAllocations.getGraphicsAllocations().size()) { + if (rootDeviceIndex >= allocation.second->gpuAllocations.getGraphicsAllocations().size()) { continue; } - if (!(allocation.second.memoryType & requestedTypesMask) || - (nullptr == allocation.second.gpuAllocations.getGraphicsAllocation(rootDeviceIndex))) { + if (!(allocation.second->memoryType & requestedTypesMask) || + (nullptr == allocation.second->gpuAllocations.getGraphicsAllocation(rootDeviceIndex))) { continue; } - auto alloc = allocation.second.gpuAllocations.getGraphicsAllocation(rootDeviceIndex); + auto alloc = allocation.second->gpuAllocations.getGraphicsAllocation(rootDeviceIndex); residencyContainer.push_back(alloc); } } @@ -154,8 +201,8 @@ void SVMAllocsManager::addInternalAllocationsToResidencyContainer(uint32_t rootD void SVMAllocsManager::makeInternalAllocationsResident(CommandStreamReceiver &commandStreamReceiver, uint32_t requestedTypesMask) { std::shared_lock lock(mtx); for (auto &allocation : this->svmAllocs.allocations) { - if (allocation.second.memoryType & requestedTypesMask) { - auto gpuAllocation = allocation.second.gpuAllocations.getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()); + if (allocation.second->memoryType & requestedTypesMask) { + auto gpuAllocation = allocation.second->gpuAllocations.getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()); if (gpuAllocation == nullptr) { continue; } @@ -665,7 +712,7 @@ void SVMAllocsManager::freeSvmAllocationWithDeviceStorage(SvmAllocationData *svm bool SVMAllocsManager::hasHostAllocations() { std::shared_lock lock(mtx); for (auto &allocation : this->svmAllocs.allocations) { - if (allocation.second.memoryType == InternalMemoryType::HOST_UNIFIED_MEMORY) { + if (allocation.second->memoryType == InternalMemoryType::HOST_UNIFIED_MEMORY) { return true; } } @@ -695,7 +742,7 @@ void SVMAllocsManager::makeIndirectAllocationsResident(CommandStreamReceiver &co } if (parseAllAllocations) { for (auto &allocation : this->svmAllocs.allocations) { - auto gpuAllocation = allocation.second.gpuAllocations.getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()); + auto gpuAllocation = allocation.second->gpuAllocations.getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()); if (gpuAllocation == nullptr) { continue; } @@ -812,7 +859,7 @@ void SVMAllocsManager::prefetchMemory(Device &device, CommandStreamReceiver &com void SVMAllocsManager::prefetchSVMAllocs(Device &device, CommandStreamReceiver &commandStreamReceiver) { std::shared_lock lock(mtx); for (auto &allocation : this->svmAllocs.allocations) { - NEO::SvmAllocationData allocData = allocation.second; + NEO::SvmAllocationData allocData = *allocation.second; this->prefetchMemory(device, commandStreamReceiver, allocData); } } diff --git a/shared/source/memory_manager/unified_memory_manager.h b/shared/source/memory_manager/unified_memory_manager.h index 4029efc25b..ae45b6913e 100644 --- a/shared/source/memory_manager/unified_memory_manager.h +++ b/shared/source/memory_manager/unified_memory_manager.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,19 @@ struct SvmMapOperation { class SVMAllocsManager { public: + class SortedVectorBasedAllocationTracker { + friend class SVMAllocsManager; + + public: + using SvmAllocationContainer = std::vector>>; + void insert(const SvmAllocationData &); + void remove(const SvmAllocationData &); + SvmAllocationData *get(const void *); + size_t getNumAllocs() const { return allocations.size(); }; + + SvmAllocationContainer allocations; + }; + class MapBasedAllocationTracker { friend class SVMAllocsManager; @@ -205,7 +219,7 @@ class SVMAllocsManager { void removeSVMAlloc(const SvmAllocationData &svmData); size_t getNumAllocs() const { return svmAllocs.getNumAllocs(); } MOCKABLE_VIRTUAL size_t getNumDeferFreeAllocs() const { return svmDeferFreeAllocs.getNumAllocs(); } - MapBasedAllocationTracker *getSVMAllocs() { return &svmAllocs; } + SortedVectorBasedAllocationTracker *getSVMAllocs() { return &svmAllocs; } MOCKABLE_VIRTUAL void insertSvmMapOperation(void *regionSvmPtr, size_t regionSize, void *baseSvmPtr, size_t offset, bool readOnlyMap); void removeSvmMapOperation(const void *regionSvmPtr); @@ -240,7 +254,7 @@ class SVMAllocsManager { void initUsmDeviceAllocationsCache(); void freeSVMData(SvmAllocationData *svmData); - MapBasedAllocationTracker svmAllocs; + SortedVectorBasedAllocationTracker svmAllocs; MapOperationsTracker svmMapOperations; MapBasedAllocationTracker svmDeferFreeAllocs; MemoryManager *memoryManager; diff --git a/shared/test/unit_test/memory_manager/unified_memory_manager_cache_tests.cpp b/shared/test/unit_test/memory_manager/unified_memory_manager_cache_tests.cpp index 6855676181..ce22e2fd10 100644 --- a/shared/test/unit_test/memory_manager/unified_memory_manager_cache_tests.cpp +++ b/shared/test/unit_test/memory_manager/unified_memory_manager_cache_tests.cpp @@ -17,6 +17,72 @@ using namespace NEO; +TEST(SortedVectorBasedAllocationTrackerTests, givenSortedVectorBasedAllocationTrackerWhenInsertRemoveAndGetThenStoreDataProperly) { + SvmAllocationData data(1u); + SVMAllocsManager::SortedVectorBasedAllocationTracker tracker; + + MockGraphicsAllocation graphicsAllocations[] = {{reinterpret_cast(0x1 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x2 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x3 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x4 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x5 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x6 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x7 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x8 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0x9 * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}, + {reinterpret_cast(0xA * MemoryConstants::pageSize64k), MemoryConstants::pageSize64k}}; + const auto graphicsAllocationsSize = sizeof(graphicsAllocations) / sizeof(MockGraphicsAllocation); + for (uint32_t i = graphicsAllocationsSize - 1; i >= graphicsAllocationsSize / 2; --i) { + data.gpuAllocations.addAllocation(&graphicsAllocations[i]); + data.device = reinterpret_cast(graphicsAllocations[i].getGpuAddress()); + tracker.insert(data); + } + for (uint32_t i = 0; i < graphicsAllocationsSize / 2; ++i) { + data.gpuAllocations.addAllocation(&graphicsAllocations[i]); + data.device = reinterpret_cast(graphicsAllocations[i].getGpuAddress()); + tracker.insert(data); + } + + EXPECT_EQ(tracker.getNumAllocs(), graphicsAllocationsSize); + for (uint64_t i = 0; i < graphicsAllocationsSize; ++i) { + EXPECT_EQ((i + 1) * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].first)); + EXPECT_EQ((i + 1) * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].second->device)); + } + + auto addr1 = reinterpret_cast(graphicsAllocations[7].getGpuAddress()); + auto data1 = tracker.get(addr1); + EXPECT_EQ(data1->device, addr1); + + MockGraphicsAllocation graphicsAlloc{reinterpret_cast(0x0), MemoryConstants::pageSize64k}; + data.gpuAllocations.addAllocation(&graphicsAlloc); + data.device = reinterpret_cast(graphicsAlloc.getGpuAddress()); + tracker.insert(data); + + EXPECT_EQ(tracker.getNumAllocs(), graphicsAllocationsSize + 1); + for (uint64_t i = 0; i < graphicsAllocationsSize + 1; ++i) { + EXPECT_EQ(i * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].first)); + EXPECT_EQ(i * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].second->device)); + } + EXPECT_EQ(data1->device, addr1); + + auto addr2 = reinterpret_cast(graphicsAllocations[1].getGpuAddress()); + auto data2 = tracker.get(addr2); + EXPECT_EQ(data1->device, addr1); + EXPECT_EQ(data2->device, addr2); + tracker.remove(*data2); + EXPECT_EQ(tracker.getNumAllocs(), graphicsAllocationsSize); + for (uint64_t i = 0; i < graphicsAllocationsSize; ++i) { + if (i < 2) { + EXPECT_EQ(i * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].first)); + EXPECT_EQ(i * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].second->device)); + } else { + EXPECT_EQ((i + 1) * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].first)); + EXPECT_EQ((i + 1) * MemoryConstants::pageSize64k, reinterpret_cast(tracker.allocations[static_cast(i)].second->device)); + } + } + EXPECT_EQ(data1->device, addr1); +} + TEST(SvmDeviceAllocationCacheTest, givenAllocationCacheDefaultWhenCheckingIfEnabledThenItIsDisabled) { std::unique_ptr deviceFactory(new UltDeviceFactory(1, 1)); RootDeviceIndicesContainer rootDeviceIndices = {mockRootDeviceIndex}; @@ -217,7 +283,7 @@ TEST(SvmDeviceAllocationCacheTest, givenAllocationsWithDifferentFlagsWhenAllocat auto svmManager = std::make_unique(rootDevice->getMemoryManager(), false); ASSERT_TRUE(svmManager->usmDeviceAllocationsCacheEnabled); - constexpr auto allocationSizeBasis = MemoryConstants::pageSize64k; + constexpr auto allocationSizeBasis = MemoryConstants::kiloByte; size_t defaultAllocSize = allocationSizeBasis << 0; std::map subDeviceBitfields = {{0u, {01}}, {1u, {10}}}; SvmDeviceAllocationCacheTestDataType