From 0bff9def3be208aa4182a04907025202346d4608 Mon Sep 17 00:00:00 2001 From: "Chodor, Jaroslaw" Date: Wed, 26 Feb 2025 14:32:25 +0000 Subject: [PATCH] feature: New allocators for opaque arrays Adds fast allocators for opaque objects of uniform size. Related-To: NEO-13406 Signed-off-by: Chodor, Jaroslaw --- shared/source/helpers/ptr_math.h | 11 +- shared/source/utilities/CMakeLists.txt | 3 +- shared/source/utilities/bitcontainers.h | 232 ++++++++++++++++ shared/source/utilities/stackvec.h | 19 +- .../test/unit_test/helpers/ptr_math_tests.cpp | 19 +- .../unit_test/utilities/containers_tests.cpp | 249 +++++++++++++++++- 6 files changed, 524 insertions(+), 9 deletions(-) create mode 100644 shared/source/utilities/bitcontainers.h diff --git a/shared/source/helpers/ptr_math.h b/shared/source/helpers/ptr_math.h index 2f88e6b1ac..31c2e0c20e 100644 --- a/shared/source/helpers/ptr_math.h +++ b/shared/source/helpers/ptr_math.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -44,6 +44,15 @@ inline uint64_t ptrDiff(uint64_t ptrAfter, T ptrBefore) { return ptrAfter - ptrBefore; } +template +constexpr bool byteRangeContains(T rangeBase, size_t rangeSizeInBytes, P ptr) noexcept { + const auto begin = reinterpret_cast(rangeBase); + const auto end = begin + rangeSizeInBytes; + const auto p = reinterpret_cast(ptr); + + return (p >= begin) && (p < end); +} + template inline void *addrToPtr(IntegerAddressType addr) { uintptr_t correctBitnessAddress = static_cast(addr); diff --git a/shared/source/utilities/CMakeLists.txt b/shared/source/utilities/CMakeLists.txt index 067bf2daf0..5d2f3f6d8c 100644 --- a/shared/source/utilities/CMakeLists.txt +++ b/shared/source/utilities/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2019-2024 Intel Corporation +# Copyright (C) 2019-2025 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -8,6 +8,7 @@ set(NEO_CORE_UTILITIES ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/api_intercept.h ${CMAKE_CURRENT_SOURCE_DIR}/arrayref.h + ${CMAKE_CURRENT_SOURCE_DIR}/bitcontainers.h ${CMAKE_CURRENT_SOURCE_DIR}/cpuintrinsics.h ${CMAKE_CURRENT_SOURCE_DIR}/const_stringref.h ${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.h diff --git a/shared/source/utilities/bitcontainers.h b/shared/source/utilities/bitcontainers.h new file mode 100644 index 0000000000..57ae790973 --- /dev/null +++ b/shared/source/utilities/bitcontainers.h @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/debug_helpers.h" +#include "shared/source/helpers/ptr_math.h" + +#include +#include +#include +#include + +// Variable-length bitarray with support for ffz (find first zero) +class BitArray final { + static inline constexpr size_t chunkSize = 64; + using Chunk = std::bitset; + static_assert(sizeof(Chunk) == sizeof(uint64_t)); + + std::vector data; + size_t arrayLength; + + public: + static inline constexpr int64_t npos = -1; + + BitArray(size_t length) : arrayLength(length) { + data.resize(alignUp(length, chunkSize) / chunkSize); + } + + Chunk::reference operator[](size_t pos) { + DEBUG_BREAK_IF(pos >= arrayLength); + return data[pos / chunkSize][pos % chunkSize]; + } + + int64_t ffz() const { + auto chunkIt = std::find_if(cbegin(data), cend(data), [](auto &chunk) { return false == chunk.all(); }); + if (cend(data) == chunkIt) { + return npos; + } + + auto offset = std::countr_one(chunkIt->to_ullong()); + auto block = chunkIt - cbegin(data); + auto pos = block * chunkSize + offset; + if (pos >= arrayLength) { + return npos; + } + return static_cast(pos); + } + + size_t length() const { + return arrayLength; + } +}; + +// Variable-length allocator of bits (positions) +class BitAllocator final { + BitArray bitArray; + + public: + static inline constexpr int64_t npos = BitArray::npos; + + BitAllocator(size_t capacity) : bitArray(capacity) { + } + + int64_t allocate() { + auto pos = bitArray.ffz(); + if (BitArray::npos == pos) { + return npos; + } + bitArray[static_cast(pos)] = true; + return pos; + } + + void free(int64_t pos) { + if (pos < 0) { + DEBUG_BREAK_IF(true); + return; + } + DEBUG_BREAK_IF(false == bitArray[static_cast(pos)]); + bitArray[static_cast(pos)] = false; + } + + size_t sizeInBits() const { + return bitArray.length(); + } +}; + +// Array of opaque elements +template +class OpaqueArray { + using ElementCount = size_t; + + const UnderlyingMemoryHandleT userHandle; + void *const array = nullptr; + const size_t arraySizeInBytes = 0; + const size_t elementStrideInBytes = 0; + + protected: + size_t idx(void *ptr) const { + DEBUG_BREAK_IF(false == contains(ptr)); + auto byteOffset = ptrDiff(ptr, array); + return byteOffset / elementStrideInBytes; + } + + public: + template + OpaqueArray(T &&handle, void *array, size_t elementStrideInBytes, size_t numElementsInArray) + : userHandle(std::forward(handle)), array(reinterpret_cast(array)), + arraySizeInBytes(numElementsInArray * elementStrideInBytes), elementStrideInBytes(elementStrideInBytes) { + } + + const UnderlyingMemoryHandleT &handle() const { + return userHandle; + } + + void *element(size_t pos) { + return ptrOffset(array, pos * elementStrideInBytes); + } + + void *base() const { + return array; + } + + bool contains(void *ptr) const { + if (byteRangeContains(array, arraySizeInBytes, ptr)) { + DEBUG_BREAK_IF(ptrDiff(ptr, array) + elementStrideInBytes > arraySizeInBytes); + return true; + } + return false; + } +}; + +// Fast fixed-size allocator of opaque elements (of uniform size) +template +class OpaqueArrayElementAllocator final : public OpaqueArray { + BitAllocator bitAllocator; + + public: + template + OpaqueArrayElementAllocator(T &&handle, void *array, size_t elementStrideInBytes, size_t numElementsInArray) + : OpaqueArray(std::forward(handle), array, elementStrideInBytes, numElementsInArray), + bitAllocator(numElementsInArray) { + } + + void *allocate() { + auto pos = bitAllocator.allocate(); + if (BitAllocator::npos == pos) { + return nullptr; + } + return this->element(static_cast(pos)); + } + + bool free(void *el) { + if (this->contains(el) == false) { + return false; + } + + auto pos = this->idx(el); + bitAllocator.free(pos); + return true; + } +}; + +template +struct UnderlyingAllocator { + using AllocationT = std::pair; + + std::function allocate; + std::function free; +}; + +// Fast dynamic-size allocator of opaque elements (of uniform size) +template +class OpaqueElementAllocator final { + public: + using UnderlyingAllocatorT = UnderlyingAllocator; + using AllocationT = UnderlyingAllocatorT::AllocationT; + + private: + const size_t chunkSize; + const size_t alignedElementSize; + + UnderlyingAllocatorT underlyingAllocator; + using ChunkT = OpaqueArrayElementAllocator; + std::vector chunks; + + public: + OpaqueElementAllocator(size_t chunkSize, size_t alignedElementSize, + UnderlyingAllocatorT underlyingAllocator) : chunkSize(chunkSize), alignedElementSize(alignedElementSize), + underlyingAllocator(std::move(underlyingAllocator)) { + UNRECOVERABLE_IF(chunkSize < alignedElementSize); + DEBUG_BREAK_IF((chunkSize % alignedElementSize) != 0); + } + + ~OpaqueElementAllocator() { + for (auto &chunk : chunks) { + underlyingAllocator.free({std::move(chunk.handle()), chunk.base()}); + } + } + + AllocationT allocate() { + for (auto &chunk : chunks) { + auto *va = chunk.allocate(); + if (va) { + return {chunk.handle(), va}; + } + } + + auto alloc = underlyingAllocator.allocate(chunkSize, alignedElementSize); + if (nullptr == alloc.second) { + return {}; + } + chunks.emplace_back(std::move(alloc.first), alloc.second, alignedElementSize, chunkSize / alignedElementSize); + + return {chunks.rbegin()->handle(), chunks.rbegin()->allocate()}; + } + + bool free(void *ptr) { + return std::ranges::any_of(chunks, + [=](auto &chunk) { return chunk.free(ptr); }); + } + + bool contains(void *ptr) const { + return std::ranges::any_of(chunks, + [=](const auto &chunk) { return chunk.contains(ptr); }); + } +}; diff --git a/shared/source/utilities/stackvec.h b/shared/source/utilities/stackvec.h index c3584af81a..55e394e696 100644 --- a/shared/source/utilities/stackvec.h +++ b/shared/source/utilities/stackvec.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -151,6 +151,14 @@ class StackVec { // NOLINT(clang-analyzer-optin.performance.Padding) return *this; } + template + constexpr iterator insert(const_iterator pos, T &&value) { + auto offset = pos - begin(); + push_back(std::forward(value)); + std::rotate(begin() + offset, end() - 1, end()); + return begin() + offset; + } + template void swap(RhsT &rhs) { if (this->usesDynamicMem() && rhs.usesDynamicMem()) { @@ -332,13 +340,20 @@ class StackVec { // NOLINT(clang-analyzer-optin.performance.Padding) return reinterpret_cast(this->onStackMem) != reinterpret_cast(onStackMemRawBytes) && this->dynamicMem; } - auto data() { + DataType *data() { if (usesDynamicMem()) { return dynamicMem->data(); } return reinterpret_cast(onStackMemRawBytes); } + const DataType *data() const { + if (usesDynamicMem()) { + return dynamicMem->data(); + } + return reinterpret_cast(onStackMemRawBytes); + } + private: template friend class StackVec; diff --git a/shared/test/unit_test/helpers/ptr_math_tests.cpp b/shared/test/unit_test/helpers/ptr_math_tests.cpp index e6e07a6890..82a5299f5a 100644 --- a/shared/test/unit_test/helpers/ptr_math_tests.cpp +++ b/shared/test/unit_test/helpers/ptr_math_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -92,3 +92,20 @@ TEST(ptrDiff, WhenGettingPtrDiffThen64BitIsPreserved) { auto ptrBefore3 = ptrDiff(ptrAfter, 0x1234ull); EXPECT_EQ(0x800000000ull, ptrBefore3); } + +TEST(ByteRangeContains, WhenInRangeThenReturnTrue) { + char stack[3] = {}; + EXPECT_TRUE(byteRangeContains(stack, 3, &stack[0])); + EXPECT_TRUE(byteRangeContains(stack, 3, &stack[1])); + EXPECT_TRUE(byteRangeContains(stack, 3, &stack[2])); +} + +TEST(ByteRangeContains, WhenNotInRangeThenReturnFalse) { + char stack[3] = {}; + std::vector heap(3, 0); + EXPECT_FALSE(byteRangeContains(stack + 1, 1, &stack[2])); + EXPECT_FALSE(byteRangeContains(stack + 1, 1, &stack[0])); + EXPECT_FALSE(byteRangeContains(stack, 3, &heap[0])); + EXPECT_FALSE(byteRangeContains(stack, 3, &heap[1])); + EXPECT_FALSE(byteRangeContains(stack, 3, &heap[2])); +} diff --git a/shared/test/unit_test/utilities/containers_tests.cpp b/shared/test/unit_test/utilities/containers_tests.cpp index 6cb65e4ce2..f51d962c15 100644 --- a/shared/test/unit_test/utilities/containers_tests.cpp +++ b/shared/test/unit_test/utilities/containers_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/utilities/arrayref.h" +#include "shared/source/utilities/bitcontainers.h" #include "shared/source/utilities/idlist.h" #include "shared/source/utilities/iflist.h" #include "shared/source/utilities/lookup_array.h" @@ -15,9 +16,11 @@ #include "gtest/gtest.h" +#include #include #include #include +#include #include #include @@ -1654,13 +1657,13 @@ TEST(StackVec, WhenCallingDataThenVectorDataIsReturned) { char dataB[] = {5, 4, 3, 2, 1}; StackVec stackVecA{dataA, dataA + sizeof(dataA)}; - StackVec stackVecB{dataB, dataB + sizeof(dataB)}; + const StackVec stackVecB{dataB, dataB + sizeof(dataB)}; EXPECT_TRUE(stackVecA.usesDynamicMem()); EXPECT_FALSE(stackVecB.usesDynamicMem()); - auto stackVecAData = reinterpret_cast(stackVecA.data()); - auto stackVecBData = reinterpret_cast(stackVecB.data()); + auto stackVecAData = stackVecA.data(); + auto stackVecBData = stackVecB.data(); for (size_t i = 0; i < 5; i++) { EXPECT_EQ(dataA[i], stackVecAData[i]); EXPECT_EQ(dataB[i], stackVecBData[i]); @@ -1728,6 +1731,44 @@ TEST(StackVec, whenPushingUniqueToRootDeviceIndicesContainerThenOnlyUniqueValues } } +TEST(StackVec, WhenInsertingAtGivenPositionThenOrderIsMaintained) { + { + StackVec tested; + tested.insert(tested.end(), 12); + + StackVec expected{12}; + EXPECT_EQ(expected, tested); + } + { + StackVec tested{3, 5, 7, 11}; + tested.insert(tested.end(), 12); + tested.insert(tested.begin(), 2); + tested.insert(tested.begin() + 2, 4); + + StackVec expected{2, 3, 4, 5, 7, 11, 12}; + EXPECT_EQ(expected, tested); + } + { + StackVec testedB{3}; + testedB.insert(testedB.begin(), 2); + StackVec testedE{2}; + testedE.insert(testedE.end(), 3); + + StackVec expected{2, 3}; + EXPECT_EQ(expected, testedB); + EXPECT_EQ(expected, testedE); + } + { + StackVec tested{3, 5, 7, 11}; + tested.insert(tested.end(), 12); + tested.insert(tested.begin(), 2); + tested.insert(tested.begin() + 2, 4); + + StackVec expected{2, 3, 4, 5, 7, 11, 12}; + EXPECT_EQ(expected, tested); + } +} + int sum(ArrayRef a) { int sum = 0; for (auto v : a) { @@ -1968,4 +2009,204 @@ TEST(LookupArrayLookUpGreaterEqual, WhenLookingForElementThenReturnFirstThatIsEq EXPECT_EQ(90, res); EXPECT_THROW(res = arr.lookUpGreaterEqual(10), std::exception); -} \ No newline at end of file +} + +TEST(BitArray, WhenFfzIsCalledThenReturnsFirstZeroBitOrMinusAllBitAreSet) { + { + BitArray ba{3}; + ASSERT_EQ(3U, ba.length()); + auto pos = ba.ffz(); + EXPECT_EQ(0, pos); + pos = ba.ffz(); + EXPECT_EQ(0, pos); + ba[0] = true; + pos = ba.ffz(); + EXPECT_EQ(1, pos); + pos = ba.ffz(); + EXPECT_EQ(1, pos); + ba[1] = true; + pos = ba.ffz(); + EXPECT_EQ(2, pos); + ba[2] = true; + pos = ba.ffz(); + EXPECT_EQ(-1, pos); + } + { + BitArray ba{64}; + ASSERT_EQ(64U, ba.length()); + for (int i = 0; i < 64; ++i) { + ba[i] = true; + } + auto pos = ba.ffz(); + EXPECT_EQ(-1, pos); + } + { + BitArray ba{4097}; + ASSERT_EQ(4097U, ba.length()); + for (int i = 0; i < 4095; ++i) { + ba[i] = true; + } + auto pos = ba.ffz(); + EXPECT_EQ(4095, pos); + ba[4095] = true; + pos = ba.ffz(); + EXPECT_EQ(4096, pos); + ba[4096] = true; + pos = ba.ffz(); + EXPECT_EQ(-1, pos); + + ba[796] = false; + pos = ba.ffz(); + EXPECT_EQ(796, pos); + } +} + +TEST(BitAllocator, WhenAllocateIsCalledThenAllocatesAndReturnsFirstFreeBitPositionOrMinusOneIfEmpty) { + { + BitAllocator ba{3}; + ASSERT_EQ(3U, ba.sizeInBits()); + auto pos = ba.allocate(); + EXPECT_EQ(0, pos); + pos = ba.allocate(); + EXPECT_EQ(1, pos); + pos = ba.allocate(); + EXPECT_EQ(2, pos); + pos = ba.allocate(); + EXPECT_EQ(-1, pos); + + ba.free(1); + pos = ba.allocate(); + EXPECT_EQ(1, pos); + pos = ba.allocate(); + EXPECT_EQ(-1, pos); + + ba.free(0); + pos = ba.allocate(); + EXPECT_EQ(0, pos); + pos = ba.allocate(); + EXPECT_EQ(-1, pos); + } + { + BitAllocator ba{64}; + ASSERT_EQ(64U, ba.sizeInBits()); + for (int i = 0; i < 64; ++i) { + EXPECT_EQ(i, ba.allocate()); + } + auto pos = ba.allocate(); + EXPECT_EQ(-1, pos); + } + { + BitAllocator ba{4097}; + ASSERT_EQ(4097U, ba.sizeInBits()); + for (int i = 0; i < 4095; ++i) { + EXPECT_EQ(i, ba.allocate()); + } + auto pos = ba.allocate(); + EXPECT_EQ(4095, pos); + pos = ba.allocate(); + EXPECT_EQ(4096, pos); + pos = ba.allocate(); + EXPECT_EQ(-1, pos); + + ba.free(796); + pos = ba.allocate(); + EXPECT_EQ(796, pos); + } +} + +TEST(OpaqueArray, WhenAllocateIsCalledThenAllocatesAndReturnsFirstFreeElementOrNullIfEmpty) { + struct GraphicsAllocationHandleExample { + } handle; + double underlyingMemory[5] = {1, 2, 3, 4, 5}; + const size_t elementStride = 2 * sizeof(double); + { + OpaqueArrayElementAllocator oa{&handle, underlyingMemory, elementStride, sizeof(underlyingMemory) / elementStride}; + EXPECT_EQ(underlyingMemory, oa.base()); + EXPECT_EQ(&handle, oa.handle()); + + EXPECT_TRUE(oa.contains(underlyingMemory)); + EXPECT_TRUE(oa.contains(underlyingMemory + 1)); + EXPECT_TRUE(oa.contains(underlyingMemory + 2)); + EXPECT_TRUE(oa.contains(underlyingMemory + 3)); + EXPECT_FALSE(oa.contains(underlyingMemory + 4)); + + auto el0 = oa.allocate(); + EXPECT_EQ(underlyingMemory, el0); + EXPECT_EQ(el0, oa.element(0)); + + auto el1 = oa.allocate(); + EXPECT_EQ(underlyingMemory + 2, el1); + EXPECT_EQ(el1, oa.element(1)); + + auto el2 = oa.allocate(); + EXPECT_EQ(nullptr, el2); + + oa.free(el0); + el2 = oa.allocate(); + EXPECT_EQ(underlyingMemory, el2); + } +} + +TEST(OpaqueElementAllocator, WhenAllocateIsCalledThenAllocatesAndReturnsFirstFreeElementAndGrowsIfNeeded) { + struct GraphicsAllocationHandleExample {}; + using AllocatorT = OpaqueElementAllocator; + using AllocationT = AllocatorT::AllocationT; + + std::set graphicsAllocations; + { + const size_t chunkSize = 256; + const size_t elementSize = 64; + AllocatorT allocator(chunkSize, elementSize, AllocatorT::UnderlyingAllocatorT{.allocate = [&](size_t s, size_t a) -> AllocationT { + auto newGa = new GraphicsAllocationHandleExample; + graphicsAllocations.insert(newGa); + return {newGa, alignedMalloc(s, a)}; + }, + .free = [&](AllocationT alloc) { + UNRECOVERABLE_IF(1 != graphicsAllocations.count(alloc.first)); + graphicsAllocations.erase(alloc.first); + delete alloc.first; + alignedFree(alloc.second); }}); + + auto elsPerChunk = chunkSize / elementSize; + std::set allocated; + int toAllocate = static_cast(elsPerChunk * 3 + 1); + for (int i = 0; i < toAllocate; ++i) { + AllocationT el = allocator.allocate(); + EXPECT_NE(nullptr, el.first); + EXPECT_NE(nullptr, el.second); + EXPECT_EQ(1U, graphicsAllocations.count(el.first)); + EXPECT_EQ(0U, allocated.count(el.second)); + allocated.insert(el.second); + + EXPECT_TRUE(isAligned(reinterpret_cast(el.second), elementSize)); + EXPECT_EQ(alignUp(i + 1, elsPerChunk) / elsPerChunk, graphicsAllocations.size()); + } + + EXPECT_EQ(alignUp(toAllocate, elsPerChunk) / elsPerChunk, graphicsAllocations.size()); + for (auto ptr : allocated) { + EXPECT_TRUE(allocator.contains(ptr)); + } + int stackMem = 0; + EXPECT_FALSE(allocator.contains(&stackMem)); + EXPECT_FALSE(allocator.free(&stackMem)); + + EXPECT_EQ(alignUp(toAllocate, elsPerChunk) / elsPerChunk, graphicsAllocations.size()); + for (auto ptr : allocated) { + EXPECT_TRUE(allocator.free(ptr)); + } + + allocated.clear(); + for (int i = 0; i < toAllocate; ++i) { + AllocationT el = allocator.allocate(); + EXPECT_NE(nullptr, el.first); + EXPECT_NE(nullptr, el.second); + EXPECT_EQ(1U, graphicsAllocations.count(el.first)); + EXPECT_EQ(0U, allocated.count(el.second)); + allocated.insert(el.second); + + EXPECT_TRUE(isAligned(reinterpret_cast(el.second), elementSize)); + } + EXPECT_EQ(alignUp(toAllocate, elsPerChunk) / elsPerChunk, graphicsAllocations.size()) << " alloc reuse failed"; + } + EXPECT_EQ(0U, graphicsAllocations.size()); +}