Refactor TagAllocator

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>

Author: Bartosz Dunajski
Date: 2021-03-24 18:21:13 +00:00
Committed by: Compute-Runtime-Automation
Parent: cb4db7767e
Commit: 5a50ad098c
49 changed files with 868 additions and 430 deletions

View File

@@ -606,15 +606,15 @@ bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surfa
return true;
}
TagAllocator<HwTimeStamps> *CommandStreamReceiver::getEventTsAllocator() {
TagAllocatorBase *CommandStreamReceiver::getEventTsAllocator() {
if (profilingTimeStampAllocator.get() == nullptr) {
profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(
rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false, osContext->getDeviceBitfield());
profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize,
sizeof(HwTimeStamps), false, osContext->getDeviceBitfield());
}
return profilingTimeStampAllocator.get();
}
TagAllocator<HwPerfCounter> *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) {
TagAllocatorBase *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) {
if (perfCounterAllocator.get() == nullptr) {
perfCounterAllocator = std::make_unique<TagAllocator<HwPerfCounter>>(
rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, tagSize, false, osContext->getDeviceBitfield());
@@ -622,15 +622,15 @@ TagAllocator<HwPerfCounter> *CommandStreamReceiver::getEventPerfCountAllocator(c
return perfCounterAllocator.get();
}
TagAllocator<TimestampPacketStorage> *CommandStreamReceiver::getTimestampPacketAllocator() {
TagAllocatorBase *CommandStreamReceiver::getTimestampPacketAllocator() {
if (timestampPacketAllocator.get() == nullptr) {
// don't release nodes in aub/tbx mode, to avoid removing semaphores optimization or reusing returned tags
bool doNotReleaseNodes = (getType() > CommandStreamReceiverType::CSR_HW) ||
DebugManager.flags.DisableTimestampPacketOptimizations.get();
timestampPacketAllocator = std::make_unique<TagAllocator<TimestampPacketStorage>>(
timestampPacketAllocator = std::make_unique<TagAllocator<NEO::TimestampPackets<uint32_t>>>(
rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize * 4,
sizeof(TimestampPacketStorage), doNotReleaseNodes, osContext->getDeviceBitfield());
sizeof(NEO::TimestampPackets<uint32_t>), doNotReleaseNodes, osContext->getDeviceBitfield());
}
return timestampPacketAllocator.get();
}
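
A minimal standalone sketch of the accessor pattern above, assuming simplified stand-in types (AllocatorBase, Allocator, MyTag, Receiver are illustrative, not the NEO classes): the receiver owns a std::unique_ptr to the base class and lazily constructs the concrete templated allocator on first use, so callers only ever see the base pointer.

#include <cstddef>
#include <cstdint>
#include <memory>

class AllocatorBase {
  public:
    virtual ~AllocatorBase() = default;
};

template <typename TagType>
class Allocator : public AllocatorBase {
  public:
    explicit Allocator(size_t tagSize) : tagSize(tagSize) {}

  protected:
    size_t tagSize;
};

struct MyTag {
    uint64_t start = 0;
    uint64_t end = 0;
};

class Receiver {
  public:
    AllocatorBase *getTagAllocator() {
        if (tagAllocator == nullptr) {
            // the concrete type is named only here; callers work with AllocatorBase*
            tagAllocator = std::make_unique<Allocator<MyTag>>(sizeof(MyTag));
        }
        return tagAllocator.get();
    }

  protected:
    std::unique_ptr<AllocatorBase> tagAllocator;
};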

View File

@@ -43,8 +43,9 @@ class MultiGraphicsAllocation;
class OsContext;
class OSInterface;
class ScratchSpaceController;
struct HwPerfCounter;
struct HwTimeStamps;
class HwPerfCounter;
class HwTimeStamps;
class TagAllocatorBase;
template <typename TSize>
class TimestampPackets;
@@ -192,9 +193,9 @@ class CommandStreamReceiver {
virtual void setupContext(OsContext &osContext) { this->osContext = &osContext; }
OsContext &getOsContext() const { return *osContext; }
TagAllocator<HwTimeStamps> *getEventTsAllocator();
TagAllocator<HwPerfCounter> *getEventPerfCountAllocator(const uint32_t tagSize);
TagAllocator<TimestampPacketStorage> *getTimestampPacketAllocator();
TagAllocatorBase *getEventTsAllocator();
TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize);
TagAllocatorBase *getTimestampPacketAllocator();
virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation);
@@ -261,9 +262,9 @@ class CommandStreamReceiver {
std::unique_ptr<InternalAllocationStorage> internalAllocationStorage;
std::unique_ptr<KmdNotifyHelper> kmdNotifyHelper;
std::unique_ptr<ScratchSpaceController> scratchSpaceController;
std::unique_ptr<TagAllocator<HwTimeStamps>> profilingTimeStampAllocator;
std::unique_ptr<TagAllocator<HwPerfCounter>> perfCounterAllocator;
std::unique_ptr<TagAllocator<TimestampPacketStorage>> timestampPacketAllocator;
std::unique_ptr<TagAllocatorBase> profilingTimeStampAllocator;
std::unique_ptr<TagAllocatorBase> perfCounterAllocator;
std::unique_ptr<TagAllocatorBase> timestampPacketAllocator;
std::unique_ptr<Thread> userPauseConfirmation;
ResidencyContainer residencyAllocations;

View File

@@ -26,16 +26,17 @@ class LinearStream;
struct RootDeviceEnvironment;
template <typename TagType>
struct TagNode;
class TagNode;
template <typename TSize>
class TimestampPackets;
class TagNodeBase;
struct BlitProperties;
struct HardwareInfo;
struct TimestampPacketDependencies;
using BlitPropertiesContainer = StackVec<BlitProperties, 16>;
using TimestampPacketStorage = TimestampPackets<uint32_t>;
struct BlitProperties {
static BlitProperties constructPropertiesForReadWriteBuffer(BlitterConstants::BlitDirection blitDirection,
@@ -60,7 +61,7 @@ struct BlitProperties {
TimestampPacketContainer &kernelTimestamps, const CsrDependencies &depsFromEvents,
CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr);
TagNode<TimestampPacketStorage> *outputTimestampPacket = nullptr;
TagNodeBase *outputTimestampPacket = nullptr;
BlitterConstants::BlitDirection blitDirection;
CsrDependencies csrDependencies;
AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -23,4 +23,13 @@ enum class DebugPauseState : uint32_t {
hasUserEndConfirmation,
terminate
};
class TagTypeBase {
};
enum class TagNodeType {
TimestampPacket,
HwTimeStamps,
HwPerfCounter
};
} // namespace NEO
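
As a hypothetical illustration of how these additions are meant to be consumed (MyTimestampTag is an invented example type, not part of the commit): a tag type derives from TagTypeBase and reports its kind through a constexpr getTagNodeType(), which the templated node code can then branch on at compile time.

enum class TagNodeType { TimestampPacket, HwTimeStamps, HwPerfCounter };

class TagTypeBase {
};

class MyTimestampTag : public TagTypeBase {
  public:
    static constexpr TagNodeType getTagNodeType() { return TagNodeType::TimestampPacket; }
};

static_assert(MyTimestampTag::getTagNodeType() == TagNodeType::TimestampPacket,
              "the tag kind is known at compile time");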

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2020 Intel Corporation
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -13,7 +13,7 @@
using namespace NEO;
void TimestampPacketContainer::add(Node *timestampPacketNode) {
void TimestampPacketContainer::add(TagNodeBase *timestampPacketNode) {
timestampPacketNodes.push_back(timestampPacketNode);
}
@@ -28,7 +28,7 @@ void TimestampPacketContainer::swapNodes(TimestampPacketContainer &timestampPack
}
void TimestampPacketContainer::resolveDependencies(bool clearAllDependencies) {
std::vector<Node *> pendingNodes;
std::vector<TagNodeBase *> pendingNodes;
for (auto node : timestampPacketNodes) {
if (node->canBeReleased() || clearAllDependencies) {

View File

@@ -31,7 +31,7 @@ constexpr uint32_t preferredPacketCount = 16u;
#pragma pack(1)
template <typename TSize>
class TimestampPackets {
class TimestampPackets : public TagTypeBase {
public:
struct Packet {
TSize contextStart = 1u;
@@ -40,10 +40,14 @@ class TimestampPackets {
TSize globalEnd = 1u;
};
static GraphicsAllocation::AllocationType getAllocationType() {
static constexpr GraphicsAllocation::AllocationType getAllocationType() {
return GraphicsAllocation::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER;
}
static constexpr TagNodeType getTagNodeType() { return TagNodeType::TimestampPacket; }
size_t getSinglePacketSize() const { return sizeof(Packet); }
bool isCompleted() const {
if (DebugManager.flags.DisableAtomicForPostSyncs.get()) {
return false;
@@ -96,29 +100,25 @@ class TimestampPackets {
};
#pragma pack()
using TimestampPacketStorage = TimestampPackets<uint32_t>;
static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPacketStorage),
static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPackets<uint32_t>),
"This structure is consumed by GPU and has to follow specific restrictions for padding and size");
class TimestampPacketContainer : public NonCopyableClass {
public:
using Node = TagNode<TimestampPacketStorage>;
TimestampPacketContainer() = default;
TimestampPacketContainer(TimestampPacketContainer &&) = default;
TimestampPacketContainer &operator=(TimestampPacketContainer &&) = default;
MOCKABLE_VIRTUAL ~TimestampPacketContainer();
const std::vector<Node *> &peekNodes() const { return timestampPacketNodes; }
void add(Node *timestampPacketNode);
const std::vector<TagNodeBase *> &peekNodes() const { return timestampPacketNodes; }
void add(TagNodeBase *timestampPacketNode);
void swapNodes(TimestampPacketContainer &timestampPacketContainer);
void assignAndIncrementNodesRefCounts(const TimestampPacketContainer &inputTimestampPacketContainer);
void resolveDependencies(bool clearAllDependencies);
void makeResident(CommandStreamReceiver &commandStreamReceiver);
protected:
std::vector<Node *> timestampPacketNodes;
std::vector<TagNodeBase *> timestampPacketNodes;
};
struct TimestampPacketDependencies : public NonCopyableClass {
@@ -130,27 +130,27 @@ struct TimestampPacketDependencies : public NonCopyableClass {
};
struct TimestampPacketHelper {
static uint64_t getContextEndGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getContextEndOffset();
static uint64_t getContextEndGpuAddress(const TagNodeBase &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getContextEndOffset();
}
static uint64_t getContextStartGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getContextStartOffset();
static uint64_t getContextStartGpuAddress(const TagNodeBase &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getContextStartOffset();
}
static uint64_t getGlobalEndGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getGlobalEndOffset();
static uint64_t getGlobalEndGpuAddress(const TagNodeBase &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalEndOffset();
}
static uint64_t getGlobalStartGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getGlobalStartOffset();
static uint64_t getGlobalStartGpuAddress(const TagNodeBase &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalStartOffset();
}
static uint64_t getGpuDependenciesCountGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getImplicitGpuDependenciesCountOffset();
static uint64_t getGpuDependenciesCountGpuAddress(const TagNodeBase &timestampPacketNode) {
return timestampPacketNode.getGpuAddress() + timestampPacketNode.getImplicitGpuDependenciesCountOffset();
}
static void overrideSupportedDevicesCount(uint32_t &numSupportedDevices);
template <typename GfxFamily>
static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNode<TimestampPacketStorage> &timestampPacketNode, uint32_t numSupportedDevices) {
static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNodeBase &timestampPacketNode, uint32_t numSupportedDevices) {
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
@@ -158,8 +158,8 @@ struct TimestampPacketHelper {
auto compareAddress = getContextEndGpuAddress(timestampPacketNode);
auto dependenciesCountAddress = getGpuDependenciesCountGpuAddress(timestampPacketNode);
for (uint32_t packetId = 0; packetId < timestampPacketNode.tagForCpuAccess->getPacketsUsed(); packetId++) {
uint64_t compareOffset = packetId * sizeof(TimestampPacketStorage::Packet);
for (uint32_t packetId = 0; packetId < timestampPacketNode.getPacketsUsed(); packetId++) {
uint64_t compareOffset = packetId * timestampPacketNode.getSinglePacketSize();
EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream, compareAddress + compareOffset, 1, COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
}
@@ -231,8 +231,8 @@ struct TimestampPacketHelper {
}
template <typename GfxFamily>
static size_t getRequiredCmdStreamSizeForNodeDependency(TagNode<TimestampPacketStorage> &timestampPacketNode) {
size_t totalMiSemaphoreWaitSize = timestampPacketNode.tagForCpuAccess->getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
static size_t getRequiredCmdStreamSizeForNodeDependency(TagNodeBase &timestampPacketNode) {
size_t totalMiSemaphoreWaitSize = timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
return totalMiSemaphoreWaitSize + sizeof(typename GfxFamily::MI_ATOMIC);
}
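
The helper changes above all follow one shape: the GPU address of a timestamp field is the node's base GPU address plus a field offset that the node now reports through a virtual call, so the helpers no longer need the concrete TagType. A minimal sketch of that computation, assuming simplified stand-in types rather than the NEO classes:

#include <cstddef>
#include <cstdint>

class NodeBase {
  public:
    virtual ~NodeBase() = default;
    virtual uint64_t getGpuAddress() const = 0;      // base GPU address of the tag
    virtual size_t getContextEndOffset() const = 0;  // field offset inside the tag
};

// Same shape as getContextEndGpuAddress() above: base address plus reported offset.
inline uint64_t getContextEndGpuAddress(const NodeBase &node) {
    return node.getGpuAddress() + node.getContextEndOffset();
}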

View File

@@ -34,7 +34,9 @@ set(NEO_CORE_UTILITIES
${CMAKE_CURRENT_SOURCE_DIR}/software_tags_manager.h
${CMAKE_CURRENT_SOURCE_DIR}/spinlock.h
${CMAKE_CURRENT_SOURCE_DIR}/stackvec.h
${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.h
${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.inl
${CMAKE_CURRENT_SOURCE_DIR}/time_measure_wrapper.h
${CMAKE_CURRENT_SOURCE_DIR}/timer_util.h
)

View File

@@ -0,0 +1,35 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/utilities/tag_allocator.h"
namespace NEO {
TagAllocatorBase::TagAllocatorBase(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, DeviceBitfield deviceBitfield)
: deviceBitfield(deviceBitfield), rootDeviceIndex(rootDeviceIndex), memoryManager(memMngr), tagCount(tagCount), tagSize(tagSize), doNotReleaseNodes(doNotReleaseNodes) {
this->tagSize = alignUp(tagSize, tagAlignment);
}
void TagAllocatorBase::cleanUpResources() {
for (auto gfxAllocation : gfxAllocations) {
memoryManager->freeGraphicsMemory(gfxAllocation);
}
gfxAllocations.clear();
}
void TagNodeBase::returnTag() {
allocator->returnTag(this);
}
bool TagNodeBase::canBeReleased() const {
return (!doNotReleaseNodes) &&
(isCompleted()) &&
(getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount());
}
} // namespace NEO
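
A simplified sketch of the release decision implemented by returnTag() and canBeReleased() above, assuming invented stand-in types (Node, Pool) and omitting the implicit dependency-count check: when the last reference is dropped, the node either goes back to the free pool or is deferred until it can actually be released.

#include <atomic>
#include <cstdint>

struct Node {
    std::atomic<uint32_t> refCount{1};
    bool completed = false;     // stands in for isCompleted()
    bool doNotRelease = false;  // stands in for doNotReleaseNodes

    bool canBeReleased() const { return !doNotRelease && completed; }
};

enum class Pool { Free, Deferred, StillReferenced };

Pool returnTag(Node &node) {
    if (node.refCount.fetch_sub(1) == 1) {  // last reference dropped
        return node.canBeReleased() ? Pool::Free : Pool::Deferred;
    }
    return Pool::StillReferenced;  // other owners still hold the node
}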

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2017-2020 Intel Corporation
* Copyright (C) 2017-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -14,6 +14,7 @@
#include <atomic>
#include <cstdint>
#include <mutex>
#include <type_traits>
#include <vector>
namespace NEO {
@@ -23,28 +24,31 @@ template <typename TagType>
class TagAllocator;
template <typename TagType>
struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
class TagNode;
class TagAllocatorBase;
class TagNodeBase : public NonCopyableOrMovableClass {
public:
TagType *tagForCpuAccess;
virtual ~TagNodeBase() = default;
GraphicsAllocation *getBaseGraphicsAllocation() const { return gfxAllocation; }
uint64_t getGpuAddress() const { return gpuAddress; }
void incRefCount() { refCount++; }
MOCKABLE_VIRTUAL void returnTag() {
allocator->returnTag(this);
}
uint32_t refCountFetchSub(uint32_t value) { return refCount.fetch_sub(value); }
bool canBeReleased() const {
return (!doNotReleaseNodes) &&
(tagForCpuAccess->isCompleted()) &&
(tagForCpuAccess->getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount());
}
MOCKABLE_VIRTUAL void returnTag();
void setDoNotReleaseNodes(bool doNotRelease) {
doNotReleaseNodes = doNotRelease;
}
virtual void initialize() = 0;
bool canBeReleased() const;
virtual void *getCpuBase() const = 0;
void setDoNotReleaseNodes(bool doNotRelease) { doNotReleaseNodes = doNotRelease; }
void setProfilingCapable(bool capable) { profilingCapable = capable; }
@@ -52,18 +56,42 @@ struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
void incImplicitCpuDependenciesCount() { implicitCpuDependenciesCount++; }
void initialize() {
tagForCpuAccess->initialize();
implicitCpuDependenciesCount.store(0);
setProfilingCapable(true);
}
uint32_t getImplicitCpuDependenciesCount() const { return implicitCpuDependenciesCount.load(); }
const TagAllocator<TagType> *getAllocator() const { return allocator; }
const TagAllocatorBase *getAllocator() const { return allocator; }
// TagType specific calls
virtual bool isCompleted() const = 0;
virtual void assignDataToAllTimestamps(uint32_t packetIndex, void *source) = 0;
virtual size_t getGlobalStartOffset() const = 0;
virtual size_t getContextStartOffset() const = 0;
virtual size_t getContextEndOffset() const = 0;
virtual size_t getGlobalEndOffset() const = 0;
virtual size_t getImplicitGpuDependenciesCountOffset() const = 0;
virtual uint64_t getContextStartValue(uint32_t packetIndex) const = 0;
virtual uint64_t getGlobalStartValue(uint32_t packetIndex) const = 0;
virtual uint64_t getContextEndValue(uint32_t packetIndex) const = 0;
virtual uint64_t getGlobalEndValue(uint32_t packetIndex) const = 0;
virtual uint64_t &getGlobalEndRef() const = 0;
virtual uint64_t &getContextCompleteRef() const = 0;
virtual void setPacketsUsed(uint32_t used) = 0;
virtual uint32_t getPacketsUsed() const = 0;
virtual size_t getSinglePacketSize() const = 0;
virtual uint32_t getImplicitGpuDependenciesCount() const = 0;
virtual MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const = 0;
protected:
TagAllocator<TagType> *allocator = nullptr;
TagNodeBase() = default;
TagAllocatorBase *allocator = nullptr;
GraphicsAllocation *gfxAllocation = nullptr;
uint64_t gpuAddress = 0;
std::atomic<uint32_t> refCount{0};
@@ -71,71 +99,78 @@ struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
bool doNotReleaseNodes = false;
bool profilingCapable = true;
template <typename TagType2>
template <typename TagType>
friend class TagAllocator;
};
template <typename TagType>
class TagAllocator {
class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {
static_assert(!std::is_polymorphic<TagType>::value,
"This structure is consumed by GPU and has to follow specific restrictions for padding and size");
public:
using NodeType = TagNode<TagType>;
TagType *tagForCpuAccess;
TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes,
DeviceBitfield deviceBitfield) : deviceBitfield(deviceBitfield),
rootDeviceIndex(rootDeviceIndex),
memoryManager(memMngr),
tagCount(tagCount),
doNotReleaseNodes(doNotReleaseNodes) {
this->tagSize = alignUp(tagSize, tagAlignment);
populateFreeTags();
void initialize() override {
tagForCpuAccess->initialize();
implicitCpuDependenciesCount.store(0);
setProfilingCapable(true);
}
MOCKABLE_VIRTUAL ~TagAllocator() {
cleanUpResources();
}
void *getCpuBase() const override { return tagForCpuAccess; }
void cleanUpResources() {
for (auto gfxAllocation : gfxAllocations) {
memoryManager->freeGraphicsMemory(gfxAllocation);
}
gfxAllocations.clear();
}
void assignDataToAllTimestamps(uint32_t packetIndex, void *source) override;
NodeType *getTag() {
if (freeTags.peekIsEmpty()) {
releaseDeferredTags();
}
NodeType *node = freeTags.removeFrontOne().release();
if (!node) {
std::unique_lock<std::mutex> lock(allocatorMutex);
populateFreeTags();
node = freeTags.removeFrontOne().release();
}
usedTags.pushFrontOne(*node);
node->incRefCount();
node->initialize();
return node;
}
bool isCompleted() const override;
MOCKABLE_VIRTUAL void returnTag(NodeType *node) {
if (node->refCount.fetch_sub(1) == 1) {
if (node->canBeReleased()) {
returnTagToFreePool(node);
} else {
returnTagToDeferredPool(node);
}
}
}
size_t getGlobalStartOffset() const override;
size_t getContextStartOffset() const override;
size_t getContextEndOffset() const override;
size_t getGlobalEndOffset() const override;
size_t getImplicitGpuDependenciesCountOffset() const override;
uint64_t getContextStartValue(uint32_t packetIndex) const override;
uint64_t getGlobalStartValue(uint32_t packetIndex) const override;
uint64_t getContextEndValue(uint32_t packetIndex) const override;
uint64_t getGlobalEndValue(uint32_t packetIndex) const override;
uint64_t &getGlobalEndRef() const override;
uint64_t &getContextCompleteRef() const override;
void setPacketsUsed(uint32_t used) override;
uint32_t getPacketsUsed() const override;
size_t getSinglePacketSize() const override;
uint32_t getImplicitGpuDependenciesCount() const override;
MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const override;
};
class TagAllocatorBase {
public:
virtual ~TagAllocatorBase() { cleanUpResources(); };
virtual void returnTag(TagNodeBase *node) = 0;
virtual TagNodeBase *getTag() = 0;
protected:
IDList<NodeType> freeTags;
IDList<NodeType> usedTags;
IDList<NodeType> deferredTags;
std::vector<GraphicsAllocation *> gfxAllocations;
std::vector<std::unique_ptr<NodeType[]>> tagPoolMemory;
TagAllocatorBase() = delete;
TagAllocatorBase(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes,
DeviceBitfield deviceBitfield);
virtual void returnTagToFreePool(TagNodeBase *node) = 0;
virtual void returnTagToDeferredPool(TagNodeBase *node) = 0;
virtual void releaseDeferredTags() = 0;
void cleanUpResources();
std::vector<GraphicsAllocation *> gfxAllocations;
const DeviceBitfield deviceBitfield;
const uint32_t rootDeviceIndex;
MemoryManager *memoryManager;
@@ -144,66 +179,38 @@ class TagAllocator {
bool doNotReleaseNodes = false;
std::mutex allocatorMutex;
};
MOCKABLE_VIRTUAL void returnTagToFreePool(NodeType *node) {
NodeType *usedNode = usedTags.removeOne(*node).release();
DEBUG_BREAK_IF(usedNode == nullptr);
UNUSED_VARIABLE(usedNode);
freeTags.pushFrontOne(*node);
}
template <typename TagType>
class TagAllocator : public TagAllocatorBase {
public:
using NodeType = TagNode<TagType>;
void returnTagToDeferredPool(NodeType *node) {
NodeType *usedNode = usedTags.removeOne(*node).release();
DEBUG_BREAK_IF(!usedNode);
deferredTags.pushFrontOne(*usedNode);
}
TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes,
DeviceBitfield deviceBitfield);
void populateFreeTags() {
size_t allocationSizeRequired = tagCount * tagSize;
TagNodeBase *getTag() override;
auto allocationType = TagType::getAllocationType();
AllocationProperties allocationProperties{rootDeviceIndex, allocationSizeRequired, allocationType, deviceBitfield};
GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties);
gfxAllocations.push_back(graphicsAllocation);
void returnTag(TagNodeBase *node) override;
auto nodesMemory = std::make_unique<NodeType[]>(tagCount);
protected:
TagAllocator() = delete;
for (size_t i = 0; i < tagCount; ++i) {
auto tagOffset = i * tagSize;
void returnTagToFreePool(TagNodeBase *node) override;
nodesMemory[i].allocator = this;
nodesMemory[i].gfxAllocation = graphicsAllocation;
nodesMemory[i].tagForCpuAccess = reinterpret_cast<TagType *>(ptrOffset(graphicsAllocation->getUnderlyingBuffer(), tagOffset));
nodesMemory[i].gpuAddress = graphicsAllocation->getGpuAddress() + tagOffset;
nodesMemory[i].setDoNotReleaseNodes(doNotReleaseNodes);
void returnTagToDeferredPool(TagNodeBase *node) override;
freeTags.pushTailOne(nodesMemory[i]);
}
void releaseDeferredTags() override;
tagPoolMemory.push_back(std::move(nodesMemory));
}
void populateFreeTags();
void releaseDeferredTags() {
IDList<NodeType, false> pendingFreeTags;
IDList<NodeType, false> pendingDeferredTags;
auto currentNode = deferredTags.detachNodes();
IDList<NodeType> freeTags;
IDList<NodeType> usedTags;
IDList<NodeType> deferredTags;
while (currentNode != nullptr) {
auto nextNode = currentNode->next;
if (currentNode->canBeReleased()) {
pendingFreeTags.pushFrontOne(*currentNode);
} else {
pendingDeferredTags.pushFrontOne(*currentNode);
}
currentNode = nextNode;
}
if (!pendingFreeTags.peekIsEmpty()) {
freeTags.splice(*pendingFreeTags.detachNodes());
}
if (!pendingDeferredTags.peekIsEmpty()) {
deferredTags.splice(*pendingDeferredTags.detachNodes());
}
}
std::vector<std::unique_ptr<NodeType[]>> tagPoolMemory;
};
} // namespace NEO
#include "shared/source/utilities/tag_allocator.inl"

View File

@@ -0,0 +1,282 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/utilities/tag_allocator.h"
namespace NEO {
template <typename TagType>
TagAllocator<TagType>::TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, size_t tagAlignment,
size_t tagSize, bool doNotReleaseNodes, DeviceBitfield deviceBitfield)
: TagAllocatorBase(rootDeviceIndex, memMngr, tagCount, tagAlignment, tagSize, doNotReleaseNodes, deviceBitfield) {
populateFreeTags();
}
template <typename TagType>
TagNodeBase *TagAllocator<TagType>::getTag() {
if (freeTags.peekIsEmpty()) {
releaseDeferredTags();
}
auto node = freeTags.removeFrontOne().release();
if (!node) {
std::unique_lock<std::mutex> lock(allocatorMutex);
populateFreeTags();
node = freeTags.removeFrontOne().release();
}
usedTags.pushFrontOne(*node);
node->incRefCount();
node->initialize();
return node;
}
template <typename TagType>
void TagAllocator<TagType>::returnTagToFreePool(TagNodeBase *node) {
auto nodeT = static_cast<NodeType *>(node);
auto usedNode = usedTags.removeOne(*nodeT).release();
DEBUG_BREAK_IF(usedNode == nullptr);
UNUSED_VARIABLE(usedNode);
freeTags.pushFrontOne(*nodeT);
}
template <typename TagType>
void TagAllocator<TagType>::returnTagToDeferredPool(TagNodeBase *node) {
auto nodeT = static_cast<NodeType *>(node);
auto usedNode = usedTags.removeOne(*nodeT).release();
DEBUG_BREAK_IF(!usedNode);
deferredTags.pushFrontOne(*usedNode);
}
template <typename TagType>
void TagAllocator<TagType>::releaseDeferredTags() {
IDList<NodeType, false> pendingFreeTags;
IDList<NodeType, false> pendingDeferredTags;
auto currentNode = deferredTags.detachNodes();
while (currentNode != nullptr) {
auto nextNode = currentNode->next;
if (currentNode->canBeReleased()) {
pendingFreeTags.pushFrontOne(*currentNode);
} else {
pendingDeferredTags.pushFrontOne(*currentNode);
}
currentNode = nextNode;
}
if (!pendingFreeTags.peekIsEmpty()) {
freeTags.splice(*pendingFreeTags.detachNodes());
}
if (!pendingDeferredTags.peekIsEmpty()) {
deferredTags.splice(*pendingDeferredTags.detachNodes());
}
}
template <typename TagType>
void TagAllocator<TagType>::populateFreeTags() {
size_t allocationSizeRequired = tagCount * tagSize;
AllocationProperties allocationProperties{rootDeviceIndex, allocationSizeRequired, TagType::getAllocationType(), deviceBitfield};
GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties);
gfxAllocations.push_back(graphicsAllocation);
auto nodesMemory = std::make_unique<NodeType[]>(tagCount);
for (size_t i = 0; i < tagCount; ++i) {
auto tagOffset = i * tagSize;
nodesMemory[i].allocator = this;
nodesMemory[i].gfxAllocation = graphicsAllocation;
nodesMemory[i].tagForCpuAccess = reinterpret_cast<TagType *>(ptrOffset(graphicsAllocation->getUnderlyingBuffer(), tagOffset));
nodesMemory[i].gpuAddress = graphicsAllocation->getGpuAddress() + tagOffset;
nodesMemory[i].setDoNotReleaseNodes(doNotReleaseNodes);
freeTags.pushTailOne(nodesMemory[i]);
}
tagPoolMemory.push_back(std::move(nodesMemory));
}
template <typename TagType>
void TagAllocator<TagType>::returnTag(TagNodeBase *node) {
if (node->refCountFetchSub(1) == 1) {
if (node->canBeReleased()) {
returnTagToFreePool(node);
} else {
returnTagToDeferredPool(node);
}
}
}
template <typename TagType>
size_t TagNode<TagType>::getGlobalStartOffset() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getGlobalStartOffset();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
size_t TagNode<TagType>::getContextStartOffset() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getContextStartOffset();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
size_t TagNode<TagType>::getContextEndOffset() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getContextEndOffset();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
size_t TagNode<TagType>::getGlobalEndOffset() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getGlobalEndOffset();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
size_t TagNode<TagType>::getImplicitGpuDependenciesCountOffset() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getImplicitGpuDependenciesCountOffset();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint64_t TagNode<TagType>::getContextStartValue(uint32_t packetIndex) const {
if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
return tagForCpuAccess->getContextStartValue(packetIndex);
} else {
UNUSED_VARIABLE(packetIndex);
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint64_t TagNode<TagType>::getGlobalStartValue(uint32_t packetIndex) const {
if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
return tagForCpuAccess->getGlobalStartValue(packetIndex);
} else {
UNUSED_VARIABLE(packetIndex);
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint64_t TagNode<TagType>::getContextEndValue(uint32_t packetIndex) const {
if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
return tagForCpuAccess->getContextEndValue(packetIndex);
} else {
UNUSED_VARIABLE(packetIndex);
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint64_t TagNode<TagType>::getGlobalEndValue(uint32_t packetIndex) const {
if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
return tagForCpuAccess->getGlobalEndValue(packetIndex);
} else {
UNUSED_VARIABLE(packetIndex);
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint64_t &TagNode<TagType>::getContextCompleteRef() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::HwTimeStamps) {
return tagForCpuAccess->ContextCompleteTS;
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint64_t &TagNode<TagType>::getGlobalEndRef() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::HwTimeStamps) {
return tagForCpuAccess->GlobalEndTS;
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
void TagNode<TagType>::setPacketsUsed(uint32_t used) {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->setPacketsUsed(used);
} else {
UNUSED_VARIABLE(used);
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint32_t TagNode<TagType>::getPacketsUsed() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getPacketsUsed();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
uint32_t TagNode<TagType>::getImplicitGpuDependenciesCount() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getImplicitGpuDependenciesCount();
} else {
return 0;
}
}
template <typename TagType>
size_t TagNode<TagType>::getSinglePacketSize() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->getSinglePacketSize();
} else {
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
void TagNode<TagType>::assignDataToAllTimestamps(uint32_t packetIndex, void *source) {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->assignDataToAllTimestamps(packetIndex, source);
} else {
UNUSED_VARIABLE(packetIndex);
UNUSED_VARIABLE(source);
UNRECOVERABLE_IF(true);
}
}
template <typename TagType>
bool TagNode<TagType>::isCompleted() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
return tagForCpuAccess->isCompleted();
} else {
return true;
}
}
template <typename TagType>
MetricsLibraryApi::QueryHandle_1_0 &TagNode<TagType>::getQueryHandleRef() const {
if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) {
return tagForCpuAccess->query.handle;
} else {
UNRECOVERABLE_IF(true);
}
}
} // namespace NEO
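
All of the member definitions above use the same compile-time dispatch: if constexpr on TagType::getTagNodeType() keeps only the branch that is valid for that tag kind, so unsupported calls never touch the tag and instead fail hard at run time. A standalone sketch of the pattern, with invented tag types and assert standing in for UNRECOVERABLE_IF:

#include <cassert>
#include <cstdint>

enum class TagNodeType { TimestampPacket, HwTimeStamps, HwPerfCounter };

struct TimestampTag {
    static constexpr TagNodeType getTagNodeType() { return TagNodeType::TimestampPacket; }
    uint32_t getPacketsUsed() const { return packetsUsed; }
    uint32_t packetsUsed = 1;
};

struct PerfCounterTag {
    static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwPerfCounter; }
};

template <typename TagType>
uint32_t packetsUsed(const TagType &tag) {
    if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) {
        return tag.getPacketsUsed();  // this branch is compiled only for timestamp tags
    } else {
        assert(false && "not supported for this tag kind");  // analogue of UNRECOVERABLE_IF(true)
        return 0;
    }
}

int main() {
    TimestampTag ts;
    return packetsUsed(ts) == 1 ? 0 : 1;  // PerfCounterTag would hit the assert instead
}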