diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl
index bba5997e6b..017d6bebc1 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -230,7 +230,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
     uint64_t baseAddr = event->getGpuAddress();
     size_t eventOffset = 0;
     if (event->isTimestampEvent) {
-        eventOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
+        eventOffset = offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextEnd);
         event->resetPackets();
     }
     commandContainer.addToResidencyContainer(&event->getAllocation());
@@ -1487,7 +1487,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
     uint64_t baseAddr = event->getGpuAddress();
     size_t eventSignalOffset = 0;
     if (event->isTimestampEvent) {
-        eventSignalOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
+        eventSignalOffset = offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextEnd);
     }
 
     if (isCopyOnly()) {
@@ -1536,7 +1536,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
 
         gpuAddr = event->getGpuAddress();
         if (event->isTimestampEvent) {
-            gpuAddr += offsetof(TimestampPacketStorage::Packet, contextEnd);
+            gpuAddr += offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextEnd);
         }
         NEO::EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
                                                                    gpuAddr,
@@ -1577,8 +1577,8 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_h
     auto event = Event::fromHandle(hEvent);
 
     auto baseAddr = event->getGpuAddress();
-    auto contextOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, contextStart) : offsetof(TimestampPacketStorage::Packet, contextEnd);
-    auto globalOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, globalStart) : offsetof(TimestampPacketStorage::Packet, globalEnd);
+    auto contextOffset = beforeWalker ? offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextStart) : offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextEnd);
+    auto globalOffset = beforeWalker ? offsetof(NEO::TimestampPackets<uint32_t>::Packet, globalStart) : offsetof(NEO::TimestampPackets<uint32_t>::Packet, globalEnd);
 
     if (maskLsb) {
         NEO::EncodeMathMMIO<GfxFamily>::encodeBitwiseAndVal(commandContainer, REG_GLOBAL_TIMESTAMP_LDW, mask, ptrOffset(baseAddr, globalOffset));
diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp
index 6f3271e547..3ca3ffe375 100644
--- a/level_zero/core/source/event/event.cpp
+++ b/level_zero/core/source/event/event.cpp
@@ -117,7 +117,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
 
     if (eventPool->isEventPoolUsedForTimestamp) {
         event->isTimestampEvent = true;
-        event->timestampsData = std::make_unique<TimestampPacketStorage>();
+        event->timestampsData = std::make_unique<NEO::TimestampPackets<uint32_t>>();
     }
 
     auto alloc = eventPool->getAllocation().getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
@@ -140,7 +140,7 @@ NEO::GraphicsAllocation &Event::getAllocation() {
 }
 
 uint64_t Event::getTimestampPacketAddress() {
-    return gpuAddress + packetsInUse * sizeof(TimestampPacketStorage::Packet);
+    return gpuAddress + packetsInUse * sizeof(NEO::TimestampPackets<uint32_t>::Packet);
 }
 
 ze_result_t EventImp::calculateProfilingData() {
@@ -172,7 +172,7 @@ void EventImp::assignTimestampData(void *address) {
 
     for (uint32_t i = 0; i < packetsToCopy; i++) {
         timestampsData->assignDataToAllTimestamps(i, address);
-        address = ptrOffset(address, sizeof(struct TimestampPacketStorage::Packet));
+        address = ptrOffset(address, sizeof(struct NEO::TimestampPackets<uint32_t>::Packet));
     }
 }
 
@@ -190,7 +190,7 @@ ze_result_t EventImp::queryStatus() {
     this->csr->downloadAllocations();
     if (isTimestampEvent) {
         auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
-        auto timeStampAddress = baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd);
+        auto timeStampAddress = baseAddr + offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextEnd);
         hostAddr = reinterpret_cast<uint64_t *>(timeStampAddress);
     }
     memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
@@ -212,11 +212,11 @@ ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) {
     };
 
     for (uint32_t i = 0; i < NEO::TimestampPacketSizeControl::preferredPacketCount; i++) {
-        eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextStart));
-        eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalStart));
-        eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd));
-        eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalEnd));
-        baseAddr += sizeof(struct TimestampPacketStorage::Packet);
+        eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextStart));
+        eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets<uint32_t>::Packet, globalStart));
+        eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets<uint32_t>::Packet, contextEnd));
+        eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets<uint32_t>::Packet, globalEnd));
+        baseAddr += sizeof(struct NEO::TimestampPackets<uint32_t>::Packet);
     }
     assignTimestampData(hostAddress);
 
diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h
index 0732176d45..7039a32595 100644
--- a/level_zero/core/source/event/event.h
+++ b/level_zero/core/source/event/event.h
@@ -22,7 +22,6 @@ namespace L0 {
 typedef uint64_t FlushStamp;
 struct EventPool;
 struct MetricStreamer;
-using TimestampPacketStorage = NEO::TimestampPackets<uint32_t>;
 
 struct Event : _ze_event_handle_t {
     virtual ~Event() = default;
@@ -61,7 +60,7 @@ struct Event : _ze_event_handle_t {
     ze_event_scope_flags_t waitScope = 0u;
     bool isTimestampEvent = false;
 
-    std::unique_ptr<TimestampPacketStorage> timestampsData = nullptr;
+    std::unique_ptr<NEO::TimestampPackets<uint32_t>> timestampsData = nullptr;
     uint64_t globalStartTS;
     uint64_t globalEndTS;
     uint64_t contextStartTS;
@@ -158,7 +157,7 @@ struct EventPoolImp : public EventPool {
     size_t numEvents;
 
   protected:
-    const uint32_t eventSize = static_cast<uint32_t>(alignUp(NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct TimestampPacketStorage::Packet),
+    const uint32_t eventSize = static_cast<uint32_t>(alignUp(NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct NEO::TimestampPackets<uint32_t>::Packet),
                                                              MemoryConstants::cacheLineSize));
     const uint32_t eventAlignment = MemoryConstants::cacheLineSize;
 };
diff --git a/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp b/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp
index b681e4079e..49a21f4b39 100644
--- a/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp
+++ b/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp
@@ -42,7 +42,7 @@ struct TimestampEvent : public Test<DeviceFixture> {
 };
 
 GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCorrectDataAreSet) {
-    TimestampPacketStorage::Packet data = {};
+    TimestampPackets<uint32_t>::Packet data = {};
     data.contextStart = 1u;
     data.contextEnd = 2u;
     data.globalStart = 3u;
@@ -61,7 +61,7 @@ GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCo
 }
 
 GEN12LPTEST_F(TimestampEvent, givenEventMoreThanOneTimestampsPacketWhenQueryKernelTimestampThenCorrectCalculationAreMade) {
-    TimestampPacketStorage::Packet data[3] = {};
+    TimestampPackets<uint32_t>::Packet data[3] = {};
     data[0].contextStart = 3u;
     data[0].contextEnd = 4u;
     data[0].globalStart = 5u;
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp
index 97998190b7..847a65abdd 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp
@@ -1041,8 +1041,8 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingBeforeCommandForCo
     auto event = std::unique_ptr<L0::Event>(L0::Event::create(eventPool.get(), &eventDesc, device));
 
     auto baseAddr = event->getGpuAddress();
-    auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextStart);
-    auto globalOffset = offsetof(TimestampPacketStorage::Packet, globalStart);
+    auto contextOffset = offsetof(TimestampPackets<uint32_t>::Packet, contextStart);
+    auto globalOffset = offsetof(TimestampPackets<uint32_t>::Packet, globalStart);
     EXPECT_EQ(event->getTimestampPacketAddress(), baseAddr);
 
     commandList->appendEventForProfilingCopyCommand(event->toHandle(), true);
@@ -1078,8 +1078,8 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingAfterCommandForCop
 
     commandList->appendEventForProfilingCopyCommand(event->toHandle(), false);
 
-    auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
-    auto globalOffset = offsetof(TimestampPacketStorage::Packet, globalEnd);
+    auto contextOffset = offsetof(TimestampPackets<uint32_t>::Packet, contextEnd);
+    auto globalOffset = offsetof(TimestampPackets<uint32_t>::Packet, globalEnd);
     auto baseAddr = event->getGpuAddress();
     GenCmdList cmdList;
     ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp
index 6b73c47bef..cf9c994775 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp
@@ -138,7 +138,7 @@ HWTEST2_F(CommandListAppendEventReset, givenTimestampEventUsedInResetThenPipeCon
 
     commandList->appendEventReset(event->toHandle());
     ASSERT_EQ(0u, event->getPacketsInUse());
-    auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
+    auto contextOffset = offsetof(TimestampPackets<uint32_t>::Packet, contextEnd);
     auto baseAddr = event->getGpuAddress();
     auto gpuAddress = ptrOffset(baseAddr, contextOffset);
 
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp
index d6f99e6a4e..9aebbca51b 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp
@@ -201,7 +201,7 @@ HWTEST2_F(CommandListAppendSignalEvent, givenTimestampEventUsedInSignalThenPipeC
     auto event = std::unique_ptr<L0::Event>(L0::Event::create(eventPool.get(), &eventDesc, device));
 
     commandList->appendSignalEvent(event->toHandle());
-    auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
+    auto contextOffset = offsetof(TimestampPackets<uint32_t>::Packet, contextEnd);
     auto baseAddr = event->getGpuAddress();
     auto gpuAddress = ptrOffset(baseAddr, contextOffset);
 
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp
index a0f64cf247..b32673e920 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp
@@ -205,10 +205,10 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyR
     GenCmdList cmdList;
 
     auto baseAddr = event->getGpuAddress();
-    auto contextStartOffset = offsetof(TimestampPacketStorage::Packet, contextStart);
-    auto globalStartOffset = offsetof(TimestampPacketStorage::Packet, globalStart);
-    auto contextEndOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
-    auto globalEndOffset = offsetof(TimestampPacketStorage::Packet, globalEnd);
+    auto contextStartOffset = offsetof(TimestampPackets<uint32_t>::Packet, contextStart);
+    auto globalStartOffset = offsetof(TimestampPackets<uint32_t>::Packet, globalStart);
+    auto contextEndOffset = offsetof(TimestampPackets<uint32_t>::Packet, contextEnd);
+    auto globalEndOffset = offsetof(TimestampPackets<uint32_t>::Packet, globalEnd);
 
     ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
         cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));
diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp
index a11e8bfa45..6a5aa9bda2 100644
--- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp
+++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp
@@ -46,7 +46,7 @@ TEST_F(EventPoolCreate, givenTimestampEventsThenEventSizeSufficientForAllKernelT
 
     std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc));
     ASSERT_NE(nullptr, eventPool);
-    uint32_t packetsSize = NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct TimestampPacketStorage::Packet);
+    uint32_t packetsSize = NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct TimestampPackets<uint32_t>::Packet);
     uint32_t kernelTimestampsSize = static_cast<uint32_t>(alignUp(packetsSize, MemoryConstants::cacheLineSize));
     EXPECT_EQ(kernelTimestampsSize, eventPool->getEventSize());
 }
@@ -364,12 +364,12 @@ TEST_F(TimestampEventCreate, givenEventTimestampWhenPacketCountIsIncreasedThenCo
     event->increasePacketsInUse();
     EXPECT_EQ(1u, event->getPacketsInUse());
 
-    gpuAddr += sizeof(TimestampPacketStorage::Packet);
+    gpuAddr += sizeof(TimestampPackets<uint32_t>::Packet);
     EXPECT_EQ(gpuAddr, event->getTimestampPacketAddress());
 }
 
 HWCMDTEST_F(IGFX_GEN9_CORE, TimestampEventCreate, givenEventTimestampsWhenQueryKernelTimestampThenCorrectDataAreSet) {
-    TimestampPacketStorage::Packet data = {};
+    TimestampPackets<uint32_t>::Packet data = {};
     data.contextStart = 1u;
     data.contextEnd = 2u;
     data.globalStart = 3u;
diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h
index 9cc010dc07..703c7637f2 100644
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -472,14 +472,14 @@ class CommandQueueHw : public CommandQueue {
                                                    size_t hostSlicePitch);
     void processDeviceEnqueue(DeviceQueueHw<GfxFamily> *devQueueHw,
                               const MultiDispatchInfo &multiDispatchInfo,
-                              TagNode<HwTimeStamps> *hwTimeStamps,
+                              TagNodeBase *hwTimeStamps,
                               bool &blocking);
 
     template <uint32_t commandType>
     void processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
                                    std::unique_ptr<PrintfHandler> &printfHandler,
                                    Event *event,
-                                   TagNode<NEO::HwTimeStamps> *&hwTimeStamps,
+                                   TagNodeBase *&hwTimeStamps,
                                    bool blockQueue,
                                    DeviceQueueHw<GfxFamily> *devQueueHw,
                                    CsrDependencies &csrDeps,
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 8054ce059b..bbd1d1d81a 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -174,7 +174,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
     auto clearAllDependencies = queueDependenciesClearRequired();
 
-    TagNode<HwTimeStamps> *hwTimeStamps = nullptr;
+    TagNodeBase *hwTimeStamps = nullptr;
 
     auto commandStreamRecieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
 
@@ -385,13 +385,13 @@ template <uint32_t commandType>
 void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
                                                           std::unique_ptr<PrintfHandler> &printfHandler,
                                                           Event *event,
-                                                          TagNode<HwTimeStamps> *&hwTimeStamps,
+                                                          TagNodeBase *&hwTimeStamps,
                                                           bool blockQueue,
                                                           DeviceQueueHw<GfxFamily> *devQueueHw,
                                                           CsrDependencies &csrDeps,
                                                           KernelOperation *blockedCommandsData,
                                                           TimestampPacketDependencies &timestampPacketDependencies) {
-    TagNode<HwPerfCounter> *hwPerfCounter = nullptr;
+    TagNodeBase *hwPerfCounter = nullptr;
     FileLoggerInstance().dumpKernelArgs(&multiDispatchInfo);
 
     printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
@@ -565,7 +565,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
 template <typename GfxFamily>
 void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *devQueueHw,
                                                      const MultiDispatchInfo &multiDispatchInfo,
-                                                     TagNode<HwTimeStamps> *hwTimeStamps,
+                                                     TagNodeBase *hwTimeStamps,
                                                      bool &blocking) {
     auto parentKernel = multiDispatchInfo.peekParentKernel();
     size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h
index fecce241fe..b433472cbe 100644
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@@ -118,29 +118,29 @@ class GpgpuWalkerHelper {
         uint32_t requiredWorkgroupOrder);
 
     static void dispatchProfilingCommandsStart(
-        TagNode<HwTimeStamps> &hwTimeStamps,
+        TagNodeBase &hwTimeStamps,
         LinearStream *commandStream,
         const HardwareInfo &hwInfo);
 
     static void dispatchProfilingCommandsEnd(
-        TagNode<HwTimeStamps> &hwTimeStamps,
+        TagNodeBase &hwTimeStamps,
         LinearStream *commandStream,
         const HardwareInfo &hwInfo);
 
     static void dispatchPerfCountersCommandsStart(
         CommandQueue &commandQueue,
-        TagNode<HwPerfCounter> &hwPerfCounter,
+        TagNodeBase &hwPerfCounter,
         LinearStream *commandStream);
 
     static void dispatchPerfCountersCommandsEnd(
         CommandQueue &commandQueue,
-        TagNode<HwPerfCounter> &hwPerfCounter,
+        TagNodeBase &hwPerfCounter,
         LinearStream *commandStream);
 
     static void setupTimestampPacket(
         LinearStream *cmdStream,
         WALKER_TYPE<GfxFamily> *walkerCmd,
-        TagNode<TimestampPacketStorage> *timestampPacketNode,
+        TagNodeBase *timestampPacketNode,
         const RootDeviceEnvironment &rootDeviceEnvironment);
 
     static void dispatchScheduler(
diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl
index 769448fcf9..88e258d930 100644
--- a/opencl/source/command_queue/gpgpu_walker_base.inl
+++ b/opencl/source/command_queue/gpgpu_walker_base.inl
@@ -133,7 +133,7 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
     CommandQueue &commandQueue,
-    TagNode<HwPerfCounter> &hwPerfCounter,
+    TagNodeBase &hwPerfCounter,
     LinearStream *commandStream) {
 
     const auto pPerformanceCounters = commandQueue.getPerfCounters();
@@ -149,7 +149,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
     CommandQueue &commandQueue,
-    TagNode<HwPerfCounter> &hwPerfCounter,
+    TagNodeBase &hwPerfCounter,
     LinearStream *commandStream) {
 
     const auto pPerformanceCounters = commandQueue.getPerfCounters();
diff --git a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
index 0522bf19cc..8a4e6702a6 100644
--- a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
@@ -167,7 +167,7 @@ template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
     LinearStream *cmdStream,
     WALKER_TYPE<GfxFamily> *walkerCmd,
-    TagNode<TimestampPacketStorage> *timestampPacketNode,
+    TagNodeBase *timestampPacketNode,
     const RootDeviceEnvironment &rootDeviceEnvironment) {
 
     uint64_t address = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode);
@@ -210,7 +210,7 @@ void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxF
 
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
-    TagNode<HwTimeStamps> &hwTimeStamps,
+    TagNodeBase &hwTimeStamps,
     LinearStream *commandStream,
     const HardwareInfo &hwInfo) {
     using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
@@ -242,7 +242,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
 
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
-    TagNode<HwTimeStamps> &hwTimeStamps,
+    TagNodeBase &hwTimeStamps,
     LinearStream *commandStream,
     const HardwareInfo &hwInfo) {
     using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h
index 05c0dd1519..1e989932cc 100644
--- a/opencl/source/command_queue/hardware_interface.h
+++ b/opencl/source/command_queue/hardware_interface.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -19,13 +19,13 @@ class DispatchInfo;
 class IndirectHeap;
 class Kernel;
 class LinearStream;
-struct HwPerfCounter;
-struct HwTimeStamps;
+class HwPerfCounter;
+class HwTimeStamps;
 struct KernelOperation;
 struct MultiDispatchInfo;
 
 template <class T>
-struct TagNode;
+class TagNode;
 
 template <typename GfxFamily>
 using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
@@ -40,8 +40,8 @@ class HardwareInterface {
         const MultiDispatchInfo &multiDispatchInfo,
         const CsrDependencies &csrDependencies,
         KernelOperation *blockedCommandsData,
-        TagNode<HwTimeStamps> *hwTimeStamps,
-        TagNode<HwPerfCounter> *hwPerfCounter,
+        TagNodeBase *hwTimeStamps,
+        TagNodeBase *hwPerfCounter,
         TimestampPacketDependencies *timestampPacketDependencies,
         TimestampPacketContainer *currentTimestampPacketNodes,
         uint32_t commandType);
@@ -62,14 +62,14 @@ class HardwareInterface {
         const bool &enable);
 
     static void dispatchProfilingPerfStartCommands(
-        TagNode<HwTimeStamps> *hwTimeStamps,
-        TagNode<HwPerfCounter> *hwPerfCounter,
+        TagNodeBase *hwTimeStamps,
+        TagNodeBase *hwPerfCounter,
         LinearStream *commandStream,
         CommandQueue &commandQueue);
 
     static void dispatchProfilingPerfEndCommands(
-        TagNode<HwTimeStamps> *hwTimeStamps,
-        TagNode<HwPerfCounter> *hwPerfCounter,
+        TagNodeBase *hwTimeStamps,
+        TagNodeBase *hwPerfCounter,
         LinearStream *commandStream,
         CommandQueue &commandQueue);
 
diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl
index 400bc83c97..469b978452 100644
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@@ -30,8 +30,8 @@ inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace
 
 template <typename GfxFamily>
 inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
-    TagNode<HwTimeStamps> *hwTimeStamps,
-    TagNode<HwPerfCounter> *hwPerfCounter,
+    TagNodeBase *hwTimeStamps,
+    TagNodeBase *hwPerfCounter,
     LinearStream *commandStream,
     CommandQueue &commandQueue) {
 
@@ -46,8 +46,8 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
 
 template <typename GfxFamily>
 inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
-    TagNode<HwTimeStamps> *hwTimeStamps,
-    TagNode<HwPerfCounter> *hwPerfCounter,
+    TagNodeBase *hwTimeStamps,
+    TagNodeBase *hwPerfCounter,
     LinearStream *commandStream,
     CommandQueue &commandQueue) {
 
@@ -66,8 +66,8 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
     const MultiDispatchInfo &multiDispatchInfo,
     const CsrDependencies &csrDependencies,
     KernelOperation *blockedCommandsData,
-    TagNode<HwTimeStamps> *hwTimeStamps,
-    TagNode<HwPerfCounter> *hwPerfCounter,
+    TagNodeBase *hwTimeStamps,
+    TagNodeBase *hwPerfCounter,
     TimestampPacketDependencies *timestampPacketDependencies,
     TimestampPacketContainer *currentTimestampPacketNodes,
     uint32_t commandType) {
diff --git a/opencl/source/device_queue/device_queue.cpp b/opencl/source/device_queue/device_queue.cpp
index 8c2254c935..58c790e6cf 100644
--- a/opencl/source/device_queue/device_queue.cpp
+++ b/opencl/source/device_queue/device_queue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -169,7 +169,7 @@ void DeviceQueue::initDeviceQueue() {
 }
 
 void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel,
-                                              uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp, bool isCcsUsed) {
+                                              uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNodeBase *hwTimeStamp, bool isCcsUsed) {
     setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount, isCcsUsed);
     addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount);
 }
@@ -178,7 +178,7 @@ void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHea
     return;
 }
 
-void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) {
+void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) {
     return;
 }
 
diff --git a/opencl/source/device_queue/device_queue.h b/opencl/source/device_queue/device_queue.h
index ebd2fce080..ed48f03628 100644
--- a/opencl/source/device_queue/device_queue.h
+++ b/opencl/source/device_queue/device_queue.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,8 @@ class Kernel;
 class Event;
 struct MultiDispatchInfo;
 class SchedulerKernel;
-struct HwTimeStamps;
-template <class T>
-struct TagNode;
+class HwTimeStamps;
+class TagNodeBase;
 
 template <>
 struct OpenCLObjectMapper<_device_queue> {
@@ -72,10 +71,10 @@ class DeviceQueue : public BaseObject<_device_queue> {
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet);
 
-    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp, bool isCcsUsed);
+    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNodeBase *hwTimeStamp, bool isCcsUsed);
 
     virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed);
-    virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount);
+    virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount);
 
     MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() {
         auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
diff --git a/opencl/source/device_queue/device_queue_hw.h b/opencl/source/device_queue/device_queue_hw.h
index b60edf0d14..72469e3b93 100644
--- a/opencl/source/device_queue/device_queue_hw.h
+++ b/opencl/source/device_queue/device_queue_hw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@ class DeviceQueueHw : public DeviceQueue {
 
     void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) override;
 
-    void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override;
+    void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override;
     void resetDeviceQueue() override;
     void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) override;
 
diff --git a/opencl/source/device_queue/device_queue_hw_base.inl b/opencl/source/device_queue/device_queue_hw_base.inl
index 2534019ac3..8c0d9ac567 100644
--- a/opencl/source/device_queue/device_queue_hw_base.inl
+++ b/opencl/source/device_queue/device_queue_hw_base.inl
@@ -103,7 +103,7 @@ void DeviceQueueHw<GfxFamily>::initPipeControl(PIPE_CONTROL *pc) {
 }
 
 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) {
+void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) {
     // CleanUp Section
     auto offset = slbCS.getUsed();
     auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset;
diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp
index 585bba9e1d..60a3b1c00c 100644
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -129,8 +129,8 @@ Event::~Event() {
             timeStampNode->returnTag();
         }
         if (perfCounterNode != nullptr) {
-            cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->tagForCpuAccess->query.handle);
-            perfCounterNode->tagForCpuAccess->query.handle = {};
+            cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->getQueryHandleRef());
+            perfCounterNode->getQueryHandleRef() = {};
             perfCounterNode->returnTag();
         }
         cmdQueue->decRefInternal();
@@ -258,12 +258,12 @@ bool Event::calcProfilingData() {
                 for (auto i = 0u; i < timestamps.size(); i++) {
                     std::cout << "Timestamp " << i << ", "
                               << "profiling capable: " << timestamps[i]->isProfilingCapable() << ", ";
-                    for (auto j = 0u; j < timestamps[i]->tagForCpuAccess->getPacketsUsed(); j++) {
+                    for (auto j = 0u; j < timestamps[i]->getPacketsUsed(); j++) {
                         std::cout << "packet " << j << ": "
-                                  << "global start: " << timestamps[i]->tagForCpuAccess->getGlobalStartValue(j) << ", "
-                                  << "global end: " << timestamps[i]->tagForCpuAccess->getGlobalEndValue(j) << ", "
-                                  << "context start: " << timestamps[i]->tagForCpuAccess->getContextStartValue(j) << ", "
-                                  << "context end: " << timestamps[i]->tagForCpuAccess->getContextEndValue(j) << std::endl;
+                                  << "global start: " << timestamps[i]->getGlobalStartValue(j) << ", "
+                                  << "global end: " << timestamps[i]->getGlobalEndValue(j) << ", "
+                                  << "context start: " << timestamps[i]->getContextStartValue(j) << ", "
+                                  << "context end: " << timestamps[i]->getContextEndValue(j) << std::endl;
                     }
                 }
             }
@@ -277,16 +277,16 @@ bool Event::calcProfilingData() {
         } else if (timeStampNode) {
             if (HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily).useOnlyGlobalTimestamps()) {
                 calculateProfilingDataInternal(
-                    timeStampNode->tagForCpuAccess->GlobalStartTS,
-                    timeStampNode->tagForCpuAccess->GlobalEndTS,
-                    &timeStampNode->tagForCpuAccess->GlobalEndTS,
-                    timeStampNode->tagForCpuAccess->GlobalStartTS);
+                    timeStampNode->getGlobalStartValue(0),
+                    timeStampNode->getGlobalEndValue(0),
+                    &timeStampNode->getGlobalEndRef(),
+                    timeStampNode->getGlobalStartValue(0));
             } else {
                 calculateProfilingDataInternal(
-                    timeStampNode->tagForCpuAccess->ContextStartTS,
-                    timeStampNode->tagForCpuAccess->ContextEndTS,
-                    &timeStampNode->tagForCpuAccess->ContextCompleteTS,
-                    timeStampNode->tagForCpuAccess->GlobalStartTS);
+                    timeStampNode->getContextStartValue(0),
+                    timeStampNode->getContextEndValue(0),
+                    &timeStampNode->getContextCompleteRef(),
+                    timeStampNode->getGlobalStartValue(0));
             }
         }
     }
@@ -346,19 +346,19 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
 void Event::getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS) {
     const auto timestamps = timestampContainer->peekNodes();
 
-    globalStartTS = timestamps[0]->tagForCpuAccess->getGlobalStartValue(0);
-    globalEndTS = timestamps[0]->tagForCpuAccess->getGlobalEndValue(0);
+    globalStartTS = timestamps[0]->getGlobalStartValue(0);
+    globalEndTS = timestamps[0]->getGlobalEndValue(0);
 
     for (const auto &timestamp : timestamps) {
         if (!timestamp->isProfilingCapable()) {
             continue;
         }
-        for (auto i = 0u; i < timestamp->tagForCpuAccess->getPacketsUsed(); ++i) {
-            if (globalStartTS > timestamp->tagForCpuAccess->getGlobalStartValue(i)) {
-                globalStartTS = timestamp->tagForCpuAccess->getGlobalStartValue(i);
+        for (auto i = 0u; i < timestamp->getPacketsUsed(); ++i) {
+            if (globalStartTS > timestamp->getGlobalStartValue(i)) {
+                globalStartTS = timestamp->getGlobalStartValue(i);
             }
-            if (globalEndTS < timestamp->tagForCpuAccess->getGlobalEndValue(i)) {
-                globalEndTS = timestamp->tagForCpuAccess->getGlobalEndValue(i);
+            if (globalEndTS < timestamp->getGlobalEndValue(i)) {
+                globalEndTS = timestamp->getGlobalEndValue(i);
             }
         }
     }
@@ -734,14 +734,14 @@ void Event::setEndTimeStamp() {
     }
 }
 
-TagNode<HwTimeStamps> *Event::getHwTimeStampNode() {
+TagNodeBase *Event::getHwTimeStampNode() {
     if (!cmdQueue->getTimestampPacketContainer() && !timeStampNode) {
         timeStampNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag();
     }
     return timeStampNode;
 }
 
-TagNode<HwPerfCounter> *Event::getHwPerfCounterNode() {
+TagNodeBase *Event::getHwPerfCounterNode() {
 
     if (!perfCounterNode && cmdQueue->getPerfCounters()) {
         const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters()));
diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h
index 7d59d92b7c..917c6c3d56 100644
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@@ -24,7 +24,7 @@
 
 namespace NEO {
 template <typename TagType>
-struct TagNode;
+class TagNode;
 class CommandQueue;
 class Context;
 class Device;
@@ -106,7 +106,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
 
     void setProfilingEnabled(bool profilingEnabled) { this->profilingEnabled = profilingEnabled; }
 
-    TagNode<HwTimeStamps> *getHwTimeStampNode();
+    TagNodeBase *getHwTimeStampNode();
 
     void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer);
     TimestampPacketContainer *getTimestampPacketNodes() const;
@@ -119,7 +119,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
         this->perfCountersEnabled = perfCountersEnabled;
     }
 
-    TagNode<HwPerfCounter> *getHwPerfCounterNode();
+    TagNodeBase *getHwPerfCounterNode();
 
     std::unique_ptr<FlushStampTracker> flushStamp;
     std::atomic<uint32_t> taskLevel;
@@ -372,8 +372,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     uint64_t completeTimeStamp;
     uint32_t bcsTaskCount = 0;
     bool perfCountersEnabled;
-    TagNode<HwTimeStamps> *timeStampNode = nullptr;
-    TagNode<HwPerfCounter> *perfCounterNode = nullptr;
+    TagNodeBase *timeStampNode = nullptr;
+    TagNodeBase *perfCounterNode = nullptr;
     std::unique_ptr<TimestampPacketContainer> timestampPacketContainer;
     //number of events this event depends on
     std::atomic<int> parentCount;
diff --git a/opencl/source/event/hw_timestamps.h b/opencl/source/event/hw_timestamps.h
index 1c3c553dc1..2a8fef6bcd 100644
--- a/opencl/source/event/hw_timestamps.h
+++ b/opencl/source/event/hw_timestamps.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -12,7 +12,8 @@
 
 namespace NEO {
 
-struct HwTimeStamps {
+class HwTimeStamps : public TagTypeBase {
+  public:
     void initialize() {
         GlobalStartTS = 0;
         ContextStartTS = 0;
@@ -21,12 +22,18 @@ struct HwTimeStamps {
         GlobalCompleteTS = 0;
         ContextCompleteTS = 0;
     }
-    bool isCompleted() const { return true; }
-    uint32_t getImplicitGpuDependenciesCount() const { return 0; }
 
-    static GraphicsAllocation::AllocationType getAllocationType() {
+    static constexpr GraphicsAllocation::AllocationType getAllocationType() {
         return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER;
     }
+
+    static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwTimeStamps; }
+
+    uint64_t getContextStartValue(uint32_t) const { return ContextStartTS; }
+    uint64_t getGlobalStartValue(uint32_t) const { return GlobalStartTS; }
+    uint64_t getContextEndValue(uint32_t) const { return ContextEndTS; }
+    uint64_t getGlobalEndValue(uint32_t) const { return GlobalEndTS; }
+
     uint64_t GlobalStartTS;
     uint64_t ContextStartTS;
     uint64_t GlobalEndTS;
@@ -34,4 +41,7 @@ struct HwTimeStamps {
     uint64_t GlobalCompleteTS;
     uint64_t ContextCompleteTS;
 };
+
+static_assert((6 * sizeof(uint64_t)) == sizeof(HwTimeStamps),
+              "This structure is consumed by GPU and has to follow specific restrictions for padding and size");
 } // namespace NEO
diff --git a/opencl/source/event/perf_counter.h b/opencl/source/event/perf_counter.h
index e48eea4350..8ba4415fcc 100644
--- a/opencl/source/event/perf_counter.h
+++ b/opencl/source/event/perf_counter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,24 +14,24 @@
 
 namespace NEO {
 
-struct HwPerfCounter {
+class HwPerfCounter : public TagTypeBase {
+  public:
     void initialize() {
         query = {};
         report[0] = 0;
     }
 
-    static GraphicsAllocation::AllocationType getAllocationType() {
+    static constexpr GraphicsAllocation::AllocationType getAllocationType() {
         return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER;
     }
 
+    static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwPerfCounter; }
+
     template <typename Type>
     static uint32_t getSize(Type &performanceCounters) {
         return sizeof(query) + performanceCounters.getGpuReportSize();
     }
 
-    bool isCompleted() const { return true; }
-    uint32_t getImplicitGpuDependenciesCount() const { return 0; }
-
     // Gpu report size is not known during compile time.
     // Such information will be provided by metrics library dll.
-    // Bellow variable will be allocated dynamically based on information
+    // The variable below will be allocated dynamically based on information
@@ -43,4 +43,5 @@ struct HwPerfCounter {
 
     uint8_t report[1] = {};
 };
+
 } // namespace NEO
diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h
index 4c8c53714f..56a57e0647 100644
--- a/opencl/source/helpers/task_information.h
+++ b/opencl/source/helpers/task_information.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,10 +27,10 @@ class Kernel;
 class MemObj;
 class Surface;
 class PrintfHandler;
-struct HwTimeStamps;
+class HwTimeStamps;
 class TimestampPacketContainer;
 template <class T>
-struct TagNode;
+class TagNode;
 
 enum MapOperationType {
     MAP,
@@ -99,7 +99,7 @@ class Command : public IFNode<Command> {
     void setEventsRequest(EventsRequest &eventsRequest);
     void makeTimestampPacketsResident(CommandStreamReceiver &commandStreamReceiver);
 
-    TagNode<HwTimeStamps> *timestamp = nullptr;
+    TagNodeBase *timestamp = nullptr;
     CompletionStamp completionStamp = {};
 
   protected:
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 189c469acd..4667f7ae14 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -1227,7 +1227,7 @@ bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) {
 
 bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) {
     for (const auto &node : timestampContainer->peekNodes()) {
-        if (!node->tagForCpuAccess->isCompleted()) {
+        if (!node->isCompleted()) {
             return false;
         }
     }
diff --git a/opencl/source/os_interface/performance_counters.cpp b/opencl/source/os_interface/performance_counters.cpp
index 81bf73e2ee..b097afbd5d 100644
--- a/opencl/source/os_interface/performance_counters.cpp
+++ b/opencl/source/os_interface/performance_counters.cpp
@@ -140,7 +140,7 @@ void PerformanceCounters::closeMetricsLibrary() {
 //////////////////////////////////////////////////////
-// PerformanceCounters::getQueryHandle
+// PerformanceCounters::getQueryHandleRef
 //////////////////////////////////////////////////////
-void PerformanceCounters::getQueryHandle(QueryHandle_1_0 &handle) {
+void PerformanceCounters::getQueryHandleRef(QueryHandle_1_0 &handle) {
     if (!handle.IsValid()) {
         metricsLibrary->hwCountersCreate(
             context,
@@ -201,7 +201,7 @@ uint32_t PerformanceCounters::getGpuCommandsSize(
     bufferData.CommandsType = ObjectType::QueryHwCounters;
     bufferData.Type = commandBufferType;
 
-    getQueryHandle(query);
+    getQueryHandleRef(query);
 
     bufferData.QueryHwCounters.Begin = begin;
     bufferData.QueryHwCounters.Handle = query;
@@ -216,7 +216,7 @@ uint32_t PerformanceCounters::getGpuCommandsSize(
 //////////////////////////////////////////////////////
 bool PerformanceCounters::getGpuCommands(
     const MetricsLibraryApi::GpuCommandBufferType commandBufferType,
-    TagNode<HwPerfCounter> &performanceCounters,
+    TagNodeBase &performanceCounters,
     const bool begin,
     const uint32_t bufferSize,
     void *pBuffer) {
@@ -231,15 +231,15 @@ bool PerformanceCounters::getGpuCommands(
 
     // Gpu memory allocation for query hw counters.
     const uint32_t allocationOffset = offsetof(HwPerfCounter, report);
-    bufferData.Allocation.CpuAddress = reinterpret_cast<uint8_t *>(performanceCounters.tagForCpuAccess) + allocationOffset;
+    bufferData.Allocation.CpuAddress = reinterpret_cast<uint8_t *>(performanceCounters.getCpuBase()) + allocationOffset;
     bufferData.Allocation.GpuAddress = performanceCounters.getGpuAddress() + allocationOffset;
 
     // Allocate query handle for cl_event if not exists.
-    getQueryHandle(performanceCounters.tagForCpuAccess->query.handle);
+    getQueryHandleRef(performanceCounters.getQueryHandleRef());
 
     // Query hw counters specific data.
     bufferData.QueryHwCounters.Begin = begin;
-    bufferData.QueryHwCounters.Handle = performanceCounters.tagForCpuAccess->query.handle;
+    bufferData.QueryHwCounters.Handle = performanceCounters.getQueryHandleRef();
 
     return metricsLibrary->commandBufferGet(bufferData);
 }
@@ -261,7 +261,7 @@ uint32_t PerformanceCounters::getGpuReportSize() {
 //////////////////////////////////////////////////////
 // PerformanceCounters::getApiReport
 //////////////////////////////////////////////////////
-bool PerformanceCounters::getApiReport(const TagNode<HwPerfCounter> *performanceCounters, const size_t inputParamSize, void *pInputParam, size_t *pOutputParamSize, bool isEventComplete) {
+bool PerformanceCounters::getApiReport(const TagNodeBase *performanceCounters, const size_t inputParamSize, void *pInputParam, size_t *pOutputParamSize, bool isEventComplete) {
     const uint32_t outputSize = metricsLibrary->hwCountersGetApiReportSize();
 
     if (pOutputParamSize) {
@@ -272,10 +272,6 @@ bool PerformanceCounters::getApiReport(const TagNode<HwPerfCounter> *performance
         return false;
     }
 
-    if (!performanceCounters->tagForCpuAccess) {
-        return false;
-    }
-
     if (pInputParam == nullptr && inputParamSize == 0 && pOutputParamSize) {
         return true;
     }
@@ -288,6 +284,6 @@ bool PerformanceCounters::getApiReport(const TagNode<HwPerfCounter> *performance
         return false;
     }
 
-    return metricsLibrary->hwCountersGetReport(performanceCounters->tagForCpuAccess->query.handle, 0, 1, outputSize, pInputParam);
+    return metricsLibrary->hwCountersGetReport(performanceCounters->getQueryHandleRef(), 0, 1, outputSize, pInputParam);
 }
 } // namespace NEO
diff --git a/opencl/source/os_interface/performance_counters.h b/opencl/source/os_interface/performance_counters.h
index aad35f3830..365be6294d 100644
--- a/opencl/source/os_interface/performance_counters.h
+++ b/opencl/source/os_interface/performance_counters.h
@@ -16,8 +16,7 @@ namespace NEO {
 //////////////////////////////////////////////////////
 // Forward declaration.
 //////////////////////////////////////////////////////
-template <typename Node>
-struct TagNode;
+class TagNodeBase;
 class CommandQueue;
 
 //////////////////////////////////////////////////////
@@ -50,14 +49,14 @@ class PerformanceCounters {
     //////////////////////////////////////////////////////
     static uint32_t getGpuCommandsSize(CommandQueue &commandQueue, const bool reservePerfCounters);
     uint32_t getGpuCommandsSize(const MetricsLibraryApi::GpuCommandBufferType commandBufferType, const bool begin);
-    bool getGpuCommands(const MetricsLibraryApi::GpuCommandBufferType commandBufferType, TagNode<HwPerfCounter> &performanceCounters, const bool begin, const uint32_t bufferSize, void *pBuffer);
+    bool getGpuCommands(const MetricsLibraryApi::GpuCommandBufferType commandBufferType, TagNodeBase &performanceCounters, const bool begin, const uint32_t bufferSize, void *pBuffer);
 
     /////////////////////////////////////////////////////
     // Gpu/Api reports.
     /////////////////////////////////////////////////////
     uint32_t getApiReportSize();
     uint32_t getGpuReportSize();
-    bool getApiReport(const TagNode<HwPerfCounter> *performanceCounters, const size_t inputParamSize, void *pClientData, size_t *pOutputSize, bool isEventComplete);
+    bool getApiReport(const TagNodeBase *performanceCounters, const size_t inputParamSize, void *pClientData, size_t *pOutputSize, bool isEventComplete);
 
     /////////////////////////////////////////////////////
     // Metrics Library interface.
@@ -71,7 +70,7 @@ class PerformanceCounters {
     // Metrics Library context/query handles.
     /////////////////////////////////////////////////////
     ContextHandle_1_0 getMetricsLibraryContext();
-    void getQueryHandle(QueryHandle_1_0 &handle);
+    void getQueryHandleRef(QueryHandle_1_0 &handle);
     void deleteQuery(QueryHandle_1_0 &handle);
 
   protected:
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp
index 85b2081a3a..ef7a46d51c 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp
@@ -613,17 +613,17 @@ HWTEST_F(CommandStreamReceiverTest, whenCsrIsCreatedThenUseTimestampPacketWriteI
 }
 
 TEST_F(CommandStreamReceiverTest, whenGettingEventTsAllocatorThenSameTagAllocatorIsReturned) {
-    TagAllocator<HwTimeStamps> *allocator = commandStreamReceiver->getEventTsAllocator();
+    TagAllocatorBase *allocator = commandStreamReceiver->getEventTsAllocator();
     EXPECT_NE(nullptr, allocator);
-    TagAllocator<HwTimeStamps> *allocator2 = commandStreamReceiver->getEventTsAllocator();
+    TagAllocatorBase *allocator2 = commandStreamReceiver->getEventTsAllocator();
     EXPECT_EQ(allocator2, allocator);
 }
 
 TEST_F(CommandStreamReceiverTest, whenGettingEventPerfCountAllocatorThenSameTagAllocatorIsReturned) {
     const uint32_t gpuReportSize = 100;
-    TagAllocator<HwPerfCounter> *allocator = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize);
+    TagAllocatorBase *allocator = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize);
     EXPECT_NE(nullptr, allocator);
-    TagAllocator<HwPerfCounter> *allocator2 = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize);
+    TagAllocatorBase *allocator2 = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize);
     EXPECT_EQ(allocator2, allocator);
 }
 
@@ -631,11 +631,11 @@ HWTEST_F(CommandStreamReceiverTest, givenTimestampPacketAllocatorWhenAskingForTa
     auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
     EXPECT_EQ(nullptr, csr.timestampPacketAllocator.get());
 
-    TagAllocator<TimestampPacketStorage> *allocator = csr.getTimestampPacketAllocator();
+    auto allocator = static_cast<TagAllocator<TimestampPackets<uint32_t>> *>(csr.getTimestampPacketAllocator());
     EXPECT_NE(nullptr, csr.timestampPacketAllocator.get());
     EXPECT_EQ(allocator, csr.timestampPacketAllocator.get());
 
-    TagAllocator<TimestampPacketStorage> *allocator2 = csr.getTimestampPacketAllocator();
+    auto allocator2 = static_cast<TagAllocator<TimestampPackets<uint32_t>> *>(csr.getTimestampPacketAllocator());
     EXPECT_EQ(allocator, allocator2);
 
     auto node1 = allocator->getTag();
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp
index d7735235b2..90f6a11725 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp
@@ -294,10 +294,10 @@ struct CommandStreamReceiverTagTests : public ::testing::Test {
         uint32_t zeros[4] = {};
 
         for (uint32_t i = 0; i < TimestampPacketSizeControl::preferredPacketCount; i++) {
-            tag->tagForCpuAccess->assignDataToAllTimestamps(i, zeros);
+            tag->assignDataToAllTimestamps(i, zeros);
         }
 
-        EXPECT_TRUE(tag->tagForCpuAccess->isCompleted());
+        EXPECT_TRUE(tag->isCompleted());
 
         bool canBeReleased = tag->canBeReleased();
         allocator->returnTag(tag);
diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp
index a77e3b73fe..bdd8f44e77 100644
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@@ -864,10 +864,10 @@ HWTEST_F(InternalsEventWithPerfCountersTest, givenCpuProfilingPerfCountersPathWh
     pCmdQ->setPerfCountersEnabled();
     MockEvent<Event> *event = new MockEvent<Event>(pCmdQ, CL_COMMAND_MARKER, 0, 0);
     event->setCPUProfilingPath(true);
-    HwPerfCounter *perfCounter = event->getHwPerfCounterNode()->tagForCpuAccess;
+    HwPerfCounter *perfCounter = static_cast<TagNode<HwPerfCounter> *>(event->getHwPerfCounterNode())->tagForCpuAccess;
     ASSERT_NE(nullptr, perfCounter);
 
-    auto hwTimeStampNode = event->getHwTimeStampNode();
+    auto hwTimeStampNode = static_cast<TagNode<HwTimeStamps> *>(event->getHwTimeStampNode());
     if (pCmdQ->getTimestampPacketContainer()) {
         EXPECT_EQ(nullptr, hwTimeStampNode);
     } else {
@@ -1122,7 +1122,7 @@ HWTEST_F(EventTest, WhenGettingHwTimeStampsThenValidPointerIsReturned) {
     std::unique_ptr<Event> event(new Event(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0));
     ASSERT_NE(nullptr, event);
 
-    HwTimeStamps *timeStamps = event->getHwTimeStampNode()->tagForCpuAccess;
+    HwTimeStamps *timeStamps = static_cast<TagNode<HwTimeStamps> *>(event->getHwTimeStampNode())->tagForCpuAccess;
     ASSERT_NE(nullptr, timeStamps);
 
     //this should not cause any heap corruptions
@@ -1133,9 +1133,9 @@ HWTEST_F(EventTest, WhenGettingHwTimeStampsThenValidPointerIsReturned) {
     ASSERT_EQ(0ULL, timeStamps->GlobalCompleteTS);
     ASSERT_EQ(0ULL, timeStamps->ContextCompleteTS);
 
-    EXPECT_TRUE(timeStamps->isCompleted());
+    EXPECT_TRUE(event->getHwTimeStampNode()->isCompleted());
 
-    HwTimeStamps *timeStamps2 = event->getHwTimeStampNode()->tagForCpuAccess;
+    HwTimeStamps *timeStamps2 = static_cast<TagNode<HwTimeStamps> *>(event->getHwTimeStampNode())->tagForCpuAccess;
     ASSERT_EQ(timeStamps, timeStamps2);
 }
 
@@ -1165,7 +1165,7 @@ HWTEST_F(EventTest, WhenEventIsCreatedThenHwTimeStampsMemoryIsPlacedInGraphicsAl
     std::unique_ptr<Event> event(new Event(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0));
     ASSERT_NE(nullptr, event);
 
-    HwTimeStamps *timeStamps = event->getHwTimeStampNode()->tagForCpuAccess;
+    HwTimeStamps *timeStamps = static_cast<TagNode<HwTimeStamps> *>(event->getHwTimeStampNode())->tagForCpuAccess;
     ASSERT_NE(nullptr, timeStamps);
 
     GraphicsAllocation *allocation = event->getHwTimeStampNode()->getBaseGraphicsAllocation();
diff --git a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
index 62f6c5214d..e4121e62e7 100644
--- a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
@@ -58,9 +58,12 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw<GfxFami
         indirectStateSetup = true;
         return BaseClass::setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentIDCount, isCcsUsed);
     }
-    void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override {
+    void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override {
         cleanupSectionAdded = true;
-        timestampAddedInCleanupSection = hwTimeStamp ? hwTimeStamp->tagForCpuAccess : nullptr;
+
+        auto hwTimestampT = static_cast<TagNode<HwTimeStamps> *>(hwTimeStamp);
+
+        timestampAddedInCleanupSection = hwTimestampT ? hwTimestampT->tagForCpuAccess : nullptr;
         return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount);
     }
     void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) override {
@@ -253,7 +256,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedParentK
     std::vector<Surface *> surfaces;
     auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
 
-    auto timestamp = pCmdQ->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag();
+    auto timestamp = static_cast<TagNode<HwTimeStamps> *>(pCmdQ->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag());
     cmdComputeKernel->timestamp = timestamp;
     cmdComputeKernel->submit(0, false);
 
diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp
index 72ce050702..da1dfde1da 100644
--- a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp
+++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp
@@ -36,31 +36,30 @@
 using namespace NEO;
 
 struct TimestampPacketSimpleTests : public ::testing::Test {
-    class MockTimestampPacketStorage : public TimestampPacketStorage {
+    class MockTimestampPacketStorage : public TimestampPackets<uint32_t> {
       public:
-        using TimestampPacketStorage::implicitGpuDependenciesCount;
-        using TimestampPacketStorage::packets;
+        using TimestampPackets<uint32_t>::implicitGpuDependenciesCount;
+        using TimestampPackets<uint32_t>::packets;
     };
 
-    template <typename TimestampPacketStorageT>
-    void setTagToReadyState(TagNode<TimestampPacketStorageT> *tagNode) {
-        auto packetsUsed = tagNode->tagForCpuAccess->getPacketsUsed();
+    void setTagToReadyState(TagNodeBase *tagNode) {
+        auto packetsUsed = tagNode->getPacketsUsed();
         tagNode->initialize();
 
         uint32_t zeros[4] = {};
 
         for (uint32_t i = 0; i < TimestampPacketSizeControl::preferredPacketCount; i++) {
-            tagNode->tagForCpuAccess->assignDataToAllTimestamps(i, zeros);
+            tagNode->assignDataToAllTimestamps(i, zeros);
         }
-        tagNode->tagForCpuAccess->setPacketsUsed(packetsUsed);
+        tagNode->setPacketsUsed(packetsUsed);
     }
 
     const size_t gws[3] = {1, 1, 1};
 };
 
 struct TimestampPacketTests : public TimestampPacketSimpleTests {
-    struct MockTagNode : public TagNode<TimestampPacketStorage> {
-        using TagNode<TimestampPacketStorage>::gpuAddress;
+    struct MockTagNode : public TagNode<TimestampPackets<uint32_t>> {
+        using TagNode<TimestampPackets<uint32_t>>::gpuAddress;
     };
 
     void SetUp() override {
@@ -83,19 +82,19 @@ struct TimestampPacketTests : public TimestampPacketSimpleTests {
     }
 
     template <typename MI_SEMAPHORE_WAIT>
-    void verifySemaphore(MI_SEMAPHORE_WAIT *semaphoreCmd, TagNode<TimestampPacketStorage> *timestampPacketNode, uint32_t packetId) {
+    void verifySemaphore(MI_SEMAPHORE_WAIT *semaphoreCmd, TagNodeBase *timestampPacketNode, uint32_t packetId) {
         EXPECT_NE(nullptr, semaphoreCmd);
         EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
         EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
 
-        uint64_t compareOffset = packetId * sizeof(TimestampPacketStorage::Packet);
+        uint64_t compareOffset = packetId * sizeof(TimestampPackets<uint32_t>::Packet);
         auto dataAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode) + compareOffset;
 
         EXPECT_EQ(dataAddress, semaphoreCmd->getSemaphoreGraphicsAddress());
     };
 
     template <typename GfxFamily>
-    void verifyMiAtomic(typename GfxFamily::MI_ATOMIC *miAtomicCmd, TagNode<TimestampPacketStorage> *timestampPacketNode) {
+    void verifyMiAtomic(typename GfxFamily::MI_ATOMIC *miAtomicCmd, TagNodeBase *timestampPacketNode) {
         using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
         EXPECT_NE(nullptr, miAtomicCmd);
         auto writeAddress = TimestampPacketHelper::getGpuDependenciesCountGpuAddress(*timestampPacketNode);
@@ -124,7 +123,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWhenSemaphoreAndAtomicAreProgrammedTh
     using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
     using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
 
-    TimestampPacketStorage tag;
+    TimestampPackets<uint32_t> tag;
     MockTagNode mockNode;
     mockNode.tagForCpuAccess = &tag;
     mockNode.gpuAddress = 0x1230000;
@@ -166,7 +165,7 @@ HWTEST_F(TimestampPacketTests, givenDebugModeWhereAtomicsAreNotEmittedWhenComman
 }
 
 HWTEST_F(TimestampPacketTests, givenMultipleDeviesWhenIncrementingCpuDependenciesThenIncrementMultipleTimes) {
-    TimestampPacketStorage tag;
+    TimestampPackets<uint32_t> tag;
     MockTagNode mockNode;
     mockNode.tagForCpuAccess = &tag;
     mockNode.gpuAddress = 0x1230000;
@@ -183,7 +182,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWithPacketsUsed2WhenSemaphoreAndAtomi
     using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
     using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
 
-    TimestampPacketStorage tag;
+    TimestampPackets<uint32_t> tag;
     tag.setPacketsUsed(2);
     MockTagNode mockNode;
     mockNode.tagForCpuAccess = &tag;
@@ -202,7 +201,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWithPacketsUsed2WhenSemaphoreAndAtomi
 }
 
 TEST_F(TimestampPacketTests, givenTagNodeWhatAskingForGpuAddressesThenReturnCorrectValue) {
-    TimestampPacketStorage tag;
+    TimestampPackets<uint32_t> tag;
     MockTagNode mockNode;
     mockNode.tagForCpuAccess = &tag;
     mockNode.gpuAddress = 0x1230000;
@@ -242,11 +241,11 @@ TEST_F(TimestampPacketSimpleTests, givenTimestampPacketContainerWhenMovedThenMov
     EXPECT_FALSE(std::is_copy_assignable<TimestampPacketContainer>::value);
     EXPECT_FALSE(std::is_copy_constructible<TimestampPacketContainer>::value);
 
-    struct MockTagNode : public TagNode<TimestampPacketStorage> {
+    struct MockTagNode : public TagNode<TimestampPackets<uint32_t>> {
         void returnTag() override {
             returnCalls++;
         }
-        using TagNode<TimestampPacketStorage>::refCount;
+        using TagNode<TimestampPackets<uint32_t>>::refCount;
         uint32_t returnCalls = 0;
     };
 
@@ -308,7 +307,9 @@ TEST_F(TimestampPacketSimpleTests, whenNewTagIsTakenThenReinitialize) {
     MockMemoryManager memoryManager(executionEnvironment);
     MockTagAllocator<MockTimestampPacketStorage> allocator(0, &memoryManager, 1);
 
-    auto firstNode = allocator.getTag();
+    using MockNode = TagNode<MockTimestampPacketStorage>;
+
+    auto firstNode = static_cast<MockNode *>(allocator.getTag());
     auto i = 0u;
     for (auto &packet : firstNode->tagForCpuAccess->packets) {
         packet.contextStart = i++;
@@ -369,7 +370,7 @@ HWTEST_F(TimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacketAlloc
     auto tag = csr.getTimestampPacketAllocator()->getTag();
     setTagToReadyState(tag);
 
-    EXPECT_TRUE(tag->tagForCpuAccess->isCompleted());
+    EXPECT_TRUE(tag->isCompleted());
     EXPECT_FALSE(tag->canBeReleased());
 }
 
@@ -550,7 +551,7 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWithEventsWithoutTimestampsWhen
 }
 
 HWTEST_F(TimestampPacketTests, whenEstimatingSizeForNodeDependencyThenReturnCorrectValue) {
-    TimestampPacketStorage tag;
+    TimestampPackets<uint32_t> tag;
     MockTagNode mockNode;
     mockNode.tagForCpuAccess = &tag;
     mockNode.gpuAddress = 0x1230000;
@@ -1457,8 +1458,8 @@ HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingWithOmitTim
 }
 
 HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentDevicesWhenEnqueueingThenMakeAllTimestampsResident) {
-    TagAllocator<TimestampPacketStorage> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1,
-                                                      sizeof(TimestampPacketStorage), false, device->getDeviceBitfield());
+    TagAllocator<TimestampPackets<uint32_t>> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1,
+                                                          sizeof(TimestampPackets<uint32_t>), false, device->getDeviceBitfield());
     auto device2 = std::make_unique<MockClDevice>(Device::create<MockDevice>(executionEnvironment, 1u));
 
     auto &ultCsr = device->getUltCommandStreamReceiver<FamilyType>();
@@ -1493,8 +1494,8 @@ HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentDevicesWhenEnqueu
 }
 
 HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentCSRsWhenEnqueueingThenMakeAllTimestampsResident) {
-    TagAllocator<TimestampPacketStorage> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1,
-                                                      sizeof(TimestampPacketStorage), false, device->getDeviceBitfield());
+    TagAllocator<TimestampPackets<uint32_t>> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1,
+                                                          sizeof(TimestampPackets<uint32_t>), false, device->getDeviceBitfield());
 
     auto &ultCsr = device->getUltCommandStreamReceiver<FamilyType>();
     ultCsr.timestampPacketWriteEnabled = true;
@@ -1600,7 +1601,7 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingWithoutK
     auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(context, device.get(), nullptr));
 
     MockKernelWithInternals mockKernel(*device, context);
-    cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPacketStorage
+    cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first timestamp packet node (TimestampPackets<uint32_t>)
 
     TimestampPacketContainer cmdQNodes;
     cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ->timestampPacketContainer);
@@ -1815,7 +1816,7 @@ HWTEST_F(TimestampPacketTests, whenEnqueueingBarrierThenRequestPipeControlOnCsrF
     MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);
 
     MockKernelWithInternals mockKernel(*device, context);
-    cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPacketStorage
+    cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first timestamp packet node (TimestampPackets<uint32_t>)
 
     TimestampPacketContainer cmdQNodes;
     cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ.timestampPacketContainer);
diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp
index d1fae7c68e..6138574d97 100644
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -2180,11 +2180,11 @@ HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenK
     EXPECT_EQ(result->second.status, MockKernel::TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS);
     EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferedInCurrentEnqueue);
 
-    uint32_t data[4] = {static_cast<uint32_t>(container.getNode(0u)->tagForCpuAccess->getContextStartValue(0)),
-                        static_cast<uint32_t>(container.getNode(0u)->tagForCpuAccess->getGlobalStartValue(0)),
+    uint32_t data[4] = {static_cast<uint32_t>(container.getNode(0u)->getContextStartValue(0)),
+                        static_cast<uint32_t>(container.getNode(0u)->getGlobalStartValue(0)),
                         2, 2};
 
-    container.getNode(0u)->tagForCpuAccess->assignDataToAllTimestamps(0, data);
+    container.getNode(0u)->assignDataToAllTimestamps(0, data);
 
     mockKernel.mockKernel->performKernelTunning(commandStreamReceiver, lws, gws, offsets, &container);
 
@@ -2193,12 +2193,12 @@ HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenK
     EXPECT_EQ(result->second.status, MockKernel::TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS);
     EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferedInCurrentEnqueue);
 
-    data[0] = static_cast<uint32_t>(subdeviceContainer.getNode(0u)->tagForCpuAccess->getContextStartValue(0));
-    data[1] = static_cast<uint32_t>(subdeviceContainer.getNode(0u)->tagForCpuAccess->getGlobalStartValue(0));
+    data[0] = static_cast<uint32_t>(subdeviceContainer.getNode(0u)->getContextStartValue(0));
+    data[1] = static_cast<uint32_t>(subdeviceContainer.getNode(0u)->getGlobalStartValue(0));
     data[2] = 2;
     data[3] = 2;
 
-    subdeviceContainer.getNode(0u)->tagForCpuAccess->assignDataToAllTimestamps(0, data);
+    subdeviceContainer.getNode(0u)->assignDataToAllTimestamps(0, data);
 
     mockKernel.mockKernel->performKernelTunning(commandStreamReceiver, lws, gws, offsets, &container);
 
@@ -2209,12 +2209,12 @@ HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenK
     EXPECT_EQ(result->second.status, MockKernel::TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS);
     EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferedInCurrentEnqueue);
 
-    data[0] = static_cast<uint32_t>(subdeviceContainer.getNode(1u)->tagForCpuAccess->getContextStartValue(0));
-    data[1] = static_cast<uint32_t>(subdeviceContainer.getNode(1u)->tagForCpuAccess->getGlobalStartValue(0));
+    data[0] = static_cast<uint32_t>(subdeviceContainer.getNode(1u)->getContextStartValue(0));
+    data[1] = static_cast<uint32_t>(subdeviceContainer.getNode(1u)->getGlobalStartValue(0));
     data[2] = 2;
     data[3] = 2;
 
-    subdeviceContainer.getNode(1u)->tagForCpuAccess->assignDataToAllTimestamps(0, data);
+    subdeviceContainer.getNode(1u)->assignDataToAllTimestamps(0, data);
 
     mockKernel.mockKernel->performKernelTunning(commandStreamReceiver, lws, gws, offsets, &container);
 
diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
index 1d2a6c908d..39e860a816 100644
--- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
@@ -856,10 +856,10 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenInputAndOutputTimestampPacketWhenBlitCal
     cl_int retVal = CL_SUCCESS;
 
     auto memoryManager = bcsCsr->getMemoryManager();
-    bcsCsr->timestampPacketAllocator = std::make_unique<TagAllocator<TimestampPacketStorage>>(device->getRootDeviceIndex(), memoryManager, 1,
-                                                                                              MemoryConstants::cacheLineSize,
-                                                                                              sizeof(TimestampPacketStorage),
-                                                                                              false, device->getDeviceBitfield());
+    bcsCsr->timestampPacketAllocator = std::make_unique<TagAllocator<TimestampPackets<uint32_t>>>(device->getRootDeviceIndex(), memoryManager, 1,
+                                                                                                  MemoryConstants::cacheLineSize,
+                                                                                                  sizeof(TimestampPackets<uint32_t>),
+                                                                                                  false, device->getDeviceBitfield());
 
     auto buffer = clUniquePtr<Buffer>(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal));
     buffer->forceDisallowCPUCopy = true;
diff --git a/opencl/test/unit_test/mocks/mock_timestamp_container.h b/opencl/test/unit_test/mocks/mock_timestamp_container.h
index 8a16a9674f..3a6095c00d 100644
--- a/opencl/test/unit_test/mocks/mock_timestamp_container.h
+++ b/opencl/test/unit_test/mocks/mock_timestamp_container.h
@@ -11,7 +11,7 @@
 
 namespace NEO {
 
-template <typename TagType = TimestampPacketStorage>
+template <typename TagType = TimestampPackets<uint32_t>>
 class MockTagAllocator : public TagAllocator<TagType> {
   public:
     using BaseClass = TagAllocator<TagType>;
@@ -22,13 +22,13 @@ class MockTagAllocator : public TagAllocator<TagType> {
     MockTagAllocator(uint32_t rootDeviceIndex, MemoryManager *memoryManager, size_t tagCount = 10)
         : BaseClass(rootDeviceIndex, memoryManager, tagCount, MemoryConstants::cacheLineSize, sizeof(TagType), false, mockDeviceBitfield) {}
 
-    void returnTag(NodeType *node) override {
-        releaseReferenceNodes.push_back(node);
+    void returnTag(TagNodeBase *node) override {
+        releaseReferenceNodes.push_back(static_cast<NodeType *>(node));
         BaseClass::returnTag(node);
     }
 
-    void returnTagToFreePool(NodeType *node) override {
-        returnedToFreePoolNodes.push_back(node);
+    void returnTagToFreePool(TagNodeBase *node) override {
+        returnedToFreePoolNodes.push_back(static_cast<NodeType *>(node));
         BaseClass::returnTagToFreePool(node);
     }
 
@@ -40,13 +40,13 @@ class MockTimestampPacketContainer : public TimestampPacketContainer {
   public:
     using TimestampPacketContainer::timestampPacketNodes;
 
-    MockTimestampPacketContainer(TagAllocator<TimestampPacketStorage> &tagAllocator, size_t numberOfPreallocatedTags) {
+    MockTimestampPacketContainer(TagAllocatorBase &tagAllocator, size_t numberOfPreallocatedTags) {
         for (size_t i = 0; i < numberOfPreallocatedTags; i++) {
             add(tagAllocator.getTag());
         }
     }
 
-    TagNode<TimestampPacketStorage> *getNode(size_t position) {
+    TagNodeBase *getNode(size_t position) {
         return timestampPacketNodes.at(position);
     }
 };
diff --git a/opencl/test/unit_test/os_interface/performance_counters_tests.cpp b/opencl/test/unit_test/os_interface/performance_counters_tests.cpp
index a8d764bb39..8ddaecbdcd 100644
--- a/opencl/test/unit_test/os_interface/performance_counters_tests.cpp
+++ b/opencl/test/unit_test/os_interface/performance_counters_tests.cpp
@@ -97,7 +97,7 @@ TEST_P(PerformanceCountersProcessEventTest, givenNullptrInputParamWhenProcessEve
     TagNode<HwPerfCounter> query = {};
     query.tagForCpuAccess = &counters;
 
-    performanceCountersBase->getQueryHandle(counters.query.handle);
+    performanceCountersBase->getQueryHandleRef(counters.query.handle);
     auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, nullptr, &outputParamSize, eventComplete);
     performanceCountersBase->deleteQuery(counters.query.handle);
 
@@ -111,7 +111,7 @@ TEST_P(PerformanceCountersProcessEventTest, givenCorrectInputParamWhenProcessEve
     TagNode<HwPerfCounter> query = {};
     query.tagForCpuAccess = &counters;
 
-    performanceCountersBase->getQueryHandle(counters.query.handle);
+    performanceCountersBase->getQueryHandleRef(counters.query.handle);
     auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, inputParam.get(), &outputParamSize, eventComplete);
     performanceCountersBase->deleteQuery(counters.query.handle);
 
@@ -127,11 +127,12 @@ TEST_P(PerformanceCountersProcessEventTest, givenCorrectInputParamWhenProcessEve
 TEST_P(PerformanceCountersProcessEventTest, givenCorrectInputParamWhenProcessEventPerfCountersIsNotCalledThenReturnsFalse) {
     eventComplete = GetParam();
     EXPECT_EQ(0ull, outputParamSize);
+    HwPerfCounter tag = {};
     TagNode<HwPerfCounter> query = {};
-    query.tagForCpuAccess = nullptr;
+    query.tagForCpuAccess = &tag;
 
     auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, inputParam.get(), &outputParamSize, eventComplete);
-    EXPECT_FALSE(retVal);
+    EXPECT_EQ(eventComplete, retVal);
 }
 
 TEST_F(PerformanceCountersProcessEventTest, givenInvalidInputParamSizeWhenProcessEventPerfCountersIsCalledThenReturnsFalse) {
@@ -141,7 +142,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenInvalidInputParamSizeWhenProces
     TagNode<HwPerfCounter> query = {};
     query.tagForCpuAccess = &counters;
 
-    performanceCountersBase->getQueryHandle(counters.query.handle);
+    performanceCountersBase->getQueryHandleRef(counters.query.handle);
     auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize - 1, inputParam.get(), &outputParamSize, eventComplete);
     performanceCountersBase->deleteQuery(counters.query.handle);
 
@@ -156,7 +157,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenNullptrOutputParamSizeWhenProce
     TagNode<HwPerfCounter> query = {};
     query.tagForCpuAccess = &counters;
 
-    performanceCountersBase->getQueryHandle(counters.query.handle);
+    performanceCountersBase->getQueryHandleRef(counters.query.handle);
     auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, inputParam.get(), nullptr, eventComplete);
     performanceCountersBase->deleteQuery(counters.query.handle);
 
@@ -171,7 +172,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenNullptrInputZeroSizeWhenProcess
     TagNode<HwPerfCounter> query = {};
     query.tagForCpuAccess = &counters;
 
-    performanceCountersBase->getQueryHandle(counters.query.handle);
+    performanceCountersBase->getQueryHandleRef(counters.query.handle);
     auto retVal = performanceCountersBase->getApiReport(&query, 0, nullptr, &outputParamSize, eventComplete);
     performanceCountersBase->deleteQuery(counters.query.handle);
 
@@ -186,7 +187,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenNullptrInputZeroSizeAndNullptrO
     TagNode<HwPerfCounter> query = {};
     query.tagForCpuAccess = &counters;
 
-    performanceCountersBase->getQueryHandle(counters.query.handle);
+    performanceCountersBase->getQueryHandleRef(counters.query.handle);
     auto retVal = performanceCountersBase->getApiReport(&query, 0, nullptr, nullptr, eventComplete);
     performanceCountersBase->deleteQuery(counters.query.handle);
 
@@ -487,7 +488,7 @@ TEST_F(PerformanceCountersMetricsLibraryTest, givenPerformanceCountersWhenMetric
     EXPECT_EQ(0u, performanceCountersBase->getReferenceNumber());
     EXPECT_TRUE(performanceCountersBase->enable(false));
 
-    performanceCountersBase->getQueryHandle(query);
+    performanceCountersBase->getQueryHandleRef(query);
     EXPECT_TRUE(query.IsValid());
 
     performanceCountersBase->deleteQuery(query);
@@ -587,13 +588,13 @@ TEST_F(PerformanceCountersMetricsLibraryTest, WhenGettingHwPerfCounterThenValidP
     std::unique_ptr<Event> event(new Event(queue.get(), CL_COMMAND_COPY_BUFFER, 0, 0));
     ASSERT_NE(nullptr, event);
 
-    HwPerfCounter *perfCounter = event->getHwPerfCounterNode()->tagForCpuAccess;
+    auto perfCounter = static_cast<TagNode<HwPerfCounter> *>(event->getHwPerfCounterNode());
     ASSERT_NE(nullptr, perfCounter);
 
-    ASSERT_EQ(0ULL, perfCounter->report[0]);
+    ASSERT_EQ(0ULL, perfCounter->tagForCpuAccess->report[0]);
     EXPECT_TRUE(perfCounter->isCompleted());
 
-    HwPerfCounter *perfCounter2 = event->getHwPerfCounterNode()->tagForCpuAccess;
+    auto perfCounter2 = event->getHwPerfCounterNode();
     ASSERT_EQ(perfCounter, perfCounter2);
 
     performanceCountersBase->shutdown();
@@ -633,7 +634,7 @@ TEST_F(PerformanceCountersMetricsLibraryTest, WhenCreatingEventThenHwPerfCounter
     std::unique_ptr<Event> event(new Event(queue.get(), CL_COMMAND_COPY_BUFFER, 0, 0));
     ASSERT_NE(nullptr, event);
 
-    HwPerfCounter *perfCounter = event->getHwPerfCounterNode()->tagForCpuAccess;
+    HwPerfCounter *perfCounter = static_cast<TagNode<HwPerfCounter> *>(event->getHwPerfCounterNode())->tagForCpuAccess;
     ASSERT_NE(nullptr, perfCounter);
 
     GraphicsAllocation *allocation = event->getHwPerfCounterNode()->getBaseGraphicsAllocation();
diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp
index a7b09d1677..f0d26b08c4 100644
--- a/opencl/test/unit_test/profiling/profiling_tests.cpp
+++ b/opencl/test/unit_test/profiling/profiling_tests.cpp
@@ -1064,7 +1064,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersOnCCSTests, givenCommandQue
 struct MockTimestampContainer : public TimestampPacketContainer {
     ~MockTimestampContainer() override {
         for (const auto &node : timestampPacketNodes) {
-            delete node->tagForCpuAccess;
+            auto mockNode = static_cast<MockTagNode<TimestampPackets<uint32_t>> *>(node);
+            delete mockNode->tagForCpuAccess;
             delete node;
         }
         timestampPacketNodes.clear();
@@ -1079,8 +1080,8 @@ struct ProfilingTimestampPacketsTest : public ::testing::Test {
     }
 
     void addTimestampNode(uint32_t contextStart, uint32_t contextEnd, uint32_t globalStart, uint32_t globalEnd) {
-        auto node = new MockTagNode<TimestampPacketStorage>();
-        auto timestampPacketStorage = new TimestampPacketStorage();
+        auto node = new MockTagNode<TimestampPackets<uint32_t>>();
+        auto timestampPacketStorage = new TimestampPackets<uint32_t>();
         node->tagForCpuAccess = timestampPacketStorage;
 
         uint32_t values[4] = {contextStart, globalStart, contextEnd, globalEnd};
@@ -1090,8 +1091,8 @@ struct ProfilingTimestampPacketsTest : public ::testing::Test {
     }
 
     void addTimestampNodeMultiOsContext(uint32_t globalStart[16], uint32_t globalEnd[16], uint32_t contextStart[16], uint32_t contextEnd[16], uint32_t size) {
-        auto node = new MockTagNode<TimestampPacketStorage>();
-        auto timestampPacketStorage = new TimestampPacketStorage();
+        auto node = new MockTagNode<TimestampPackets<uint32_t>>();
+        auto timestampPacketStorage = new TimestampPackets<uint32_t>();
         timestampPacketStorage->setPacketsUsed(size);
 
         for (uint32_t i = 0u; i < timestampPacketStorage->getPacketsUsed(); ++i) {
diff --git a/opencl/test/unit_test/utilities/tag_allocator_tests.cpp b/opencl/test/unit_test/utilities/tag_allocator_tests.cpp
index 9c3c280f63..e79e94cd17 100644
--- a/opencl/test/unit_test/utilities/tag_allocator_tests.cpp
+++ b/opencl/test/unit_test/utilities/tag_allocator_tests.cpp
@@ -21,29 +21,69 @@
 using namespace NEO;
 
 struct TagAllocatorTest : public Test<MemoryAllocatorFixture> {
-    const DeviceBitfield deviceBitfield{0xf};
-    DebugManagerStateRestore restorer;
+    class MockTimestampPackets32 : public TimestampPackets<uint32_t> {
+      public:
+        void setTagToReadyState() {
+            auto packetsUsed = getPacketsUsed();
+            initialize();
+
+            uint32_t zeros[4] = {};
+
+            for (uint32_t i = 0; i < TimestampPacketSizeControl::preferredPacketCount; i++) {
+                assignDataToAllTimestamps(i, zeros);
+            }
+            setPacketsUsed(packetsUsed);
+
+            EXPECT_TRUE(isCompleted());
+        }
+
+        void setToNonReadyState() {
+            packets[0].contextEnd = 1;
+            EXPECT_FALSE(isCompleted());
+        }
+    };
 
     void SetUp() override {
         DebugManager.flags.CreateMultipleSubDevices.set(4);
         MemoryAllocatorFixture::SetUp();
     }
+
+    const DeviceBitfield deviceBitfield{0xf};
+    DebugManagerStateRestore restorer;
 };
 
 struct TimeStamps {
     void initialize() {
         start = 1;
         end = 2;
-        release = true;
     }
-    static GraphicsAllocation::AllocationType getAllocationType() {
+    static constexpr GraphicsAllocation::AllocationType getAllocationType() {
         return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER;
     }
-    bool isCompleted() const { return release; }
-    uint32_t getImplicitGpuDependenciesCount() const { return 0; }
-    bool release;
+
+    static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwTimeStamps; }
+
+    uint64_t getContextStartValue(uint32_t packetIndex) const {
+        return start;
+    }
+
+    uint64_t getGlobalStartValue(uint32_t packetIndex) const {
+        return start;
+    }
+
+    uint64_t getContextEndValue(uint32_t packetIndex) const {
+        return end;
+    }
+
+    uint64_t getGlobalEndValue(uint32_t packetIndex) const {
+        return end;
+    }
+
     uint64_t start;
     uint64_t end;
+
+    uint64_t ContextCompleteTS;
+    uint64_t GlobalEndTS;
 };
 
 template <typename TagType>
@@ -58,6 +98,7 @@ class MockTagAllocator : public TagAllocator<TagType> {
     using BaseClass::populateFreeTags;
     using BaseClass::releaseDeferredTags;
     using BaseClass::usedTags;
+    using BaseClass::TagAllocatorBase::cleanUpResources;
 
     MockTagAllocator(MemoryManager *memMngr, size_t tagCount, size_t tagAlignment, bool disableCompletionCheck, DeviceBitfield deviceBitfield)
         : BaseClass(0, memMngr, tagCount, tagAlignment, sizeof(TagType), disableCompletionCheck, deviceBitfield) {
@@ -115,7 +156,7 @@ TEST_F(TagAllocatorTest, WhenGettingAndReturningTagThenFreeAndUsedListsAreUpdate
     ASSERT_NE(nullptr, tagAllocator.getFreeTagsHead());
     EXPECT_EQ(nullptr, tagAllocator.getUsedTagsHead());
 
-    TagNode<TimeStamps> *tagNode = tagAllocator.getTag();
+    auto tagNode = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
 
     EXPECT_NE(nullptr, tagNode);
 
@@ -151,7 +192,7 @@ TEST_F(TagAllocatorTest, WhenTagIsAllocatedThenItIsAligned) {
 
     ASSERT_NE(nullptr, tagAllocator.getFreeTagsHead());
 
-    TagNode<TimeStamps> *tagNode = tagAllocator.getTag();
+    TagNode<TimeStamps> *tagNode = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
 
     ASSERT_NE(nullptr, tagNode);
     EXPECT_EQ(0u, (uintptr_t)tagNode->tagForCpuAccess % alignment);
@@ -170,13 +211,13 @@ TEST_F(TagAllocatorTest, givenTagAllocatorWhenAllNodesWereUsedThenCreateNewGraph
     TagNode<TimeStamps> *tagNodes[4];
 
     for (size_t i = 0; i < 4; i++) {
-        tagNodes[i] = tagAllocator.getTag();
+        tagNodes[i] = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
         EXPECT_NE(nullptr, tagNodes[i]);
     }
     EXPECT_EQ(1u, tagAllocator.getGraphicsAllocationsCount());
     EXPECT_EQ(1u, tagAllocator.getTagPoolCount());
 
-    TagNode<TimeStamps> *tagNode = tagAllocator.getTag();
+    TagNode<TimeStamps> *tagNode = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     EXPECT_NE(nullptr, tagNode);
 
     EXPECT_EQ(2u, tagAllocator.getGraphicsAllocationsCount());
@@ -188,7 +229,7 @@ TEST_F(TagAllocatorTest, givenInputTagCountWhenCreatingAllocatorThenRequestedNum
       public:
         using MockMemoryManager::MockMemoryManager;
         GraphicsAllocation *allocateGraphicsMemoryWithAlignment(const AllocationData &allocationData) override {
-            return new MemoryAllocation(0, TimestampPacketStorage::getAllocationType(), nullptr, nullptr, 0, MemoryConstants::pageSize,
+            return new MemoryAllocation(0, TimestampPackets<uint32_t>::getAllocationType(), nullptr, nullptr, 0, MemoryConstants::pageSize,
                                         1, MemoryPool::System4KBPages, false, false, mockMaxOsContextCount);
         }
     };
@@ -196,7 +237,7 @@ TEST_F(TagAllocatorTest, givenInputTagCountWhenCreatingAllocatorThenRequestedNum
     auto mockMemoryManager = std::make_unique<MyMockMemoryManager>(true, true, *executionEnvironment);
 
     const size_t tagsCount = 3;
-    MockTagAllocator<TimestampPacketStorage> tagAllocator(mockMemoryManager.get(), tagsCount, 1, deviceBitfield);
+    MockTagAllocator<TimestampPackets<uint32_t>> tagAllocator(mockMemoryManager.get(), tagsCount, 1, deviceBitfield);
 
     size_t nodesFound = 0;
     auto head = tagAllocator.freeTags.peekHead();
@@ -219,13 +260,13 @@ TEST_F(TagAllocatorTest, GivenSpecificOrderWhenReturningTagsThenFreeListIsUpdate
     TagNode<TimeStamps> *tagNodes[4];
 
     for (int i = 0; i < 4; i++) {
-        tagNodes[i] = tagAllocator.getTag();
+        tagNodes[i] = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
         EXPECT_NE(nullptr, tagNodes[i]);
     }
     EXPECT_EQ(1u, tagAllocator.getGraphicsAllocationsCount());
     EXPECT_EQ(1u, tagAllocator.getTagPoolCount());
 
-    TagNode<TimeStamps> *tagNode2 = tagAllocator.getTag();
+    TagNode<TimeStamps> *tagNode2 = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     EXPECT_NE(nullptr, tagNode2);
     EXPECT_EQ(2u, tagAllocator.getGraphicsAllocationsCount());
     EXPECT_EQ(2u, tagAllocator.getTagPoolCount());
@@ -263,10 +304,10 @@ TEST_F(TagAllocatorTest, WhenGettingTagsFromTwoPoolsThenTagsAreDifferent) {
 
     TagNode<TimeStamps> *tagNode1, *tagNode2;
 
-    tagNode1 = tagAllocator.getTag();
+    tagNode1 = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     ASSERT_NE(nullptr, tagNode1);
 
-    tagNode2 = tagAllocator.getTag();
+    tagNode2 = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     ASSERT_NE(nullptr, tagNode2);
 
     EXPECT_EQ(2u, tagAllocator.getGraphicsAllocationsCount());
@@ -286,11 +327,11 @@ TEST_F(TagAllocatorTest, WhenCleaningUpResourcesThenAllResourcesAreReleased) {
     TagNode<TimeStamps> *tagNode1, *tagNode2;
 
     // Allocate first Pool
-    tagNode1 = tagAllocator.getTag();
+    tagNode1 = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     EXPECT_NE(nullptr, tagNode1);
 
     // Allocate second Pool
-    tagNode2 = tagAllocator.getTag();
+    tagNode2 = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     ASSERT_NE(nullptr, tagNode2);
 
     // Two pools should have different gfxAllocations
@@ -312,7 +353,7 @@ TEST_F(TagAllocatorTest, whenNewTagIsTakenThenItIsInitialized) {
     tagAllocator.getFreeTagsHead()->tagForCpuAccess->end = 4;
     tagAllocator.getFreeTagsHead()->setProfilingCapable(false);
 
-    auto node = tagAllocator.getTag();
+    auto node = static_cast<TagNode<TimeStamps> *>(tagAllocator.getTag());
     EXPECT_EQ(1u, node->tagForCpuAccess->start);
     EXPECT_EQ(2u, node->tagForCpuAccess->end);
     EXPECT_TRUE(node->isProfilingCapable());
@@ -337,10 +378,10 @@ TEST_F(TagAllocatorTest, givenMultipleReferencesOnTagWhenReleasingThenReturnWhen
 }
 
 TEST_F(TagAllocatorTest, givenNotReadyTagWhenReturnedThenMoveToDeferredList) {
-    MockTagAllocator<TimeStamps> tagAllocator(memoryManager, 1, 1, deviceBitfield);
-    auto node = tagAllocator.getTag();
+    MockTagAllocator<MockTimestampPackets32> tagAllocator(memoryManager, 1, 1, deviceBitfield);
+    auto node = static_cast<TagNode<MockTimestampPackets32> *>(tagAllocator.getTag());
 
-    node->tagForCpuAccess->release = false;
+    node->tagForCpuAccess->setToNonReadyState();
     EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty());
     tagAllocator.returnTag(node);
     EXPECT_FALSE(tagAllocator.deferredTags.peekIsEmpty());
@@ -379,10 +420,10 @@ TEST_F(TagAllocatorTest, givenTagAllocatorWhenDisabledCompletionCheckThenNodeInh
 }
 
 TEST_F(TagAllocatorTest, givenReadyTagWhenReturnedThenMoveToFreeList) {
-    MockTagAllocator<TimeStamps> tagAllocator(memoryManager, 1, 1, deviceBitfield);
-    auto node = tagAllocator.getTag();
+    MockTagAllocator<MockTimestampPackets32> tagAllocator(memoryManager, 1, 1, deviceBitfield);
+    auto node = static_cast<TagNode<MockTimestampPackets32> *>(tagAllocator.getTag());
 
-    node->tagForCpuAccess->release = true;
+    node->tagForCpuAccess->setTagToReadyState();
     EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty());
     tagAllocator.returnTag(node);
     EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty());
@@ -390,25 +431,25 @@ TEST_F(TagAllocatorTest, givenReadyTagWhenReturnedThenMoveToFreeList) {
 }
 
 TEST_F(TagAllocatorTest, givenEmptyFreeListWhenAskingForNewTagThenTryToReleaseDeferredListFirst) {
-    MockTagAllocator<TimeStamps> tagAllocator(memoryManager, 1, 1, deviceBitfield);
-    auto node = tagAllocator.getTag();
+    MockTagAllocator<MockTimestampPackets32> tagAllocator(memoryManager, 1, 1, deviceBitfield);
+    auto node = static_cast<TagNode<MockTimestampPackets32> *>(tagAllocator.getTag());
 
-    node->tagForCpuAccess->release = false;
+    node->tagForCpuAccess->setToNonReadyState();
     tagAllocator.returnTag(node);
-    node->tagForCpuAccess->release = false;
+    node->tagForCpuAccess->setToNonReadyState();
     EXPECT_TRUE(tagAllocator.freeTags.peekIsEmpty());
-    node = tagAllocator.getTag();
+    node = static_cast<TagNode<MockTimestampPackets32> *>(tagAllocator.getTag());
     EXPECT_NE(nullptr, node);
     EXPECT_TRUE(tagAllocator.freeTags.peekIsEmpty()); // empty again - new pool wasnt allocated
 }
 
 TEST_F(TagAllocatorTest, givenTagsOnDeferredListWhenReleasingItThenMoveReadyTagsToFreePool) {
-    MockTagAllocator<TimeStamps> tagAllocator(memoryManager, 2, 1, deviceBitfield); // pool with 2 tags
-    auto node1 = tagAllocator.getTag();
-    auto node2 = tagAllocator.getTag();
+    MockTagAllocator<MockTimestampPackets32> tagAllocator(memoryManager, 2, 1, deviceBitfield); // pool with 2 tags
+    auto node1 = static_cast<TagNode<MockTimestampPackets32> *>(tagAllocator.getTag());
+    auto node2 = static_cast<TagNode<MockTimestampPackets32> *>(tagAllocator.getTag());
 
-    node1->tagForCpuAccess->release = false;
-    node2->tagForCpuAccess->release = false;
+    node1->tagForCpuAccess->setToNonReadyState();
+    node2->tagForCpuAccess->setToNonReadyState();
     tagAllocator.returnTag(node1);
     tagAllocator.returnTag(node2);
 
@@ -416,19 +457,19 @@ TEST_F(TagAllocatorTest, givenTagsOnDeferredListWhenReleasingItThenMoveReadyTags
     EXPECT_FALSE(tagAllocator.deferredTags.peekIsEmpty());
     EXPECT_TRUE(tagAllocator.freeTags.peekIsEmpty());
 
-    node1->tagForCpuAccess->release = true;
+    node1->tagForCpuAccess->setTagToReadyState();
     tagAllocator.releaseDeferredTags();
     EXPECT_FALSE(tagAllocator.deferredTags.peekIsEmpty());
     EXPECT_FALSE(tagAllocator.freeTags.peekIsEmpty());
 
-    node2->tagForCpuAccess->release = true;
+    node2->tagForCpuAccess->setTagToReadyState();
     tagAllocator.releaseDeferredTags();
     EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty());
     EXPECT_FALSE(tagAllocator.freeTags.peekIsEmpty());
 }
 
 TEST_F(TagAllocatorTest, givenTagAllocatorWhenGraphicsAllocationIsCreatedThenSetValidllocationType) {
-    TagAllocator<TimestampPacketStorage> timestampPacketAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(TimestampPacketStorage), false, mockDeviceBitfield);
+    TagAllocator<TimestampPackets<uint32_t>> timestampPacketAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(TimestampPackets<uint32_t>), false, mockDeviceBitfield);
     TagAllocator<HwTimeStamps> hwTimeStampsAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(HwTimeStamps), false, mockDeviceBitfield);
     TagAllocator<HwPerfCounter> hwPerfCounterAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(HwPerfCounter), false, mockDeviceBitfield);
 
@@ -440,3 +481,53 @@ TEST_F(TagAllocatorTest, givenTagAllocatorWhenGraphicsAllocationIsCreatedThenSet
     EXPECT_EQ(GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER, hwTimeStampsTag->getBaseGraphicsAllocation()->getAllocationType());
     EXPECT_EQ(GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER, hwPerfCounterTag->getBaseGraphicsAllocation()->getAllocationType());
 }
+
+TEST_F(TagAllocatorTest, givenNotSupportedTagTypeWhenCallingMethodThenAbortOrReturnInitialValue) {
+    // Each scope value-initializes a TagNode for one tag type and lists the calls expected to throw for it.
+    {
+        TagNode<HwPerfCounter> perfCounterNode = {};
+        // HwPerfCounter: every call below is expected to throw, except the GPU dependency count (0) and isCompleted() (true).
+        EXPECT_ANY_THROW(perfCounterNode.getGlobalStartOffset());
+        EXPECT_ANY_THROW(perfCounterNode.getContextStartOffset());
+        EXPECT_ANY_THROW(perfCounterNode.getContextEndOffset());
+        EXPECT_ANY_THROW(perfCounterNode.getGlobalEndOffset());
+        EXPECT_ANY_THROW(perfCounterNode.getImplicitGpuDependenciesCountOffset());
+        EXPECT_ANY_THROW(perfCounterNode.getContextStartValue(0));
+        EXPECT_ANY_THROW(perfCounterNode.getGlobalStartValue(0));
+        EXPECT_ANY_THROW(perfCounterNode.getContextEndValue(0));
+        EXPECT_ANY_THROW(perfCounterNode.getGlobalEndValue(0));
+        EXPECT_ANY_THROW(perfCounterNode.getContextCompleteRef());
+        EXPECT_ANY_THROW(perfCounterNode.getGlobalEndRef());
+        EXPECT_ANY_THROW(perfCounterNode.setPacketsUsed(0));
+        EXPECT_ANY_THROW(perfCounterNode.getPacketsUsed());
+        EXPECT_EQ(0u, perfCounterNode.getImplicitGpuDependenciesCount());
+        EXPECT_ANY_THROW(perfCounterNode.getSinglePacketSize());
+        EXPECT_ANY_THROW(perfCounterNode.assignDataToAllTimestamps(0, nullptr));
+        EXPECT_TRUE(perfCounterNode.isCompleted());
+    }
+
+    {
+        TagNode<HwTimeStamps> hwTimestampNode = {};
+        // HwTimeStamps: offset getters, packet bookkeeping, assignDataToAllTimestamps and getQueryHandleRef are expected to throw.
+        EXPECT_ANY_THROW(hwTimestampNode.getGlobalStartOffset());
+        EXPECT_ANY_THROW(hwTimestampNode.getContextStartOffset());
+        EXPECT_ANY_THROW(hwTimestampNode.getContextEndOffset());
+        EXPECT_ANY_THROW(hwTimestampNode.getGlobalEndOffset());
+        EXPECT_ANY_THROW(hwTimestampNode.getImplicitGpuDependenciesCountOffset());
+        EXPECT_ANY_THROW(hwTimestampNode.setPacketsUsed(0));
+        EXPECT_ANY_THROW(hwTimestampNode.getPacketsUsed());
+        EXPECT_EQ(0u, hwTimestampNode.getImplicitGpuDependenciesCount());
+        EXPECT_ANY_THROW(hwTimestampNode.getSinglePacketSize());
+        EXPECT_ANY_THROW(hwTimestampNode.assignDataToAllTimestamps(0, nullptr));
+        EXPECT_TRUE(hwTimestampNode.isCompleted());
+        EXPECT_ANY_THROW(hwTimestampNode.getQueryHandleRef());
+    }
+
+    {
+        TagNode<TimestampPackets<uint32_t>> timestampPacketsNode = {};
+        // TimestampPackets<uint32_t>: only the three reference getters below are checked, and each is expected to throw.
+        EXPECT_ANY_THROW(timestampPacketsNode.getContextCompleteRef());
+        EXPECT_ANY_THROW(timestampPacketsNode.getGlobalEndRef());
+        EXPECT_ANY_THROW(timestampPacketsNode.getQueryHandleRef());
+    }
+}
diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp
index 4d4d0da9f3..313eb49f23 100644
--- a/shared/source/command_stream/command_stream_receiver.cpp
+++ b/shared/source/command_stream/command_stream_receiver.cpp
@@ -606,15 +606,15 @@ bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surfa
     return true;
 }
 
-TagAllocator<HwTimeStamps> *CommandStreamReceiver::getEventTsAllocator() {
+TagAllocatorBase *CommandStreamReceiver::getEventTsAllocator() {
     if (profilingTimeStampAllocator.get() == nullptr) {
-        profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(
-            rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false, osContext->getDeviceBitfield());
+        profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize,
+                                                                                   sizeof(HwTimeStamps), false, osContext->getDeviceBitfield());
     }
     return profilingTimeStampAllocator.get();
 }
 
-TagAllocator<HwPerfCounter> *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) {
+TagAllocatorBase *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) {
     if (perfCounterAllocator.get() == nullptr) {
         perfCounterAllocator = std::make_unique<TagAllocator<HwPerfCounter>>(
             rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, tagSize, false, osContext->getDeviceBitfield());
@@ -622,15 +622,15 @@ TagAllocator<HwPerfCounter> *CommandStreamReceiver::getEventPerfCountAllocator(c
     return perfCounterAllocator.get();
 }
 
-TagAllocator<TimestampPacketStorage> *CommandStreamReceiver::getTimestampPacketAllocator() {
+TagAllocatorBase *CommandStreamReceiver::getTimestampPacketAllocator() {
     if (timestampPacketAllocator.get() == nullptr) {
         // dont release nodes in aub/tbx mode, to avoid removing semaphores optimization or reusing returned tags
         bool doNotReleaseNodes = (getType() > CommandStreamReceiverType::CSR_HW) ||
                                  DebugManager.flags.DisableTimestampPacketOptimizations.get();
 
-        timestampPacketAllocator = std::make_unique<TagAllocator<TimestampPacketStorage>>(
+        timestampPacketAllocator = std::make_unique<TagAllocator<NEO::TimestampPackets<uint32_t>>>(
             rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize * 4,
-            sizeof(TimestampPacketStorage), doNotReleaseNodes, osContext->getDeviceBitfield());
+            sizeof(NEO::TimestampPackets<uint32_t>), doNotReleaseNodes, osContext->getDeviceBitfield());
     }
     return timestampPacketAllocator.get();
 }
diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h
index b4c9c0ffa7..61156eb16d 100644
--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@@ -43,8 +43,9 @@ class MultiGraphicsAllocation;
 class OsContext;
 class OSInterface;
 class ScratchSpaceController;
-struct HwPerfCounter;
-struct HwTimeStamps;
+class HwPerfCounter;
+class HwTimeStamps;
+class TagAllocatorBase;
 
 template <typename TSize>
 class TimestampPackets;
@@ -192,9 +193,9 @@ class CommandStreamReceiver {
     virtual void setupContext(OsContext &osContext) { this->osContext = &osContext; }
     OsContext &getOsContext() const { return *osContext; }
 
-    TagAllocator<HwTimeStamps> *getEventTsAllocator();
-    TagAllocator<HwPerfCounter> *getEventPerfCountAllocator(const uint32_t tagSize);
-    TagAllocator<TimestampPacketStorage> *getTimestampPacketAllocator();
+    TagAllocatorBase *getEventTsAllocator();
+    TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize);
+    TagAllocatorBase *getTimestampPacketAllocator();
 
     virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation);
 
@@ -261,9 +262,9 @@ class CommandStreamReceiver {
     std::unique_ptr<InternalAllocationStorage> internalAllocationStorage;
     std::unique_ptr<KmdNotifyHelper> kmdNotifyHelper;
     std::unique_ptr<ScratchSpaceController> scratchSpaceController;
-    std::unique_ptr<TagAllocator<HwTimeStamps>> profilingTimeStampAllocator;
-    std::unique_ptr<TagAllocator<HwPerfCounter>> perfCounterAllocator;
-    std::unique_ptr<TagAllocator<TimestampPacketStorage>> timestampPacketAllocator;
+    std::unique_ptr<TagAllocatorBase> profilingTimeStampAllocator;
+    std::unique_ptr<TagAllocatorBase> perfCounterAllocator;
+    std::unique_ptr<TagAllocatorBase> timestampPacketAllocator;
     std::unique_ptr<Thread> userPauseConfirmation;
 
     ResidencyContainer residencyAllocations;
diff --git a/shared/source/helpers/blit_commands_helper.h b/shared/source/helpers/blit_commands_helper.h
index 617e21ec6b..515cbb1968 100644
--- a/shared/source/helpers/blit_commands_helper.h
+++ b/shared/source/helpers/blit_commands_helper.h
@@ -26,16 +26,17 @@ class LinearStream;
 struct RootDeviceEnvironment;
 
 template <typename TagType>
-struct TagNode;
+class TagNode;
 
 template <typename TSize>
 class TimestampPackets;
 
+class TagNodeBase;
+
 struct BlitProperties;
 struct HardwareInfo;
 struct TimestampPacketDependencies;
 using BlitPropertiesContainer = StackVec<BlitProperties, 16>;
-using TimestampPacketStorage = TimestampPackets<uint32_t>;
 
 struct BlitProperties {
     static BlitProperties constructPropertiesForReadWriteBuffer(BlitterConstants::BlitDirection blitDirection,
@@ -60,7 +61,7 @@ struct BlitProperties {
                                                    TimestampPacketContainer &kernelTimestamps, const CsrDependencies &depsFromEvents,
                                                    CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr);
 
-    TagNode<TimestampPacketStorage> *outputTimestampPacket = nullptr;
+    TagNodeBase *outputTimestampPacket = nullptr;
     BlitterConstants::BlitDirection blitDirection;
     CsrDependencies csrDependencies;
     AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;
diff --git a/shared/source/helpers/common_types.h b/shared/source/helpers/common_types.h
index da0922f7dd..eee4d5b664 100644
--- a/shared/source/helpers/common_types.h
+++ b/shared/source/helpers/common_types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,4 +23,13 @@ enum class DebugPauseState : uint32_t {
     hasUserEndConfirmation,
     terminate
 };
+
+class TagTypeBase { // empty common base; TimestampPackets<TSize> derives from it
+};
+
+enum class TagNodeType { // kind of tag a tag type reports through its static getTagNodeType()
+    TimestampPacket, // returned by TimestampPackets<TSize>::getTagNodeType()
+    HwTimeStamps, // presumably reported by the HwTimeStamps tag type - confirm against its definition
+    HwPerfCounter // presumably reported by the HwPerfCounter tag type - confirm against its definition
+};
 } // namespace NEO
diff --git a/shared/source/helpers/timestamp_packet.cpp b/shared/source/helpers/timestamp_packet.cpp
index d51ba29355..abd12b8977 100644
--- a/shared/source/helpers/timestamp_packet.cpp
+++ b/shared/source/helpers/timestamp_packet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -13,7 +13,7 @@
 
 using namespace NEO;
 
-void TimestampPacketContainer::add(Node *timestampPacketNode) {
+void TimestampPacketContainer::add(TagNodeBase *timestampPacketNode) {
     timestampPacketNodes.push_back(timestampPacketNode);
 }
 
@@ -28,7 +28,7 @@ void TimestampPacketContainer::swapNodes(TimestampPacketContainer &timestampPack
 }
 
 void TimestampPacketContainer::resolveDependencies(bool clearAllDependencies) {
-    std::vector<Node *> pendingNodes;
+    std::vector<TagNodeBase *> pendingNodes;
 
     for (auto node : timestampPacketNodes) {
         if (node->canBeReleased() || clearAllDependencies) {
diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h
index 724af11825..52cf590c13 100644
--- a/shared/source/helpers/timestamp_packet.h
+++ b/shared/source/helpers/timestamp_packet.h
@@ -31,7 +31,7 @@ constexpr uint32_t preferredPacketCount = 16u;
 
 #pragma pack(1)
 template <typename TSize>
-class TimestampPackets {
+class TimestampPackets : public TagTypeBase {
   public:
     struct Packet {
         TSize contextStart = 1u;
@@ -40,10 +40,14 @@ class TimestampPackets {
         TSize globalEnd = 1u;
     };
 
-    static GraphicsAllocation::AllocationType getAllocationType() {
+    static constexpr GraphicsAllocation::AllocationType getAllocationType() {
         return GraphicsAllocation::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER;
     }
 
+    static constexpr TagNodeType getTagNodeType() { return TagNodeType::TimestampPacket; } // compile-time identifier of this tag type
+
+    size_t getSinglePacketSize() const { return sizeof(Packet); } // size in bytes of one Packet entry
+
     bool isCompleted() const {
         if (DebugManager.flags.DisableAtomicForPostSyncs.get()) {
             return false;
@@ -96,29 +100,25 @@ class TimestampPackets {
 };
 #pragma pack()
 
-using TimestampPacketStorage = TimestampPackets<uint32_t>;
-
-static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPacketStorage),
+static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPackets<uint32_t>),
               "This structure is consumed by GPU and has to follow specific restrictions for padding and size");
 
 class TimestampPacketContainer : public NonCopyableClass {
   public:
-    using Node = TagNode<TimestampPacketStorage>;
-
     TimestampPacketContainer() = default;
     TimestampPacketContainer(TimestampPacketContainer &&) = default;
     TimestampPacketContainer &operator=(TimestampPacketContainer &&) = default;
     MOCKABLE_VIRTUAL ~TimestampPacketContainer();
 
-    const std::vector<Node *> &peekNodes() const { return timestampPacketNodes; }
-    void add(Node *timestampPacketNode);
+    const std::vector<TagNodeBase *> &peekNodes() const { return timestampPacketNodes; }
+    void add(TagNodeBase *timestampPacketNode);
     void swapNodes(TimestampPacketContainer &timestampPacketContainer);
     void assignAndIncrementNodesRefCounts(const TimestampPacketContainer &inputTimestampPacketContainer);
     void resolveDependencies(bool clearAllDependencies);
     void makeResident(CommandStreamReceiver &commandStreamReceiver);
 
   protected:
-    std::vector<Node *> timestampPacketNodes;
+    std::vector<TagNodeBase *> timestampPacketNodes;
 };
 
 struct TimestampPacketDependencies : public NonCopyableClass {
@@ -130,27 +130,27 @@ struct TimestampPacketDependencies : public NonCopyableClass {
 };
 
 struct TimestampPacketHelper {
-    static uint64_t getContextEndGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
-        return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getContextEndOffset();
+    static uint64_t getContextEndGpuAddress(const TagNodeBase &timestampPacketNode) {
+        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getContextEndOffset();
     }
-    static uint64_t getContextStartGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
-        return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getContextStartOffset();
+    static uint64_t getContextStartGpuAddress(const TagNodeBase &timestampPacketNode) {
+        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getContextStartOffset();
     }
-    static uint64_t getGlobalEndGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
-        return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getGlobalEndOffset();
+    static uint64_t getGlobalEndGpuAddress(const TagNodeBase &timestampPacketNode) {
+        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalEndOffset();
     }
-    static uint64_t getGlobalStartGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
-        return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getGlobalStartOffset();
+    static uint64_t getGlobalStartGpuAddress(const TagNodeBase &timestampPacketNode) {
+        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalStartOffset();
     }
 
-    static uint64_t getGpuDependenciesCountGpuAddress(const TagNode<TimestampPacketStorage> &timestampPacketNode) {
-        return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getImplicitGpuDependenciesCountOffset();
+    static uint64_t getGpuDependenciesCountGpuAddress(const TagNodeBase &timestampPacketNode) {
+        return timestampPacketNode.getGpuAddress() + timestampPacketNode.getImplicitGpuDependenciesCountOffset();
     }
 
     static void overrideSupportedDevicesCount(uint32_t &numSupportedDevices);
 
     template <typename GfxFamily>
-    static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNode<TimestampPacketStorage> &timestampPacketNode, uint32_t numSupportedDevices) {
+    static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNodeBase &timestampPacketNode, uint32_t numSupportedDevices) {
         using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
         using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
         using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
@@ -158,8 +158,8 @@ struct TimestampPacketHelper {
         auto compareAddress = getContextEndGpuAddress(timestampPacketNode);
         auto dependenciesCountAddress = getGpuDependenciesCountGpuAddress(timestampPacketNode);
 
-        for (uint32_t packetId = 0; packetId < timestampPacketNode.tagForCpuAccess->getPacketsUsed(); packetId++) {
-            uint64_t compareOffset = packetId * sizeof(TimestampPacketStorage::Packet);
+        for (uint32_t packetId = 0; packetId < timestampPacketNode.getPacketsUsed(); packetId++) {
+            uint64_t compareOffset = packetId * timestampPacketNode.getSinglePacketSize();
             EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream, compareAddress + compareOffset, 1, COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
         }
 
@@ -231,8 +231,8 @@ struct TimestampPacketHelper {
     }
 
     template <typename GfxFamily>
-    static size_t getRequiredCmdStreamSizeForNodeDependency(TagNode<TimestampPacketStorage> &timestampPacketNode) {
-        size_t totalMiSemaphoreWaitSize = timestampPacketNode.tagForCpuAccess->getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
+    static size_t getRequiredCmdStreamSizeForNodeDependency(TagNodeBase &timestampPacketNode) {
+        size_t totalMiSemaphoreWaitSize = timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
 
         return totalMiSemaphoreWaitSize + sizeof(typename GfxFamily::MI_ATOMIC);
     }
diff --git a/shared/source/utilities/CMakeLists.txt b/shared/source/utilities/CMakeLists.txt
index 148546fdd1..16a2b5a2d5 100644
--- a/shared/source/utilities/CMakeLists.txt
+++ b/shared/source/utilities/CMakeLists.txt
@@ -34,7 +34,9 @@ set(NEO_CORE_UTILITIES
     ${CMAKE_CURRENT_SOURCE_DIR}/software_tags_manager.h
     ${CMAKE_CURRENT_SOURCE_DIR}/spinlock.h
     ${CMAKE_CURRENT_SOURCE_DIR}/stackvec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/time_measure_wrapper.h
     ${CMAKE_CURRENT_SOURCE_DIR}/timer_util.h
 )
diff --git a/shared/source/utilities/tag_allocator.cpp b/shared/source/utilities/tag_allocator.cpp
new file mode 100644
index 0000000000..106470c3f7
--- /dev/null
+++ b/shared/source/utilities/tag_allocator.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/tag_allocator.h"
+
+namespace NEO {
+
+TagAllocatorBase::TagAllocatorBase(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, DeviceBitfield deviceBitfield)
+    : deviceBitfield(deviceBitfield), rootDeviceIndex(rootDeviceIndex), memoryManager(memMngr), tagCount(tagCount), tagSize(tagSize), doNotReleaseNodes(doNotReleaseNodes) {
+    // The initializer list stores the raw tagSize; the assignment below replaces it with alignUp(tagSize, tagAlignment).
+    this->tagSize = alignUp(tagSize, tagAlignment);
+}
+
+void TagAllocatorBase::cleanUpResources() {
+    for (auto gfxAllocation : gfxAllocations) { // release every graphics allocation this allocator tracks
+        memoryManager->freeGraphicsMemory(gfxAllocation);
+    }
+    gfxAllocations.clear(); // forget the freed pointers so a repeated call has nothing left to free
+}
+
+void TagNodeBase::returnTag() { // hands this node back to the allocator stored in the node
+    allocator->returnTag(this); // allocator is dereferenced unconditionally, so it must have been set
+}
+
+bool TagNodeBase::canBeReleased() const { // true only when all three conditions below hold (evaluated left to right, short-circuiting)
+    return (!doNotReleaseNodes) && // releasing has not been disabled for this node
+           (isCompleted()) && // tag-type-specific completion check
+           (getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount()); // GPU-side and CPU-side implicit dependency counts match
+}
+
+} // namespace NEO
diff --git a/shared/source/utilities/tag_allocator.h b/shared/source/utilities/tag_allocator.h
index 8275ad7386..a9b804f339 100644
--- a/shared/source/utilities/tag_allocator.h
+++ b/shared/source/utilities/tag_allocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,6 +14,7 @@
 #include <atomic>
 #include <cstdint>
 #include <mutex>
+#include <type_traits>
 #include <vector>
 
 namespace NEO {
@@ -23,28 +24,31 @@ template <typename TagType>
 class TagAllocator;
 
 template <typename TagType>
-struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
+class TagNode;
+
+class TagAllocatorBase;
+
+class TagNodeBase : public NonCopyableOrMovableClass {
   public:
-    TagType *tagForCpuAccess;
+    virtual ~TagNodeBase() = default;
 
     GraphicsAllocation *getBaseGraphicsAllocation() const { return gfxAllocation; }
+
     uint64_t getGpuAddress() const { return gpuAddress; }
 
     void incRefCount() { refCount++; }
 
-    MOCKABLE_VIRTUAL void returnTag() {
-        allocator->returnTag(this);
-    }
+    uint32_t refCountFetchSub(uint32_t value) { return refCount.fetch_sub(value); }
 
-    bool canBeReleased() const {
-        return (!doNotReleaseNodes) &&
-               (tagForCpuAccess->isCompleted()) &&
-               (tagForCpuAccess->getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount());
-    }
+    MOCKABLE_VIRTUAL void returnTag();
 
-    void setDoNotReleaseNodes(bool doNotRelease) {
-        doNotReleaseNodes = doNotRelease;
-    }
+    virtual void initialize() = 0;
+
+    bool canBeReleased() const;
+
+    virtual void *getCpuBase() const = 0;
+
+    void setDoNotReleaseNodes(bool doNotRelease) { doNotReleaseNodes = doNotRelease; }
 
     void setProfilingCapable(bool capable) { profilingCapable = capable; }
 
@@ -52,18 +56,42 @@ struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
 
     void incImplicitCpuDependenciesCount() { implicitCpuDependenciesCount++; }
 
-    void initialize() {
-        tagForCpuAccess->initialize();
-        implicitCpuDependenciesCount.store(0);
-        setProfilingCapable(true);
-    }
-
     uint32_t getImplicitCpuDependenciesCount() const { return implicitCpuDependenciesCount.load(); }
 
-    const TagAllocator<TagType> *getAllocator() const { return allocator; }
+    const TagAllocatorBase *getAllocator() const { return allocator; }
+
+    // TagType specific calls
+    virtual bool isCompleted() const = 0;
+    virtual void assignDataToAllTimestamps(uint32_t packetIndex, void *source) = 0;
+
+    virtual size_t getGlobalStartOffset() const = 0;
+    virtual size_t getContextStartOffset() const = 0;
+    virtual size_t getContextEndOffset() const = 0;
+    virtual size_t getGlobalEndOffset() const = 0;
+    virtual size_t getImplicitGpuDependenciesCountOffset() const = 0;
+
+    virtual uint64_t getContextStartValue(uint32_t packetIndex) const = 0;
+    virtual uint64_t getGlobalStartValue(uint32_t packetIndex) const = 0;
+    virtual uint64_t getContextEndValue(uint32_t packetIndex) const = 0;
+    virtual uint64_t getGlobalEndValue(uint32_t packetIndex) const = 0;
+
+    virtual uint64_t &getGlobalEndRef() const = 0;
+    virtual uint64_t &getContextCompleteRef() const = 0;
+
+    virtual void setPacketsUsed(uint32_t used) = 0;
+    virtual uint32_t getPacketsUsed() const = 0;
+
+    virtual size_t getSinglePacketSize() const = 0;
+
+    virtual uint32_t getImplicitGpuDependenciesCount() const = 0;
+
+    virtual MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const = 0;
 
   protected:
-    TagAllocator<TagType> *allocator = nullptr;
+    TagNodeBase() = default;
+
+    TagAllocatorBase *allocator = nullptr;
+
     GraphicsAllocation *gfxAllocation = nullptr;
     uint64_t gpuAddress = 0;
     std::atomic<uint32_t> refCount{0};
@@ -71,71 +99,78 @@ struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
     bool doNotReleaseNodes = false;
     bool profilingCapable = true;
 
-    template <typename TagType2>
+    template <typename TagType>
     friend class TagAllocator;
 };
 
 template <typename TagType>
-class TagAllocator {
+class TagNode : public TagNodeBase, public IDNode<TagNode<TagType>> {
+    static_assert(!std::is_polymorphic<TagType>::value,
+                  "This structure is consumed by GPU and has to follow specific restrictions for padding and size");
+
   public:
-    using NodeType = TagNode<TagType>;
+    TagType *tagForCpuAccess;
 
-    TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
-                 size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes,
-                 DeviceBitfield deviceBitfield) : deviceBitfield(deviceBitfield),
-                                                  rootDeviceIndex(rootDeviceIndex),
-                                                  memoryManager(memMngr),
-                                                  tagCount(tagCount),
-                                                  doNotReleaseNodes(doNotReleaseNodes) {
-
-        this->tagSize = alignUp(tagSize, tagAlignment);
-        populateFreeTags();
+    void initialize() override {
+        tagForCpuAccess->initialize();
+        implicitCpuDependenciesCount.store(0);
+        setProfilingCapable(true);
     }
 
-    MOCKABLE_VIRTUAL ~TagAllocator() {
-        cleanUpResources();
-    }
+    void *getCpuBase() const override { return tagForCpuAccess; }
 
-    void cleanUpResources() {
-        for (auto gfxAllocation : gfxAllocations) {
-            memoryManager->freeGraphicsMemory(gfxAllocation);
-        }
-        gfxAllocations.clear();
-    }
+    void assignDataToAllTimestamps(uint32_t packetIndex, void *source) override;
 
-    NodeType *getTag() {
-        if (freeTags.peekIsEmpty()) {
-            releaseDeferredTags();
-        }
-        NodeType *node = freeTags.removeFrontOne().release();
-        if (!node) {
-            std::unique_lock<std::mutex> lock(allocatorMutex);
-            populateFreeTags();
-            node = freeTags.removeFrontOne().release();
-        }
-        usedTags.pushFrontOne(*node);
-        node->incRefCount();
-        node->initialize();
-        return node;
-    }
+    bool isCompleted() const override;
 
-    MOCKABLE_VIRTUAL void returnTag(NodeType *node) {
-        if (node->refCount.fetch_sub(1) == 1) {
-            if (node->canBeReleased()) {
-                returnTagToFreePool(node);
-            } else {
-                returnTagToDeferredPool(node);
-            }
-        }
-    }
+    size_t getGlobalStartOffset() const override;
+    size_t getContextStartOffset() const override;
+    size_t getContextEndOffset() const override;
+    size_t getGlobalEndOffset() const override;
+    size_t getImplicitGpuDependenciesCountOffset() const override;
+
+    uint64_t getContextStartValue(uint32_t packetIndex) const override;
+    uint64_t getGlobalStartValue(uint32_t packetIndex) const override;
+    uint64_t getContextEndValue(uint32_t packetIndex) const override;
+    uint64_t getGlobalEndValue(uint32_t packetIndex) const override;
+
+    uint64_t &getGlobalEndRef() const override;
+    uint64_t &getContextCompleteRef() const override;
+
+    void setPacketsUsed(uint32_t used) override;
+    uint32_t getPacketsUsed() const override;
+
+    size_t getSinglePacketSize() const override;
+
+    uint32_t getImplicitGpuDependenciesCount() const override;
+
+    MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const override;
+};
+
+class TagAllocatorBase {
+  public:
+    virtual ~TagAllocatorBase() { cleanUpResources(); } // frees the graphics allocations still tracked by this allocator
+
+    virtual void returnTag(TagNodeBase *node) = 0;
+
+    virtual TagNodeBase *getTag() = 0;
 
   protected:
-    IDList<NodeType> freeTags;
-    IDList<NodeType> usedTags;
-    IDList<NodeType> deferredTags;
-    std::vector<GraphicsAllocation *> gfxAllocations;
-    std::vector<std::unique_ptr<NodeType[]>> tagPoolMemory;
+    TagAllocatorBase() = delete;
 
+    TagAllocatorBase(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
+                     size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes,
+                     DeviceBitfield deviceBitfield);
+
+    virtual void returnTagToFreePool(TagNodeBase *node) = 0;
+
+    virtual void returnTagToDeferredPool(TagNodeBase *node) = 0;
+
+    virtual void releaseDeferredTags() = 0;
+
+    void cleanUpResources();
+
+    std::vector<GraphicsAllocation *> gfxAllocations;
     const DeviceBitfield deviceBitfield;
     const uint32_t rootDeviceIndex;
     MemoryManager *memoryManager;
@@ -144,66 +179,38 @@ class TagAllocator {
     bool doNotReleaseNodes = false;
 
     std::mutex allocatorMutex;
+};
 
-    MOCKABLE_VIRTUAL void returnTagToFreePool(NodeType *node) {
-        NodeType *usedNode = usedTags.removeOne(*node).release();
-        DEBUG_BREAK_IF(usedNode == nullptr);
-        UNUSED_VARIABLE(usedNode);
-        freeTags.pushFrontOne(*node);
-    }
+template <typename TagType>
+class TagAllocator : public TagAllocatorBase { // typed allocator: implements the node-list handling left pure virtual in TagAllocatorBase
+  public:
+    using NodeType = TagNode<TagType>; // concrete node type kept in the lists below
 
-    void returnTagToDeferredPool(NodeType *node) {
-        NodeType *usedNode = usedTags.removeOne(*node).release();
-        DEBUG_BREAK_IF(!usedNode);
-        deferredTags.pushFrontOne(*usedNode);
-    }
+    TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, // definition forwards all arguments to TagAllocatorBase
+                 size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes,
+                 DeviceBitfield deviceBitfield); // and then calls populateFreeTags()
 
-    void populateFreeTags() {
-        size_t allocationSizeRequired = tagCount * tagSize;
+    TagNodeBase *getTag() override; // takes a node from freeTags (adding a pool when none is left), puts it on usedTags, bumps its ref count and initializes it
 
-        auto allocationType = TagType::getAllocationType();
-        AllocationProperties allocationProperties{rootDeviceIndex, allocationSizeRequired, allocationType, deviceBitfield};
-        GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties);
-        gfxAllocations.push_back(graphicsAllocation);
+    void returnTag(TagNodeBase *node) override; // when refCountFetchSub(1) yields 1 the node goes to the free or the deferred pool
 
-        auto nodesMemory = std::make_unique<NodeType[]>(tagCount);
+  protected:
+    TagAllocator() = delete; // construction always goes through the parameterized constructor
 
-        for (size_t i = 0; i < tagCount; ++i) {
-            auto tagOffset = i * tagSize;
+    void returnTagToFreePool(TagNodeBase *node) override; // usedTags -> freeTags
 
-            nodesMemory[i].allocator = this;
-            nodesMemory[i].gfxAllocation = graphicsAllocation;
-            nodesMemory[i].tagForCpuAccess = reinterpret_cast<TagType *>(ptrOffset(graphicsAllocation->getUnderlyingBuffer(), tagOffset));
-            nodesMemory[i].gpuAddress = graphicsAllocation->getGpuAddress() + tagOffset;
-            nodesMemory[i].setDoNotReleaseNodes(doNotReleaseNodes);
+    void returnTagToDeferredPool(TagNodeBase *node) override; // usedTags -> deferredTags
 
-            freeTags.pushTailOne(nodesMemory[i]);
-        }
+    void releaseDeferredTags() override; // moves deferred nodes whose canBeReleased() is true over to freeTags
 
-        tagPoolMemory.push_back(std::move(nodesMemory));
-    }
+    void populateFreeTags(); // creates one more pool of tagCount nodes and appends them to freeTags
 
-    void releaseDeferredTags() {
-        IDList<NodeType, false> pendingFreeTags;
-        IDList<NodeType, false> pendingDeferredTags;
-        auto currentNode = deferredTags.detachNodes();
+    IDList<NodeType> freeTags;     // nodes ready to be handed out by getTag()
+    IDList<NodeType> usedTags;     // nodes currently handed out
+    IDList<NodeType> deferredTags; // returned nodes for which canBeReleased() was false
 
-        while (currentNode != nullptr) {
-            auto nextNode = currentNode->next;
-            if (currentNode->canBeReleased()) {
-                pendingFreeTags.pushFrontOne(*currentNode);
-            } else {
-                pendingDeferredTags.pushFrontOne(*currentNode);
-            }
-            currentNode = nextNode;
-        }
-
-        if (!pendingFreeTags.peekIsEmpty()) {
-            freeTags.splice(*pendingFreeTags.detachNodes());
-        }
-        if (!pendingDeferredTags.peekIsEmpty()) {
-            deferredTags.splice(*pendingDeferredTags.detachNodes());
-        }
-    }
+    std::vector<std::unique_ptr<NodeType[]>> tagPoolMemory; // owns the node arrays created by populateFreeTags()
 };
 } // namespace NEO
+
+#include "shared/source/utilities/tag_allocator.inl"
diff --git a/shared/source/utilities/tag_allocator.inl b/shared/source/utilities/tag_allocator.inl
new file mode 100644
index 0000000000..d81371f0f8
--- /dev/null
+++ b/shared/source/utilities/tag_allocator.inl
@@ -0,0 +1,282 @@
+/*
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/utilities/tag_allocator.h"
+
+namespace NEO {
+template <typename TagType>
+TagAllocator<TagType>::TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount,
+                                    size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, DeviceBitfield deviceBitfield)
+    : TagAllocatorBase(rootDeviceIndex, memMngr, tagCount, tagAlignment, tagSize, doNotReleaseNodes, deviceBitfield) {
+    // All arguments go to the base constructor; the free list then receives its first pool of tags.
+    this->populateFreeTags();
+}
+
+template <typename TagType>
+TagNodeBase *TagAllocator<TagType>::getTag() {
+    if (freeTags.peekIsEmpty()) {
+        releaseDeferredTags(); // free list is empty: first try to get deferred tags back
+    }
+    auto freshTag = freeTags.removeFrontOne().release();
+    if (freshTag == nullptr) {
+        std::lock_guard<std::mutex> growGuard(allocatorMutex); // adding a new pool happens under allocatorMutex
+        populateFreeTags();
+        freshTag = freeTags.removeFrontOne().release();
+    }
+    usedTags.pushFrontOne(*freshTag);
+    freshTag->incRefCount();
+    freshTag->initialize();
+    return freshTag;
+}
+
+template <typename TagType>
+void TagAllocator<TagType>::returnTagToFreePool(TagNodeBase *node) {
+    NodeType *typedNode = static_cast<NodeType *>(node); // assumes the node was produced by this allocator
+    auto detached = usedTags.removeOne(*typedNode).release();
+    DEBUG_BREAK_IF(!detached);
+    UNUSED_VARIABLE(detached);
+    freeTags.pushFrontOne(*typedNode); // pushed to the free list whatever removeOne() returned
+}
+
+template <typename TagType>
+void TagAllocator<TagType>::returnTagToDeferredPool(TagNodeBase *node) {
+    // Take the node off the used list and park it on the deferred list until releaseDeferredTags() revisits it.
+    auto detached = usedTags.removeOne(*static_cast<NodeType *>(node)).release();
+    DEBUG_BREAK_IF(detached == nullptr);
+    deferredTags.pushFrontOne(*detached);
+}
+
+template <typename TagType>
+void TagAllocator<TagType>::releaseDeferredTags() {
+    IDList<NodeType, false> releasable;
+    IDList<NodeType, false> stillDeferred;
+
+    // Detach the whole deferred list and sort each node into one of the two local lists.
+    for (auto cursor = deferredTags.detachNodes(); cursor != nullptr;) {
+        auto following = cursor->next; // saved before the node is pushed onto another list
+        if (cursor->canBeReleased()) {
+            releasable.pushFrontOne(*cursor);
+        } else {
+            stillDeferred.pushFrontOne(*cursor);
+        }
+        cursor = following;
+    }
+
+    if (!releasable.peekIsEmpty()) {
+        freeTags.splice(*releasable.detachNodes());
+    }
+    if (!stillDeferred.peekIsEmpty()) {
+        deferredTags.splice(*stillDeferred.detachNodes());
+    }
+}
+
+template <typename TagType>
+void TagAllocator<TagType>::populateFreeTags() {
+    // One graphics allocation backs tagCount tags laid out tagSize bytes apart.
+    size_t poolBytes = tagCount * tagSize;
+    AllocationProperties poolProperties{rootDeviceIndex, poolBytes, TagType::getAllocationType(), deviceBitfield};
+    GraphicsAllocation *poolAllocation = memoryManager->allocateGraphicsMemoryWithProperties(poolProperties);
+    gfxAllocations.push_back(poolAllocation);
+
+    auto poolNodes = std::make_unique<NodeType[]>(tagCount);
+
+    for (size_t index = 0; index < tagCount; ++index) {
+        NodeType &tagNode = poolNodes[index];
+        auto byteOffset = index * tagSize;
+        tagNode.allocator = this;
+        tagNode.gfxAllocation = poolAllocation;
+        tagNode.tagForCpuAccess = reinterpret_cast<TagType *>(ptrOffset(poolAllocation->getUnderlyingBuffer(), byteOffset));
+        tagNode.gpuAddress = poolAllocation->getGpuAddress() + byteOffset;
+        tagNode.setDoNotReleaseNodes(doNotReleaseNodes);
+
+        freeTags.pushTailOne(tagNode);
+    }
+
+    tagPoolMemory.push_back(std::move(poolNodes)); // ownership of the node array moves into tagPoolMemory
+}
+
+template <typename TagType>
+void TagAllocator<TagType>::returnTag(TagNodeBase *node) {
+    // Only the call for which refCountFetchSub(1) yields 1 recycles the node.
+    const bool recycle = node->refCountFetchSub(1) == 1;
+    if (recycle && node->canBeReleased()) {
+        returnTagToFreePool(node);
+    } else if (recycle) {
+        returnTagToDeferredPool(node);
+    }
+}
+
+template <typename TagType>
+size_t TagNode<TagType>::getGlobalStartOffset() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getGlobalStartOffset();
+    }
+}
+
+template <typename TagType>
+size_t TagNode<TagType>::getContextStartOffset() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getContextStartOffset();
+    }
+}
+
+template <typename TagType>
+size_t TagNode<TagType>::getContextEndOffset() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getContextEndOffset();
+    }
+}
+
+template <typename TagType>
+size_t TagNode<TagType>::getGlobalEndOffset() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getGlobalEndOffset();
+    }
+}
+
+template <typename TagType>
+size_t TagNode<TagType>::getImplicitGpuDependenciesCountOffset() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getImplicitGpuDependenciesCountOffset();
+    }
+}
+
+template <typename TagType>
+uint64_t TagNode<TagType>::getContextStartValue(uint32_t packetIndex) const {
+    if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) {
+        UNUSED_VARIABLE(packetIndex);
+        UNRECOVERABLE_IF(true); // this query is rejected for HwPerfCounter tags
+    } else {
+        return tagForCpuAccess->getContextStartValue(packetIndex);
+    }
+}
+
+template <typename TagType>
+uint64_t TagNode<TagType>::getGlobalStartValue(uint32_t packetIndex) const {
+    if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) {
+        UNUSED_VARIABLE(packetIndex);
+        UNRECOVERABLE_IF(true); // this query is rejected for HwPerfCounter tags
+    } else {
+        return tagForCpuAccess->getGlobalStartValue(packetIndex);
+    }
+}
+
+template <typename TagType>
+uint64_t TagNode<TagType>::getContextEndValue(uint32_t packetIndex) const {
+    if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) {
+        UNUSED_VARIABLE(packetIndex);
+        UNRECOVERABLE_IF(true); // this query is rejected for HwPerfCounter tags
+    } else {
+        return tagForCpuAccess->getContextEndValue(packetIndex);
+    }
+}
+
+template <typename TagType>
+uint64_t TagNode<TagType>::getGlobalEndValue(uint32_t packetIndex) const {
+    if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) {
+        UNUSED_VARIABLE(packetIndex);
+        UNRECOVERABLE_IF(true); // this query is rejected for HwPerfCounter tags
+    } else {
+        return tagForCpuAccess->getGlobalEndValue(packetIndex);
+    }
+}
+
+template <typename TagType>
+uint64_t &TagNode<TagType>::getContextCompleteRef() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::HwTimeStamps) {
+        UNRECOVERABLE_IF(true); // only HwTimeStamps tags are accepted here
+    } else {
+        return tagForCpuAccess->ContextCompleteTS;
+    }
+}
+
+template <typename TagType>
+uint64_t &TagNode<TagType>::getGlobalEndRef() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::HwTimeStamps) {
+        UNRECOVERABLE_IF(true); // only HwTimeStamps tags are accepted here
+    } else {
+        return tagForCpuAccess->GlobalEndTS;
+    }
+}
+
+template <typename TagType>
+void TagNode<TagType>::setPacketsUsed(uint32_t used) {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNUSED_VARIABLE(used);
+        UNRECOVERABLE_IF(true); // only TimestampPacket tags accept a packet count
+    } else {
+        tagForCpuAccess->setPacketsUsed(used);
+    }
+}
+
+template <typename TagType>
+uint32_t TagNode<TagType>::getPacketsUsed() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getPacketsUsed();
+    }
+}
+
+template <typename TagType>
+uint32_t TagNode<TagType>::getImplicitGpuDependenciesCount() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        return 0; // tags other than TimestampPacket report zero instead of failing
+    } else {
+        return tagForCpuAccess->getImplicitGpuDependenciesCount();
+    }
+}
+
+template <typename TagType>
+size_t TagNode<TagType>::getSinglePacketSize() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNRECOVERABLE_IF(true); // this query is accepted for TimestampPacket tags only
+    } else {
+        return tagForCpuAccess->getSinglePacketSize();
+    }
+}
+
+template <typename TagType>
+void TagNode<TagType>::assignDataToAllTimestamps(uint32_t packetIndex, void *source) {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        UNUSED_VARIABLE(packetIndex);
+        UNUSED_VARIABLE(source);
+        UNRECOVERABLE_IF(true); // only TimestampPacket tags accept this call
+    } else {
+        tagForCpuAccess->assignDataToAllTimestamps(packetIndex, source);
+    }
+}
+
+template <typename TagType>
+bool TagNode<TagType>::isCompleted() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::TimestampPacket) {
+        return true; // tags other than TimestampPacket are always reported as completed
+    } else {
+        return tagForCpuAccess->isCompleted();
+    }
+}
+
+template <typename TagType>
+MetricsLibraryApi::QueryHandle_1_0 &TagNode<TagType>::getQueryHandleRef() const {
+    if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) {
+        UNRECOVERABLE_IF(true); // only HwPerfCounter tags are accepted here
+    } else {
+        return tagForCpuAccess->query.handle;
+    }
+}
+
+} // namespace NEO