diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index bba5997e6b..017d6bebc1 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -230,7 +230,7 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand uint64_t baseAddr = event->getGpuAddress(); size_t eventOffset = 0; if (event->isTimestampEvent) { - eventOffset = offsetof(TimestampPacketStorage::Packet, contextEnd); + eventOffset = offsetof(NEO::TimestampPackets::Packet, contextEnd); event->resetPackets(); } commandContainer.addToResidencyContainer(&event->getAllocation()); @@ -1487,7 +1487,7 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han uint64_t baseAddr = event->getGpuAddress(); size_t eventSignalOffset = 0; if (event->isTimestampEvent) { - eventSignalOffset = offsetof(TimestampPacketStorage::Packet, contextEnd); + eventSignalOffset = offsetof(NEO::TimestampPackets::Packet, contextEnd); } if (isCopyOnly()) { @@ -1536,7 +1536,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu gpuAddr = event->getGpuAddress(); if (event->isTimestampEvent) { - gpuAddr += offsetof(TimestampPacketStorage::Packet, contextEnd); + gpuAddr += offsetof(NEO::TimestampPackets::Packet, contextEnd); } NEO::EncodeSempahore::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), gpuAddr, @@ -1577,8 +1577,8 @@ void CommandListCoreFamily::appendWriteKernelTimestamp(ze_event_h auto event = Event::fromHandle(hEvent); auto baseAddr = event->getGpuAddress(); - auto contextOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, contextStart) : offsetof(TimestampPacketStorage::Packet, contextEnd); - auto globalOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, globalStart) : offsetof(TimestampPacketStorage::Packet, globalEnd); + auto contextOffset = beforeWalker ? offsetof(NEO::TimestampPackets::Packet, contextStart) : offsetof(NEO::TimestampPackets::Packet, contextEnd); + auto globalOffset = beforeWalker ? offsetof(NEO::TimestampPackets::Packet, globalStart) : offsetof(NEO::TimestampPackets::Packet, globalEnd); if (maskLsb) { NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, REG_GLOBAL_TIMESTAMP_LDW, mask, ptrOffset(baseAddr, globalOffset)); diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp index 6f3271e547..3ca3ffe375 100644 --- a/level_zero/core/source/event/event.cpp +++ b/level_zero/core/source/event/event.cpp @@ -117,7 +117,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device * if (eventPool->isEventPoolUsedForTimestamp) { event->isTimestampEvent = true; - event->timestampsData = std::make_unique(); + event->timestampsData = std::make_unique>(); } auto alloc = eventPool->getAllocation().getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex()); @@ -140,7 +140,7 @@ NEO::GraphicsAllocation &Event::getAllocation() { } uint64_t Event::getTimestampPacketAddress() { - return gpuAddress + packetsInUse * sizeof(TimestampPacketStorage::Packet); + return gpuAddress + packetsInUse * sizeof(NEO::TimestampPackets::Packet); } ze_result_t EventImp::calculateProfilingData() { @@ -172,7 +172,7 @@ void EventImp::assignTimestampData(void *address) { for (uint32_t i = 0; i < packetsToCopy; i++) { timestampsData->assignDataToAllTimestamps(i, address); - address = ptrOffset(address, sizeof(struct TimestampPacketStorage::Packet)); + address = ptrOffset(address, sizeof(struct NEO::TimestampPackets::Packet)); } } @@ -190,7 +190,7 @@ ze_result_t EventImp::queryStatus() { this->csr->downloadAllocations(); if (isTimestampEvent) { auto baseAddr = reinterpret_cast(hostAddress); - auto timeStampAddress = baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd); + auto timeStampAddress = baseAddr + offsetof(NEO::TimestampPackets::Packet, contextEnd); hostAddr = reinterpret_cast(timeStampAddress); } memcpy_s(static_cast(&queryVal), sizeof(uint32_t), static_cast(hostAddr), sizeof(uint32_t)); @@ -212,11 +212,11 @@ ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) { }; for (uint32_t i = 0; i < NEO::TimestampPacketSizeControl::preferredPacketCount; i++) { - eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextStart)); - eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalStart)); - eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd)); - eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalEnd)); - baseAddr += sizeof(struct TimestampPacketStorage::Packet); + eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets::Packet, contextStart)); + eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets::Packet, globalStart)); + eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets::Packet, contextEnd)); + eventTsSetFunc(baseAddr + offsetof(NEO::TimestampPackets::Packet, globalEnd)); + baseAddr += sizeof(struct NEO::TimestampPackets::Packet); } assignTimestampData(hostAddress); diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index 0732176d45..7039a32595 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -22,7 +22,6 @@ namespace L0 { typedef uint64_t FlushStamp; struct EventPool; struct MetricStreamer; -using TimestampPacketStorage = NEO::TimestampPackets; struct Event : _ze_event_handle_t { virtual ~Event() = default; @@ -61,7 +60,7 @@ struct Event : _ze_event_handle_t { ze_event_scope_flags_t waitScope = 0u; bool isTimestampEvent = false; - std::unique_ptr timestampsData = nullptr; + std::unique_ptr> timestampsData = nullptr; uint64_t globalStartTS; uint64_t globalEndTS; uint64_t contextStartTS; @@ -158,7 +157,7 @@ struct EventPoolImp : public EventPool { size_t numEvents; protected: - const uint32_t eventSize = static_cast(alignUp(NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct TimestampPacketStorage::Packet), + const uint32_t eventSize = static_cast(alignUp(NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct NEO::TimestampPackets::Packet), MemoryConstants::cacheLineSize)); const uint32_t eventAlignment = MemoryConstants::cacheLineSize; }; diff --git a/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp b/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp index b681e4079e..49a21f4b39 100644 --- a/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp +++ b/level_zero/core/test/unit_tests/gen12lp/test_events_gen12lp.cpp @@ -42,7 +42,7 @@ struct TimestampEvent : public Test { }; GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCorrectDataAreSet) { - TimestampPacketStorage::Packet data = {}; + TimestampPackets::Packet data = {}; data.contextStart = 1u; data.contextEnd = 2u; data.globalStart = 3u; @@ -61,7 +61,7 @@ GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCo } GEN12LPTEST_F(TimestampEvent, givenEventMoreThanOneTimestampsPacketWhenQueryKernelTimestampThenCorrectCalculationAreMade) { - TimestampPacketStorage::Packet data[3] = {}; + TimestampPackets::Packet data[3] = {}; data[0].contextStart = 3u; data[0].contextEnd = 4u; data[0].globalStart = 5u; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index 97998190b7..847a65abdd 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -1041,8 +1041,8 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingBeforeCommandForCo auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); auto baseAddr = event->getGpuAddress(); - auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextStart); - auto globalOffset = offsetof(TimestampPacketStorage::Packet, globalStart); + auto contextOffset = offsetof(TimestampPackets::Packet, contextStart); + auto globalOffset = offsetof(TimestampPackets::Packet, globalStart); EXPECT_EQ(event->getTimestampPacketAddress(), baseAddr); commandList->appendEventForProfilingCopyCommand(event->toHandle(), true); @@ -1078,8 +1078,8 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingAfterCommandForCop commandList->appendEventForProfilingCopyCommand(event->toHandle(), false); - auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd); - auto globalOffset = offsetof(TimestampPacketStorage::Packet, globalEnd); + auto contextOffset = offsetof(TimestampPackets::Packet, contextEnd); + auto globalOffset = offsetof(TimestampPackets::Packet, globalEnd); auto baseAddr = event->getGpuAddress(); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp index 6b73c47bef..cf9c994775 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp @@ -138,7 +138,7 @@ HWTEST2_F(CommandListAppendEventReset, givenTimestampEventUsedInResetThenPipeCon commandList->appendEventReset(event->toHandle()); ASSERT_EQ(0u, event->getPacketsInUse()); - auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd); + auto contextOffset = offsetof(TimestampPackets::Packet, contextEnd); auto baseAddr = event->getGpuAddress(); auto gpuAddress = ptrOffset(baseAddr, contextOffset); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp index d6f99e6a4e..9aebbca51b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp @@ -201,7 +201,7 @@ HWTEST2_F(CommandListAppendSignalEvent, givenTimestampEventUsedInSignalThenPipeC auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); commandList->appendSignalEvent(event->toHandle()); - auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd); + auto contextOffset = offsetof(TimestampPackets::Packet, contextEnd); auto baseAddr = event->getGpuAddress(); auto gpuAddress = ptrOffset(baseAddr, contextOffset); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index a0f64cf247..b32673e920 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -205,10 +205,10 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyR GenCmdList cmdList; auto baseAddr = event->getGpuAddress(); - auto contextStartOffset = offsetof(TimestampPacketStorage::Packet, contextStart); - auto globalStartOffset = offsetof(TimestampPacketStorage::Packet, globalStart); - auto contextEndOffset = offsetof(TimestampPacketStorage::Packet, contextEnd); - auto globalEndOffset = offsetof(TimestampPacketStorage::Packet, globalEnd); + auto contextStartOffset = offsetof(TimestampPackets::Packet, contextStart); + auto globalStartOffset = offsetof(TimestampPackets::Packet, globalStart); + auto contextEndOffset = offsetof(TimestampPackets::Packet, contextEnd); + auto globalEndOffset = offsetof(TimestampPackets::Packet, globalEnd); ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed())); diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index a11e8bfa45..6a5aa9bda2 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -46,7 +46,7 @@ TEST_F(EventPoolCreate, givenTimestampEventsThenEventSizeSufficientForAllKernelT std::unique_ptr eventPool(EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc)); ASSERT_NE(nullptr, eventPool); - uint32_t packetsSize = NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct TimestampPacketStorage::Packet); + uint32_t packetsSize = NEO::TimestampPacketSizeControl::preferredPacketCount * sizeof(struct TimestampPackets::Packet); uint32_t kernelTimestampsSize = static_cast(alignUp(packetsSize, MemoryConstants::cacheLineSize)); EXPECT_EQ(kernelTimestampsSize, eventPool->getEventSize()); } @@ -364,12 +364,12 @@ TEST_F(TimestampEventCreate, givenEventTimestampWhenPacketCountIsIncreasedThenCo event->increasePacketsInUse(); EXPECT_EQ(1u, event->getPacketsInUse()); - gpuAddr += sizeof(TimestampPacketStorage::Packet); + gpuAddr += sizeof(TimestampPackets::Packet); EXPECT_EQ(gpuAddr, event->getTimestampPacketAddress()); } HWCMDTEST_F(IGFX_GEN9_CORE, TimestampEventCreate, givenEventTimestampsWhenQueryKernelTimestampThenCorrectDataAreSet) { - TimestampPacketStorage::Packet data = {}; + TimestampPackets::Packet data = {}; data.contextStart = 1u; data.contextEnd = 2u; data.globalStart = 3u; diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 9cc010dc07..703c7637f2 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -472,14 +472,14 @@ class CommandQueueHw : public CommandQueue { size_t hostSlicePitch); void processDeviceEnqueue(DeviceQueueHw *devQueueHw, const MultiDispatchInfo &multiDispatchInfo, - TagNode *hwTimeStamps, + TagNodeBase *hwTimeStamps, bool &blocking); template void processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo, std::unique_ptr &printfHandler, Event *event, - TagNode *&hwTimeStamps, + TagNodeBase *&hwTimeStamps, bool blockQueue, DeviceQueueHw *devQueueHw, CsrDependencies &csrDeps, diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 8054ce059b..bbd1d1d81a 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -174,7 +174,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, DeviceQueueHw *devQueueHw = castToObject>(devQueue); auto clearAllDependencies = queueDependenciesClearRequired(); - TagNode *hwTimeStamps = nullptr; + TagNodeBase *hwTimeStamps = nullptr; auto commandStreamRecieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); @@ -385,13 +385,13 @@ template void CommandQueueHw::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo, std::unique_ptr &printfHandler, Event *event, - TagNode *&hwTimeStamps, + TagNodeBase *&hwTimeStamps, bool blockQueue, DeviceQueueHw *devQueueHw, CsrDependencies &csrDeps, KernelOperation *blockedCommandsData, TimestampPacketDependencies ×tampPacketDependencies) { - TagNode *hwPerfCounter = nullptr; + TagNodeBase *hwPerfCounter = nullptr; FileLoggerInstance().dumpKernelArgs(&multiDispatchInfo); printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device)); @@ -565,7 +565,7 @@ void CommandQueueHw::processDispatchForCacheFlush(Surface **surfaces, template void CommandQueueHw::processDeviceEnqueue(DeviceQueueHw *devQueueHw, const MultiDispatchInfo &multiDispatchInfo, - TagNode *hwTimeStamps, + TagNodeBase *hwTimeStamps, bool &blocking) { auto parentKernel = multiDispatchInfo.peekParentKernel(); size_t minSizeSSHForEM = HardwareCommandsHelper::getSshSizeForExecutionModel(*parentKernel); diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h index fecce241fe..b433472cbe 100644 --- a/opencl/source/command_queue/gpgpu_walker.h +++ b/opencl/source/command_queue/gpgpu_walker.h @@ -118,29 +118,29 @@ class GpgpuWalkerHelper { uint32_t requiredWorkgroupOrder); static void dispatchProfilingCommandsStart( - TagNode &hwTimeStamps, + TagNodeBase &hwTimeStamps, LinearStream *commandStream, const HardwareInfo &hwInfo); static void dispatchProfilingCommandsEnd( - TagNode &hwTimeStamps, + TagNodeBase &hwTimeStamps, LinearStream *commandStream, const HardwareInfo &hwInfo); static void dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, - TagNode &hwPerfCounter, + TagNodeBase &hwPerfCounter, LinearStream *commandStream); static void dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, - TagNode &hwPerfCounter, + TagNodeBase &hwPerfCounter, LinearStream *commandStream); static void setupTimestampPacket( LinearStream *cmdStream, WALKER_TYPE *walkerCmd, - TagNode *timestampPacketNode, + TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment); static void dispatchScheduler( diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 769448fcf9..88e258d930 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -133,7 +133,7 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( template void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, - TagNode &hwPerfCounter, + TagNodeBase &hwPerfCounter, LinearStream *commandStream) { const auto pPerformanceCounters = commandQueue.getPerfCounters(); @@ -149,7 +149,7 @@ void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( template void GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, - TagNode &hwPerfCounter, + TagNodeBase &hwPerfCounter, LinearStream *commandStream) { const auto pPerformanceCounters = commandQueue.getPerfCounters(); diff --git a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl index 0522bf19cc..8a4e6702a6 100644 --- a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl +++ b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl @@ -167,7 +167,7 @@ template void GpgpuWalkerHelper::setupTimestampPacket( LinearStream *cmdStream, WALKER_TYPE *walkerCmd, - TagNode *timestampPacketNode, + TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment) { uint64_t address = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode); @@ -210,7 +210,7 @@ void GpgpuWalkerHelper::adjustMiStoreRegMemMode(MI_STORE_REG_MEM void GpgpuWalkerHelper::dispatchProfilingCommandsStart( - TagNode &hwTimeStamps, + TagNodeBase &hwTimeStamps, LinearStream *commandStream, const HardwareInfo &hwInfo) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; @@ -242,7 +242,7 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsStart( template void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( - TagNode &hwTimeStamps, + TagNodeBase &hwTimeStamps, LinearStream *commandStream, const HardwareInfo &hwInfo) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h index 05c0dd1519..1e989932cc 100644 --- a/opencl/source/command_queue/hardware_interface.h +++ b/opencl/source/command_queue/hardware_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -19,13 +19,13 @@ class DispatchInfo; class IndirectHeap; class Kernel; class LinearStream; -struct HwPerfCounter; -struct HwTimeStamps; +class HwPerfCounter; +class HwTimeStamps; struct KernelOperation; struct MultiDispatchInfo; template -struct TagNode; +class TagNode; template using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; @@ -40,8 +40,8 @@ class HardwareInterface { const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, KernelOperation *blockedCommandsData, - TagNode *hwTimeStamps, - TagNode *hwPerfCounter, + TagNodeBase *hwTimeStamps, + TagNodeBase *hwPerfCounter, TimestampPacketDependencies *timestampPacketDependencies, TimestampPacketContainer *currentTimestampPacketNodes, uint32_t commandType); @@ -62,14 +62,14 @@ class HardwareInterface { const bool &enable); static void dispatchProfilingPerfStartCommands( - TagNode *hwTimeStamps, - TagNode *hwPerfCounter, + TagNodeBase *hwTimeStamps, + TagNodeBase *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); static void dispatchProfilingPerfEndCommands( - TagNode *hwTimeStamps, - TagNode *hwPerfCounter, + TagNodeBase *hwTimeStamps, + TagNodeBase *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index 400bc83c97..469b978452 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -30,8 +30,8 @@ inline WALKER_TYPE *HardwareInterface::allocateWalkerSpace template inline void HardwareInterface::dispatchProfilingPerfStartCommands( - TagNode *hwTimeStamps, - TagNode *hwPerfCounter, + TagNodeBase *hwTimeStamps, + TagNodeBase *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { @@ -46,8 +46,8 @@ inline void HardwareInterface::dispatchProfilingPerfStartCommands( template inline void HardwareInterface::dispatchProfilingPerfEndCommands( - TagNode *hwTimeStamps, - TagNode *hwPerfCounter, + TagNodeBase *hwTimeStamps, + TagNodeBase *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { @@ -66,8 +66,8 @@ void HardwareInterface::dispatchWalker( const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, KernelOperation *blockedCommandsData, - TagNode *hwTimeStamps, - TagNode *hwPerfCounter, + TagNodeBase *hwTimeStamps, + TagNodeBase *hwPerfCounter, TimestampPacketDependencies *timestampPacketDependencies, TimestampPacketContainer *currentTimestampPacketNodes, uint32_t commandType) { diff --git a/opencl/source/device_queue/device_queue.cpp b/opencl/source/device_queue/device_queue.cpp index 8c2254c935..58c790e6cf 100644 --- a/opencl/source/device_queue/device_queue.cpp +++ b/opencl/source/device_queue/device_queue.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -169,7 +169,7 @@ void DeviceQueue::initDeviceQueue() { } void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, - uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode *hwTimeStamp, bool isCcsUsed) { + uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNodeBase *hwTimeStamp, bool isCcsUsed) { setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount, isCcsUsed); addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount); } @@ -178,7 +178,7 @@ void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHea return; } -void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { +void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { return; } diff --git a/opencl/source/device_queue/device_queue.h b/opencl/source/device_queue/device_queue.h index ebd2fce080..ed48f03628 100644 --- a/opencl/source/device_queue/device_queue.h +++ b/opencl/source/device_queue/device_queue.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -23,9 +23,8 @@ class Kernel; class Event; struct MultiDispatchInfo; class SchedulerKernel; -struct HwTimeStamps; -template -struct TagNode; +class HwTimeStamps; +class TagNodeBase; template <> struct OpenCLObjectMapper<_device_queue> { @@ -72,10 +71,10 @@ class DeviceQueue : public BaseObject<_device_queue> { size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet); - void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode *hwTimeStamp, bool isCcsUsed); + void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNodeBase *hwTimeStamp, bool isCcsUsed); virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed); - virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount); + virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount); MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() { auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); diff --git a/opencl/source/device_queue/device_queue_hw.h b/opencl/source/device_queue/device_queue_hw.h index b60edf0d14..72469e3b93 100644 --- a/opencl/source/device_queue/device_queue_hw.h +++ b/opencl/source/device_queue/device_queue_hw.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -58,7 +58,7 @@ class DeviceQueueHw : public DeviceQueue { void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) override; - void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override; + void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override; void resetDeviceQueue() override; void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) override; diff --git a/opencl/source/device_queue/device_queue_hw_base.inl b/opencl/source/device_queue/device_queue_hw_base.inl index 2534019ac3..8c0d9ac567 100644 --- a/opencl/source/device_queue/device_queue_hw_base.inl +++ b/opencl/source/device_queue/device_queue_hw_base.inl @@ -103,7 +103,7 @@ void DeviceQueueHw::initPipeControl(PIPE_CONTROL *pc) { } template -void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { +void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { // CleanUp Section auto offset = slbCS.getUsed(); auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset; diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 585bba9e1d..60a3b1c00c 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -129,8 +129,8 @@ Event::~Event() { timeStampNode->returnTag(); } if (perfCounterNode != nullptr) { - cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->tagForCpuAccess->query.handle); - perfCounterNode->tagForCpuAccess->query.handle = {}; + cmdQueue->getPerfCounters()->deleteQuery(perfCounterNode->getQueryHandleRef()); + perfCounterNode->getQueryHandleRef() = {}; perfCounterNode->returnTag(); } cmdQueue->decRefInternal(); @@ -258,12 +258,12 @@ bool Event::calcProfilingData() { for (auto i = 0u; i < timestamps.size(); i++) { std::cout << "Timestamp " << i << ", " << "profiling capable: " << timestamps[i]->isProfilingCapable() << ", "; - for (auto j = 0u; j < timestamps[i]->tagForCpuAccess->getPacketsUsed(); j++) { + for (auto j = 0u; j < timestamps[i]->getPacketsUsed(); j++) { std::cout << "packet " << j << ": " - << "global start: " << timestamps[i]->tagForCpuAccess->getGlobalStartValue(j) << ", " - << "global end: " << timestamps[i]->tagForCpuAccess->getGlobalEndValue(j) << ", " - << "context start: " << timestamps[i]->tagForCpuAccess->getContextStartValue(j) << ", " - << "context end: " << timestamps[i]->tagForCpuAccess->getContextEndValue(j) << std::endl; + << "global start: " << timestamps[i]->getGlobalStartValue(j) << ", " + << "global end: " << timestamps[i]->getGlobalEndValue(j) << ", " + << "context start: " << timestamps[i]->getContextStartValue(j) << ", " + << "context end: " << timestamps[i]->getContextEndValue(j) << std::endl; } } } @@ -277,16 +277,16 @@ bool Event::calcProfilingData() { } else if (timeStampNode) { if (HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily).useOnlyGlobalTimestamps()) { calculateProfilingDataInternal( - timeStampNode->tagForCpuAccess->GlobalStartTS, - timeStampNode->tagForCpuAccess->GlobalEndTS, - &timeStampNode->tagForCpuAccess->GlobalEndTS, - timeStampNode->tagForCpuAccess->GlobalStartTS); + timeStampNode->getGlobalStartValue(0), + timeStampNode->getGlobalEndValue(0), + &timeStampNode->getGlobalEndRef(), + timeStampNode->getGlobalStartValue(0)); } else { calculateProfilingDataInternal( - timeStampNode->tagForCpuAccess->ContextStartTS, - timeStampNode->tagForCpuAccess->ContextEndTS, - &timeStampNode->tagForCpuAccess->ContextCompleteTS, - timeStampNode->tagForCpuAccess->GlobalStartTS); + timeStampNode->getContextStartValue(0), + timeStampNode->getContextEndValue(0), + &timeStampNode->getContextCompleteRef(), + timeStampNode->getGlobalStartValue(0)); } } } @@ -346,19 +346,19 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con void Event::getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS) { const auto timestamps = timestampContainer->peekNodes(); - globalStartTS = timestamps[0]->tagForCpuAccess->getGlobalStartValue(0); - globalEndTS = timestamps[0]->tagForCpuAccess->getGlobalEndValue(0); + globalStartTS = timestamps[0]->getGlobalStartValue(0); + globalEndTS = timestamps[0]->getGlobalEndValue(0); for (const auto ×tamp : timestamps) { if (!timestamp->isProfilingCapable()) { continue; } - for (auto i = 0u; i < timestamp->tagForCpuAccess->getPacketsUsed(); ++i) { - if (globalStartTS > timestamp->tagForCpuAccess->getGlobalStartValue(i)) { - globalStartTS = timestamp->tagForCpuAccess->getGlobalStartValue(i); + for (auto i = 0u; i < timestamp->getPacketsUsed(); ++i) { + if (globalStartTS > timestamp->getGlobalStartValue(i)) { + globalStartTS = timestamp->getGlobalStartValue(i); } - if (globalEndTS < timestamp->tagForCpuAccess->getGlobalEndValue(i)) { - globalEndTS = timestamp->tagForCpuAccess->getGlobalEndValue(i); + if (globalEndTS < timestamp->getGlobalEndValue(i)) { + globalEndTS = timestamp->getGlobalEndValue(i); } } } @@ -734,14 +734,14 @@ void Event::setEndTimeStamp() { } } -TagNode *Event::getHwTimeStampNode() { +TagNodeBase *Event::getHwTimeStampNode() { if (!cmdQueue->getTimestampPacketContainer() && !timeStampNode) { timeStampNode = cmdQueue->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag(); } return timeStampNode; } -TagNode *Event::getHwPerfCounterNode() { +TagNodeBase *Event::getHwPerfCounterNode() { if (!perfCounterNode && cmdQueue->getPerfCounters()) { const uint32_t gpuReportSize = HwPerfCounter::getSize(*(cmdQueue->getPerfCounters())); diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 7d59d92b7c..917c6c3d56 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -24,7 +24,7 @@ namespace NEO { template -struct TagNode; +class TagNode; class CommandQueue; class Context; class Device; @@ -106,7 +106,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { void setProfilingEnabled(bool profilingEnabled) { this->profilingEnabled = profilingEnabled; } - TagNode *getHwTimeStampNode(); + TagNodeBase *getHwTimeStampNode(); void addTimestampPacketNodes(const TimestampPacketContainer &inputTimestampPacketContainer); TimestampPacketContainer *getTimestampPacketNodes() const; @@ -119,7 +119,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { this->perfCountersEnabled = perfCountersEnabled; } - TagNode *getHwPerfCounterNode(); + TagNodeBase *getHwPerfCounterNode(); std::unique_ptr flushStamp; std::atomic taskLevel; @@ -372,8 +372,8 @@ class Event : public BaseObject<_cl_event>, public IDNode { uint64_t completeTimeStamp; uint32_t bcsTaskCount = 0; bool perfCountersEnabled; - TagNode *timeStampNode = nullptr; - TagNode *perfCounterNode = nullptr; + TagNodeBase *timeStampNode = nullptr; + TagNodeBase *perfCounterNode = nullptr; std::unique_ptr timestampPacketContainer; //number of events this event depends on std::atomic parentCount; diff --git a/opencl/source/event/hw_timestamps.h b/opencl/source/event/hw_timestamps.h index 1c3c553dc1..2a8fef6bcd 100644 --- a/opencl/source/event/hw_timestamps.h +++ b/opencl/source/event/hw_timestamps.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,7 +12,8 @@ namespace NEO { -struct HwTimeStamps { +class HwTimeStamps : public TagTypeBase { + public: void initialize() { GlobalStartTS = 0; ContextStartTS = 0; @@ -21,12 +22,18 @@ struct HwTimeStamps { GlobalCompleteTS = 0; ContextCompleteTS = 0; } - bool isCompleted() const { return true; } - uint32_t getImplicitGpuDependenciesCount() const { return 0; } - static GraphicsAllocation::AllocationType getAllocationType() { + static constexpr GraphicsAllocation::AllocationType getAllocationType() { return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER; } + + static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwTimeStamps; } + + uint64_t getContextStartValue(uint32_t) const { return ContextStartTS; } + uint64_t getGlobalStartValue(uint32_t) const { return GlobalStartTS; } + uint64_t getContextEndValue(uint32_t) const { return ContextEndTS; } + uint64_t getGlobalEndValue(uint32_t) const { return GlobalEndTS; } + uint64_t GlobalStartTS; uint64_t ContextStartTS; uint64_t GlobalEndTS; @@ -34,4 +41,7 @@ struct HwTimeStamps { uint64_t GlobalCompleteTS; uint64_t ContextCompleteTS; }; + +static_assert((6 * sizeof(uint64_t)) == sizeof(HwTimeStamps), + "This structure is consumed by GPU and has to follow specific restrictions for padding and size"); } // namespace NEO diff --git a/opencl/source/event/perf_counter.h b/opencl/source/event/perf_counter.h index e48eea4350..8ba4415fcc 100644 --- a/opencl/source/event/perf_counter.h +++ b/opencl/source/event/perf_counter.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -14,24 +14,24 @@ namespace NEO { -struct HwPerfCounter { +class HwPerfCounter : public TagTypeBase { + public: void initialize() { query = {}; report[0] = 0; } - static GraphicsAllocation::AllocationType getAllocationType() { + static constexpr GraphicsAllocation::AllocationType getAllocationType() { return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER; } + static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwPerfCounter; } + template static uint32_t getSize(Type &performanceCounters) { return sizeof(query) + performanceCounters.getGpuReportSize(); } - bool isCompleted() const { return true; } - uint32_t getImplicitGpuDependenciesCount() const { return 0; } - // Gpu report size is not known during compile time. // Such information will be provided by metrics library dll. // Bellow variable will be allocated dynamically based on information @@ -43,4 +43,5 @@ struct HwPerfCounter { uint8_t report[1] = {}; }; + } // namespace NEO diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h index 4c8c53714f..56a57e0647 100644 --- a/opencl/source/helpers/task_information.h +++ b/opencl/source/helpers/task_information.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -27,10 +27,10 @@ class Kernel; class MemObj; class Surface; class PrintfHandler; -struct HwTimeStamps; +class HwTimeStamps; class TimestampPacketContainer; template -struct TagNode; +class TagNode; enum MapOperationType { MAP, @@ -99,7 +99,7 @@ class Command : public IFNode { void setEventsRequest(EventsRequest &eventsRequest); void makeTimestampPacketsResident(CommandStreamReceiver &commandStreamReceiver); - TagNode *timestamp = nullptr; + TagNodeBase *timestamp = nullptr; CompletionStamp completionStamp = {}; protected: diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 189c469acd..4667f7ae14 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1227,7 +1227,7 @@ bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) { bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) { for (const auto &node : timestampContainer->peekNodes()) { - if (!node->tagForCpuAccess->isCompleted()) { + if (!node->isCompleted()) { return false; } } diff --git a/opencl/source/os_interface/performance_counters.cpp b/opencl/source/os_interface/performance_counters.cpp index 81bf73e2ee..b097afbd5d 100644 --- a/opencl/source/os_interface/performance_counters.cpp +++ b/opencl/source/os_interface/performance_counters.cpp @@ -140,7 +140,7 @@ void PerformanceCounters::closeMetricsLibrary() { ////////////////////////////////////////////////////// // PerformanceCounters::getQueryHandle ////////////////////////////////////////////////////// -void PerformanceCounters::getQueryHandle(QueryHandle_1_0 &handle) { +void PerformanceCounters::getQueryHandleRef(QueryHandle_1_0 &handle) { if (!handle.IsValid()) { metricsLibrary->hwCountersCreate( context, @@ -201,7 +201,7 @@ uint32_t PerformanceCounters::getGpuCommandsSize( bufferData.CommandsType = ObjectType::QueryHwCounters; bufferData.Type = commandBufferType; - getQueryHandle(query); + getQueryHandleRef(query); bufferData.QueryHwCounters.Begin = begin; bufferData.QueryHwCounters.Handle = query; @@ -216,7 +216,7 @@ uint32_t PerformanceCounters::getGpuCommandsSize( ////////////////////////////////////////////////////// bool PerformanceCounters::getGpuCommands( const MetricsLibraryApi::GpuCommandBufferType commandBufferType, - TagNode &performanceCounters, + TagNodeBase &performanceCounters, const bool begin, const uint32_t bufferSize, void *pBuffer) { @@ -231,15 +231,15 @@ bool PerformanceCounters::getGpuCommands( // Gpu memory allocation for query hw counters. const uint32_t allocationOffset = offsetof(HwPerfCounter, report); - bufferData.Allocation.CpuAddress = reinterpret_cast(performanceCounters.tagForCpuAccess) + allocationOffset; + bufferData.Allocation.CpuAddress = reinterpret_cast(performanceCounters.getCpuBase()) + allocationOffset; bufferData.Allocation.GpuAddress = performanceCounters.getGpuAddress() + allocationOffset; // Allocate query handle for cl_event if not exists. - getQueryHandle(performanceCounters.tagForCpuAccess->query.handle); + getQueryHandleRef(performanceCounters.getQueryHandleRef()); // Query hw counters specific data. bufferData.QueryHwCounters.Begin = begin; - bufferData.QueryHwCounters.Handle = performanceCounters.tagForCpuAccess->query.handle; + bufferData.QueryHwCounters.Handle = performanceCounters.getQueryHandleRef(); return metricsLibrary->commandBufferGet(bufferData); } @@ -261,7 +261,7 @@ uint32_t PerformanceCounters::getGpuReportSize() { ////////////////////////////////////////////////////// // PerformanceCounters::getApiReport ////////////////////////////////////////////////////// -bool PerformanceCounters::getApiReport(const TagNode *performanceCounters, const size_t inputParamSize, void *pInputParam, size_t *pOutputParamSize, bool isEventComplete) { +bool PerformanceCounters::getApiReport(const TagNodeBase *performanceCounters, const size_t inputParamSize, void *pInputParam, size_t *pOutputParamSize, bool isEventComplete) { const uint32_t outputSize = metricsLibrary->hwCountersGetApiReportSize(); if (pOutputParamSize) { @@ -272,10 +272,6 @@ bool PerformanceCounters::getApiReport(const TagNode *performance return false; } - if (!performanceCounters->tagForCpuAccess) { - return false; - } - if (pInputParam == nullptr && inputParamSize == 0 && pOutputParamSize) { return true; } @@ -288,6 +284,6 @@ bool PerformanceCounters::getApiReport(const TagNode *performance return false; } - return metricsLibrary->hwCountersGetReport(performanceCounters->tagForCpuAccess->query.handle, 0, 1, outputSize, pInputParam); + return metricsLibrary->hwCountersGetReport(performanceCounters->getQueryHandleRef(), 0, 1, outputSize, pInputParam); } } // namespace NEO diff --git a/opencl/source/os_interface/performance_counters.h b/opencl/source/os_interface/performance_counters.h index aad35f3830..365be6294d 100644 --- a/opencl/source/os_interface/performance_counters.h +++ b/opencl/source/os_interface/performance_counters.h @@ -16,8 +16,7 @@ namespace NEO { ////////////////////////////////////////////////////// // Forward declaration. ////////////////////////////////////////////////////// -template -struct TagNode; +class TagNodeBase; class CommandQueue; ////////////////////////////////////////////////////// @@ -50,14 +49,14 @@ class PerformanceCounters { ////////////////////////////////////////////////////// static uint32_t getGpuCommandsSize(CommandQueue &commandQueue, const bool reservePerfCounters); uint32_t getGpuCommandsSize(const MetricsLibraryApi::GpuCommandBufferType commandBufferType, const bool begin); - bool getGpuCommands(const MetricsLibraryApi::GpuCommandBufferType commandBufferType, TagNode &performanceCounters, const bool begin, const uint32_t bufferSize, void *pBuffer); + bool getGpuCommands(const MetricsLibraryApi::GpuCommandBufferType commandBufferType, TagNodeBase &performanceCounters, const bool begin, const uint32_t bufferSize, void *pBuffer); ///////////////////////////////////////////////////// // Gpu/Api reports. ///////////////////////////////////////////////////// uint32_t getApiReportSize(); uint32_t getGpuReportSize(); - bool getApiReport(const TagNode *performanceCounters, const size_t inputParamSize, void *pClientData, size_t *pOutputSize, bool isEventComplete); + bool getApiReport(const TagNodeBase *performanceCounters, const size_t inputParamSize, void *pClientData, size_t *pOutputSize, bool isEventComplete); ///////////////////////////////////////////////////// // Metrics Library interface. @@ -71,7 +70,7 @@ class PerformanceCounters { // Metrics Library context/query handles. ///////////////////////////////////////////////////// ContextHandle_1_0 getMetricsLibraryContext(); - void getQueryHandle(QueryHandle_1_0 &handle); + void getQueryHandleRef(QueryHandle_1_0 &handle); void deleteQuery(QueryHandle_1_0 &handle); protected: diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 85b2081a3a..ef7a46d51c 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -613,17 +613,17 @@ HWTEST_F(CommandStreamReceiverTest, whenCsrIsCreatedThenUseTimestampPacketWriteI } TEST_F(CommandStreamReceiverTest, whenGettingEventTsAllocatorThenSameTagAllocatorIsReturned) { - TagAllocator *allocator = commandStreamReceiver->getEventTsAllocator(); + TagAllocatorBase *allocator = commandStreamReceiver->getEventTsAllocator(); EXPECT_NE(nullptr, allocator); - TagAllocator *allocator2 = commandStreamReceiver->getEventTsAllocator(); + TagAllocatorBase *allocator2 = commandStreamReceiver->getEventTsAllocator(); EXPECT_EQ(allocator2, allocator); } TEST_F(CommandStreamReceiverTest, whenGettingEventPerfCountAllocatorThenSameTagAllocatorIsReturned) { const uint32_t gpuReportSize = 100; - TagAllocator *allocator = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize); + TagAllocatorBase *allocator = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize); EXPECT_NE(nullptr, allocator); - TagAllocator *allocator2 = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize); + TagAllocatorBase *allocator2 = commandStreamReceiver->getEventPerfCountAllocator(gpuReportSize); EXPECT_EQ(allocator2, allocator); } @@ -631,11 +631,11 @@ HWTEST_F(CommandStreamReceiverTest, givenTimestampPacketAllocatorWhenAskingForTa auto &csr = pDevice->getUltCommandStreamReceiver(); EXPECT_EQ(nullptr, csr.timestampPacketAllocator.get()); - TagAllocator *allocator = csr.getTimestampPacketAllocator(); + auto allocator = static_cast> *>(csr.getTimestampPacketAllocator()); EXPECT_NE(nullptr, csr.timestampPacketAllocator.get()); EXPECT_EQ(allocator, csr.timestampPacketAllocator.get()); - TagAllocator *allocator2 = csr.getTimestampPacketAllocator(); + auto allocator2 = static_cast> *>(csr.getTimestampPacketAllocator()); EXPECT_EQ(allocator, allocator2); auto node1 = allocator->getTag(); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp index d7735235b2..90f6a11725 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_with_aub_dump_tests.cpp @@ -294,10 +294,10 @@ struct CommandStreamReceiverTagTests : public ::testing::Test { uint32_t zeros[4] = {}; for (uint32_t i = 0; i < TimestampPacketSizeControl::preferredPacketCount; i++) { - tag->tagForCpuAccess->assignDataToAllTimestamps(i, zeros); + tag->assignDataToAllTimestamps(i, zeros); } - EXPECT_TRUE(tag->tagForCpuAccess->isCompleted()); + EXPECT_TRUE(tag->isCompleted()); bool canBeReleased = tag->canBeReleased(); allocator->returnTag(tag); diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index a77e3b73fe..bdd8f44e77 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -864,10 +864,10 @@ HWTEST_F(InternalsEventWithPerfCountersTest, givenCpuProfilingPerfCountersPathWh pCmdQ->setPerfCountersEnabled(); MockEvent *event = new MockEvent(pCmdQ, CL_COMMAND_MARKER, 0, 0); event->setCPUProfilingPath(true); - HwPerfCounter *perfCounter = event->getHwPerfCounterNode()->tagForCpuAccess; + HwPerfCounter *perfCounter = static_cast *>(event->getHwPerfCounterNode())->tagForCpuAccess; ASSERT_NE(nullptr, perfCounter); - auto hwTimeStampNode = event->getHwTimeStampNode(); + auto hwTimeStampNode = static_cast *>(event->getHwTimeStampNode()); if (pCmdQ->getTimestampPacketContainer()) { EXPECT_EQ(nullptr, hwTimeStampNode); } else { @@ -1122,7 +1122,7 @@ HWTEST_F(EventTest, WhenGettingHwTimeStampsThenValidPointerIsReturned) { std::unique_ptr event(new Event(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); ASSERT_NE(nullptr, event); - HwTimeStamps *timeStamps = event->getHwTimeStampNode()->tagForCpuAccess; + HwTimeStamps *timeStamps = static_cast *>(event->getHwTimeStampNode())->tagForCpuAccess; ASSERT_NE(nullptr, timeStamps); //this should not cause any heap corruptions @@ -1133,9 +1133,9 @@ HWTEST_F(EventTest, WhenGettingHwTimeStampsThenValidPointerIsReturned) { ASSERT_EQ(0ULL, timeStamps->GlobalCompleteTS); ASSERT_EQ(0ULL, timeStamps->ContextCompleteTS); - EXPECT_TRUE(timeStamps->isCompleted()); + EXPECT_TRUE(event->getHwTimeStampNode()->isCompleted()); - HwTimeStamps *timeStamps2 = event->getHwTimeStampNode()->tagForCpuAccess; + HwTimeStamps *timeStamps2 = static_cast *>(event->getHwTimeStampNode())->tagForCpuAccess; ASSERT_EQ(timeStamps, timeStamps2); } @@ -1165,7 +1165,7 @@ HWTEST_F(EventTest, WhenEventIsCreatedThenHwTimeStampsMemoryIsPlacedInGraphicsAl std::unique_ptr event(new Event(myCmdQ.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); ASSERT_NE(nullptr, event); - HwTimeStamps *timeStamps = event->getHwTimeStampNode()->tagForCpuAccess; + HwTimeStamps *timeStamps = static_cast *>(event->getHwTimeStampNode())->tagForCpuAccess; ASSERT_NE(nullptr, timeStamps); GraphicsAllocation *allocation = event->getHwTimeStampNode()->getBaseGraphicsAllocation(); diff --git a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp index 62f6c5214d..e4121e62e7 100644 --- a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp +++ b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp @@ -58,9 +58,12 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override { + void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override { cleanupSectionAdded = true; - timestampAddedInCleanupSection = hwTimeStamp ? hwTimeStamp->tagForCpuAccess : nullptr; + + auto hwTimestampT = static_cast *>(hwTimeStamp); + + timestampAddedInCleanupSection = hwTimestampT ? hwTimestampT->tagForCpuAccess : nullptr; return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount); } void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) override { @@ -253,7 +256,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedParentK std::vector surfaces; auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, blockedCommandData, surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1); - auto timestamp = pCmdQ->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag(); + auto timestamp = static_cast *>(pCmdQ->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag()); cmdComputeKernel->timestamp = timestamp; cmdComputeKernel->submit(0, false); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp index 72ce050702..da1dfde1da 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -36,31 +36,30 @@ using namespace NEO; struct TimestampPacketSimpleTests : public ::testing::Test { - class MockTimestampPacketStorage : public TimestampPacketStorage { + class MockTimestampPacketStorage : public TimestampPackets { public: - using TimestampPacketStorage::implicitGpuDependenciesCount; - using TimestampPacketStorage::packets; + using TimestampPackets::implicitGpuDependenciesCount; + using TimestampPackets::packets; }; - template - void setTagToReadyState(TagNode *tagNode) { - auto packetsUsed = tagNode->tagForCpuAccess->getPacketsUsed(); + void setTagToReadyState(TagNodeBase *tagNode) { + auto packetsUsed = tagNode->getPacketsUsed(); tagNode->initialize(); uint32_t zeros[4] = {}; for (uint32_t i = 0; i < TimestampPacketSizeControl::preferredPacketCount; i++) { - tagNode->tagForCpuAccess->assignDataToAllTimestamps(i, zeros); + tagNode->assignDataToAllTimestamps(i, zeros); } - tagNode->tagForCpuAccess->setPacketsUsed(packetsUsed); + tagNode->setPacketsUsed(packetsUsed); } const size_t gws[3] = {1, 1, 1}; }; struct TimestampPacketTests : public TimestampPacketSimpleTests { - struct MockTagNode : public TagNode { - using TagNode::gpuAddress; + struct MockTagNode : public TagNode> { + using TagNode>::gpuAddress; }; void SetUp() override { @@ -83,19 +82,19 @@ struct TimestampPacketTests : public TimestampPacketSimpleTests { } template - void verifySemaphore(MI_SEMAPHORE_WAIT *semaphoreCmd, TagNode *timestampPacketNode, uint32_t packetId) { + void verifySemaphore(MI_SEMAPHORE_WAIT *semaphoreCmd, TagNodeBase *timestampPacketNode, uint32_t packetId) { EXPECT_NE(nullptr, semaphoreCmd); EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - uint64_t compareOffset = packetId * sizeof(TimestampPacketStorage::Packet); + uint64_t compareOffset = packetId * sizeof(TimestampPackets::Packet); auto dataAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode) + compareOffset; EXPECT_EQ(dataAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); }; template - void verifyMiAtomic(typename GfxFamily::MI_ATOMIC *miAtomicCmd, TagNode *timestampPacketNode) { + void verifyMiAtomic(typename GfxFamily::MI_ATOMIC *miAtomicCmd, TagNodeBase *timestampPacketNode) { using MI_ATOMIC = typename GfxFamily::MI_ATOMIC; EXPECT_NE(nullptr, miAtomicCmd); auto writeAddress = TimestampPacketHelper::getGpuDependenciesCountGpuAddress(*timestampPacketNode); @@ -124,7 +123,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWhenSemaphoreAndAtomicAreProgrammedTh using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_ATOMIC = typename FamilyType::MI_ATOMIC; - TimestampPacketStorage tag; + TimestampPackets tag; MockTagNode mockNode; mockNode.tagForCpuAccess = &tag; mockNode.gpuAddress = 0x1230000; @@ -166,7 +165,7 @@ HWTEST_F(TimestampPacketTests, givenDebugModeWhereAtomicsAreNotEmittedWhenComman } HWTEST_F(TimestampPacketTests, givenMultipleDeviesWhenIncrementingCpuDependenciesThenIncrementMultipleTimes) { - TimestampPacketStorage tag; + TimestampPackets tag; MockTagNode mockNode; mockNode.tagForCpuAccess = &tag; mockNode.gpuAddress = 0x1230000; @@ -183,7 +182,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWithPacketsUsed2WhenSemaphoreAndAtomi using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_ATOMIC = typename FamilyType::MI_ATOMIC; - TimestampPacketStorage tag; + TimestampPackets tag; tag.setPacketsUsed(2); MockTagNode mockNode; mockNode.tagForCpuAccess = &tag; @@ -202,7 +201,7 @@ HWTEST_F(TimestampPacketTests, givenTagNodeWithPacketsUsed2WhenSemaphoreAndAtomi } TEST_F(TimestampPacketTests, givenTagNodeWhatAskingForGpuAddressesThenReturnCorrectValue) { - TimestampPacketStorage tag; + TimestampPackets tag; MockTagNode mockNode; mockNode.tagForCpuAccess = &tag; mockNode.gpuAddress = 0x1230000; @@ -242,11 +241,11 @@ TEST_F(TimestampPacketSimpleTests, givenTimestampPacketContainerWhenMovedThenMov EXPECT_FALSE(std::is_copy_assignable::value); EXPECT_FALSE(std::is_copy_constructible::value); - struct MockTagNode : public TagNode { + struct MockTagNode : public TagNode> { void returnTag() override { returnCalls++; } - using TagNode::refCount; + using TagNode>::refCount; uint32_t returnCalls = 0; }; @@ -308,7 +307,9 @@ TEST_F(TimestampPacketSimpleTests, whenNewTagIsTakenThenReinitialize) { MockMemoryManager memoryManager(executionEnvironment); MockTagAllocator allocator(0, &memoryManager, 1); - auto firstNode = allocator.getTag(); + using MockNode = TagNode; + + auto firstNode = static_cast(allocator.getTag()); auto i = 0u; for (auto &packet : firstNode->tagForCpuAccess->packets) { packet.contextStart = i++; @@ -369,7 +370,7 @@ HWTEST_F(TimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacketAlloc auto tag = csr.getTimestampPacketAllocator()->getTag(); setTagToReadyState(tag); - EXPECT_TRUE(tag->tagForCpuAccess->isCompleted()); + EXPECT_TRUE(tag->isCompleted()); EXPECT_FALSE(tag->canBeReleased()); } @@ -550,7 +551,7 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWithEventsWithoutTimestampsWhen } HWTEST_F(TimestampPacketTests, whenEstimatingSizeForNodeDependencyThenReturnCorrectValue) { - TimestampPacketStorage tag; + TimestampPackets tag; MockTagNode mockNode; mockNode.tagForCpuAccess = &tag; mockNode.gpuAddress = 0x1230000; @@ -1457,8 +1458,8 @@ HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingWithOmitTim } HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentDevicesWhenEnqueueingThenMakeAllTimestampsResident) { - TagAllocator tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1, - sizeof(TimestampPacketStorage), false, device->getDeviceBitfield()); + TagAllocator> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1, + sizeof(TimestampPackets), false, device->getDeviceBitfield()); auto device2 = std::make_unique(Device::create(executionEnvironment, 1u)); auto &ultCsr = device->getUltCommandStreamReceiver(); @@ -1493,8 +1494,8 @@ HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentDevicesWhenEnqueu } HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentCSRsWhenEnqueueingThenMakeAllTimestampsResident) { - TagAllocator tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1, - sizeof(TimestampPacketStorage), false, device->getDeviceBitfield()); + TagAllocator> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1, + sizeof(TimestampPackets), false, device->getDeviceBitfield()); auto &ultCsr = device->getUltCommandStreamReceiver(); ultCsr.timestampPacketWriteEnabled = true; @@ -1600,7 +1601,7 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingWithoutK auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); MockKernelWithInternals mockKernel(*device, context); - cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPacketStorage + cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPackets TimestampPacketContainer cmdQNodes; cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ->timestampPacketContainer); @@ -1815,7 +1816,7 @@ HWTEST_F(TimestampPacketTests, whenEnqueueingBarrierThenRequestPipeControlOnCsrF MockCommandQueueHw cmdQ(context, device.get(), nullptr); MockKernelWithInternals mockKernel(*device, context); - cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPacketStorage + cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPackets TimestampPacketContainer cmdQNodes; cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ.timestampPacketContainer); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index d1fae7c68e..6138574d97 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -2180,11 +2180,11 @@ HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenK EXPECT_EQ(result->second.status, MockKernel::TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS); EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferedInCurrentEnqueue); - uint32_t data[4] = {static_cast(container.getNode(0u)->tagForCpuAccess->getContextStartValue(0)), - static_cast(container.getNode(0u)->tagForCpuAccess->getGlobalStartValue(0)), + uint32_t data[4] = {static_cast(container.getNode(0u)->getContextStartValue(0)), + static_cast(container.getNode(0u)->getGlobalStartValue(0)), 2, 2}; - container.getNode(0u)->tagForCpuAccess->assignDataToAllTimestamps(0, data); + container.getNode(0u)->assignDataToAllTimestamps(0, data); mockKernel.mockKernel->performKernelTunning(commandStreamReceiver, lws, gws, offsets, &container); @@ -2193,12 +2193,12 @@ HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenK EXPECT_EQ(result->second.status, MockKernel::TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS); EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferedInCurrentEnqueue); - data[0] = static_cast(subdeviceContainer.getNode(0u)->tagForCpuAccess->getContextStartValue(0)); - data[1] = static_cast(subdeviceContainer.getNode(0u)->tagForCpuAccess->getGlobalStartValue(0)); + data[0] = static_cast(subdeviceContainer.getNode(0u)->getContextStartValue(0)); + data[1] = static_cast(subdeviceContainer.getNode(0u)->getGlobalStartValue(0)); data[2] = 2; data[3] = 2; - subdeviceContainer.getNode(0u)->tagForCpuAccess->assignDataToAllTimestamps(0, data); + subdeviceContainer.getNode(0u)->assignDataToAllTimestamps(0, data); mockKernel.mockKernel->performKernelTunning(commandStreamReceiver, lws, gws, offsets, &container); @@ -2209,12 +2209,12 @@ HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenK EXPECT_EQ(result->second.status, MockKernel::TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS); EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferedInCurrentEnqueue); - data[0] = static_cast(subdeviceContainer.getNode(1u)->tagForCpuAccess->getContextStartValue(0)); - data[1] = static_cast(subdeviceContainer.getNode(1u)->tagForCpuAccess->getGlobalStartValue(0)); + data[0] = static_cast(subdeviceContainer.getNode(1u)->getContextStartValue(0)); + data[1] = static_cast(subdeviceContainer.getNode(1u)->getGlobalStartValue(0)); data[2] = 2; data[3] = 2; - subdeviceContainer.getNode(1u)->tagForCpuAccess->assignDataToAllTimestamps(0, data); + subdeviceContainer.getNode(1u)->assignDataToAllTimestamps(0, data); mockKernel.mockKernel->performKernelTunning(commandStreamReceiver, lws, gws, offsets, &container); diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 1d2a6c908d..39e860a816 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -856,10 +856,10 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenInputAndOutputTimestampPacketWhenBlitCal cl_int retVal = CL_SUCCESS; auto memoryManager = bcsCsr->getMemoryManager(); - bcsCsr->timestampPacketAllocator = std::make_unique>(device->getRootDeviceIndex(), memoryManager, 1, - MemoryConstants::cacheLineSize, - sizeof(TimestampPacketStorage), - false, device->getDeviceBitfield()); + bcsCsr->timestampPacketAllocator = std::make_unique>>(device->getRootDeviceIndex(), memoryManager, 1, + MemoryConstants::cacheLineSize, + sizeof(TimestampPackets), + false, device->getDeviceBitfield()); auto buffer = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); buffer->forceDisallowCPUCopy = true; diff --git a/opencl/test/unit_test/mocks/mock_timestamp_container.h b/opencl/test/unit_test/mocks/mock_timestamp_container.h index 8a16a9674f..3a6095c00d 100644 --- a/opencl/test/unit_test/mocks/mock_timestamp_container.h +++ b/opencl/test/unit_test/mocks/mock_timestamp_container.h @@ -11,7 +11,7 @@ namespace NEO { -template +template > class MockTagAllocator : public TagAllocator { public: using BaseClass = TagAllocator; @@ -22,13 +22,13 @@ class MockTagAllocator : public TagAllocator { MockTagAllocator(uint32_t rootDeviceIndex, MemoryManager *memoryManager, size_t tagCount = 10) : BaseClass(rootDeviceIndex, memoryManager, tagCount, MemoryConstants::cacheLineSize, sizeof(TagType), false, mockDeviceBitfield) {} - void returnTag(NodeType *node) override { - releaseReferenceNodes.push_back(node); + void returnTag(TagNodeBase *node) override { + releaseReferenceNodes.push_back(static_cast(node)); BaseClass::returnTag(node); } - void returnTagToFreePool(NodeType *node) override { - returnedToFreePoolNodes.push_back(node); + void returnTagToFreePool(TagNodeBase *node) override { + returnedToFreePoolNodes.push_back(static_cast(node)); BaseClass::returnTagToFreePool(node); } @@ -40,13 +40,13 @@ class MockTimestampPacketContainer : public TimestampPacketContainer { public: using TimestampPacketContainer::timestampPacketNodes; - MockTimestampPacketContainer(TagAllocator &tagAllocator, size_t numberOfPreallocatedTags) { + MockTimestampPacketContainer(TagAllocatorBase &tagAllocator, size_t numberOfPreallocatedTags) { for (size_t i = 0; i < numberOfPreallocatedTags; i++) { add(tagAllocator.getTag()); } } - TagNode *getNode(size_t position) { + TagNodeBase *getNode(size_t position) { return timestampPacketNodes.at(position); } }; diff --git a/opencl/test/unit_test/os_interface/performance_counters_tests.cpp b/opencl/test/unit_test/os_interface/performance_counters_tests.cpp index a8d764bb39..8ddaecbdcd 100644 --- a/opencl/test/unit_test/os_interface/performance_counters_tests.cpp +++ b/opencl/test/unit_test/os_interface/performance_counters_tests.cpp @@ -97,7 +97,7 @@ TEST_P(PerformanceCountersProcessEventTest, givenNullptrInputParamWhenProcessEve TagNode query = {}; query.tagForCpuAccess = &counters; - performanceCountersBase->getQueryHandle(counters.query.handle); + performanceCountersBase->getQueryHandleRef(counters.query.handle); auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, nullptr, &outputParamSize, eventComplete); performanceCountersBase->deleteQuery(counters.query.handle); @@ -111,7 +111,7 @@ TEST_P(PerformanceCountersProcessEventTest, givenCorrectInputParamWhenProcessEve TagNode query = {}; query.tagForCpuAccess = &counters; - performanceCountersBase->getQueryHandle(counters.query.handle); + performanceCountersBase->getQueryHandleRef(counters.query.handle); auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, inputParam.get(), &outputParamSize, eventComplete); performanceCountersBase->deleteQuery(counters.query.handle); @@ -127,11 +127,12 @@ TEST_P(PerformanceCountersProcessEventTest, givenCorrectInputParamWhenProcessEve TEST_P(PerformanceCountersProcessEventTest, givenCorrectInputParamWhenProcessEventPerfCountersIsNotCalledThenReturnsFalse) { eventComplete = GetParam(); EXPECT_EQ(0ull, outputParamSize); + HwPerfCounter tag = {}; TagNode query = {}; - query.tagForCpuAccess = nullptr; + query.tagForCpuAccess = &tag; auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, inputParam.get(), &outputParamSize, eventComplete); - EXPECT_FALSE(retVal); + EXPECT_EQ(eventComplete, retVal); } TEST_F(PerformanceCountersProcessEventTest, givenInvalidInputParamSizeWhenProcessEventPerfCountersIsCalledThenReturnsFalse) { @@ -141,7 +142,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenInvalidInputParamSizeWhenProces TagNode query = {}; query.tagForCpuAccess = &counters; - performanceCountersBase->getQueryHandle(counters.query.handle); + performanceCountersBase->getQueryHandleRef(counters.query.handle); auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize - 1, inputParam.get(), &outputParamSize, eventComplete); performanceCountersBase->deleteQuery(counters.query.handle); @@ -156,7 +157,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenNullptrOutputParamSizeWhenProce TagNode query = {}; query.tagForCpuAccess = &counters; - performanceCountersBase->getQueryHandle(counters.query.handle); + performanceCountersBase->getQueryHandleRef(counters.query.handle); auto retVal = performanceCountersBase->getApiReport(&query, inputParamSize, inputParam.get(), nullptr, eventComplete); performanceCountersBase->deleteQuery(counters.query.handle); @@ -171,7 +172,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenNullptrInputZeroSizeWhenProcess TagNode query = {}; query.tagForCpuAccess = &counters; - performanceCountersBase->getQueryHandle(counters.query.handle); + performanceCountersBase->getQueryHandleRef(counters.query.handle); auto retVal = performanceCountersBase->getApiReport(&query, 0, nullptr, &outputParamSize, eventComplete); performanceCountersBase->deleteQuery(counters.query.handle); @@ -186,7 +187,7 @@ TEST_F(PerformanceCountersProcessEventTest, givenNullptrInputZeroSizeAndNullptrO TagNode query = {}; query.tagForCpuAccess = &counters; - performanceCountersBase->getQueryHandle(counters.query.handle); + performanceCountersBase->getQueryHandleRef(counters.query.handle); auto retVal = performanceCountersBase->getApiReport(&query, 0, nullptr, nullptr, eventComplete); performanceCountersBase->deleteQuery(counters.query.handle); @@ -487,7 +488,7 @@ TEST_F(PerformanceCountersMetricsLibraryTest, givenPerformanceCountersWhenMetric EXPECT_EQ(0u, performanceCountersBase->getReferenceNumber()); EXPECT_TRUE(performanceCountersBase->enable(false)); - performanceCountersBase->getQueryHandle(query); + performanceCountersBase->getQueryHandleRef(query); EXPECT_TRUE(query.IsValid()); performanceCountersBase->deleteQuery(query); @@ -587,13 +588,13 @@ TEST_F(PerformanceCountersMetricsLibraryTest, WhenGettingHwPerfCounterThenValidP std::unique_ptr event(new Event(queue.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); ASSERT_NE(nullptr, event); - HwPerfCounter *perfCounter = event->getHwPerfCounterNode()->tagForCpuAccess; + auto perfCounter = static_cast *>(event->getHwPerfCounterNode()); ASSERT_NE(nullptr, perfCounter); - ASSERT_EQ(0ULL, perfCounter->report[0]); + ASSERT_EQ(0ULL, perfCounter->tagForCpuAccess->report[0]); EXPECT_TRUE(perfCounter->isCompleted()); - HwPerfCounter *perfCounter2 = event->getHwPerfCounterNode()->tagForCpuAccess; + auto perfCounter2 = event->getHwPerfCounterNode(); ASSERT_EQ(perfCounter, perfCounter2); performanceCountersBase->shutdown(); @@ -633,7 +634,7 @@ TEST_F(PerformanceCountersMetricsLibraryTest, WhenCreatingEventThenHwPerfCounter std::unique_ptr event(new Event(queue.get(), CL_COMMAND_COPY_BUFFER, 0, 0)); ASSERT_NE(nullptr, event); - HwPerfCounter *perfCounter = event->getHwPerfCounterNode()->tagForCpuAccess; + HwPerfCounter *perfCounter = static_cast *>(event->getHwPerfCounterNode())->tagForCpuAccess; ASSERT_NE(nullptr, perfCounter); GraphicsAllocation *allocation = event->getHwPerfCounterNode()->getBaseGraphicsAllocation(); diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index a7b09d1677..f0d26b08c4 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -1064,7 +1064,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersOnCCSTests, givenCommandQue struct MockTimestampContainer : public TimestampPacketContainer { ~MockTimestampContainer() override { for (const auto &node : timestampPacketNodes) { - delete node->tagForCpuAccess; + auto mockNode = static_cast> *>(node); + delete mockNode->tagForCpuAccess; delete node; } timestampPacketNodes.clear(); @@ -1079,8 +1080,8 @@ struct ProfilingTimestampPacketsTest : public ::testing::Test { } void addTimestampNode(uint32_t contextStart, uint32_t contextEnd, uint32_t globalStart, uint32_t globalEnd) { - auto node = new MockTagNode(); - auto timestampPacketStorage = new TimestampPacketStorage(); + auto node = new MockTagNode>(); + auto timestampPacketStorage = new TimestampPackets(); node->tagForCpuAccess = timestampPacketStorage; uint32_t values[4] = {contextStart, globalStart, contextEnd, globalEnd}; @@ -1090,8 +1091,8 @@ struct ProfilingTimestampPacketsTest : public ::testing::Test { } void addTimestampNodeMultiOsContext(uint32_t globalStart[16], uint32_t globalEnd[16], uint32_t contextStart[16], uint32_t contextEnd[16], uint32_t size) { - auto node = new MockTagNode(); - auto timestampPacketStorage = new TimestampPacketStorage(); + auto node = new MockTagNode>(); + auto timestampPacketStorage = new TimestampPackets(); timestampPacketStorage->setPacketsUsed(size); for (uint32_t i = 0u; i < timestampPacketStorage->getPacketsUsed(); ++i) { diff --git a/opencl/test/unit_test/utilities/tag_allocator_tests.cpp b/opencl/test/unit_test/utilities/tag_allocator_tests.cpp index 9c3c280f63..e79e94cd17 100644 --- a/opencl/test/unit_test/utilities/tag_allocator_tests.cpp +++ b/opencl/test/unit_test/utilities/tag_allocator_tests.cpp @@ -21,29 +21,69 @@ using namespace NEO; struct TagAllocatorTest : public Test { - const DeviceBitfield deviceBitfield{0xf}; - DebugManagerStateRestore restorer; + class MockTimestampPackets32 : public TimestampPackets { + public: + void setTagToReadyState() { + auto packetsUsed = getPacketsUsed(); + initialize(); + + uint32_t zeros[4] = {}; + + for (uint32_t i = 0; i < TimestampPacketSizeControl::preferredPacketCount; i++) { + assignDataToAllTimestamps(i, zeros); + } + setPacketsUsed(packetsUsed); + + EXPECT_TRUE(isCompleted()); + } + + void setToNonReadyState() { + packets[0].contextEnd = 1; + EXPECT_FALSE(isCompleted()); + } + }; void SetUp() override { DebugManager.flags.CreateMultipleSubDevices.set(4); MemoryAllocatorFixture::SetUp(); } + + const DeviceBitfield deviceBitfield{0xf}; + DebugManagerStateRestore restorer; }; struct TimeStamps { void initialize() { start = 1; end = 2; - release = true; } - static GraphicsAllocation::AllocationType getAllocationType() { + static constexpr GraphicsAllocation::AllocationType getAllocationType() { return GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER; } - bool isCompleted() const { return release; } - uint32_t getImplicitGpuDependenciesCount() const { return 0; } - bool release; + + static constexpr TagNodeType getTagNodeType() { return TagNodeType::HwTimeStamps; } + + uint64_t getContextStartValue(uint32_t packetIndex) const { + return start; + } + + uint64_t getGlobalStartValue(uint32_t packetIndex) const { + return start; + } + + uint64_t getContextEndValue(uint32_t packetIndex) const { + return end; + } + + uint64_t getGlobalEndValue(uint32_t packetIndex) const { + return end; + } + uint64_t start; uint64_t end; + + uint64_t ContextCompleteTS; + uint64_t GlobalEndTS; }; template @@ -58,6 +98,7 @@ class MockTagAllocator : public TagAllocator { using BaseClass::populateFreeTags; using BaseClass::releaseDeferredTags; using BaseClass::usedTags; + using BaseClass::TagAllocatorBase::cleanUpResources; MockTagAllocator(MemoryManager *memMngr, size_t tagCount, size_t tagAlignment, bool disableCompletionCheck, DeviceBitfield deviceBitfield) : BaseClass(0, memMngr, tagCount, tagAlignment, sizeof(TagType), disableCompletionCheck, deviceBitfield) { @@ -115,7 +156,7 @@ TEST_F(TagAllocatorTest, WhenGettingAndReturningTagThenFreeAndUsedListsAreUpdate ASSERT_NE(nullptr, tagAllocator.getFreeTagsHead()); EXPECT_EQ(nullptr, tagAllocator.getUsedTagsHead()); - TagNode *tagNode = tagAllocator.getTag(); + auto tagNode = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, tagNode); @@ -151,7 +192,7 @@ TEST_F(TagAllocatorTest, WhenTagIsAllocatedThenItIsAligned) { ASSERT_NE(nullptr, tagAllocator.getFreeTagsHead()); - TagNode *tagNode = tagAllocator.getTag(); + TagNode *tagNode = static_cast *>(tagAllocator.getTag()); ASSERT_NE(nullptr, tagNode); EXPECT_EQ(0u, (uintptr_t)tagNode->tagForCpuAccess % alignment); @@ -170,13 +211,13 @@ TEST_F(TagAllocatorTest, givenTagAllocatorWhenAllNodesWereUsedThenCreateNewGraph TagNode *tagNodes[4]; for (size_t i = 0; i < 4; i++) { - tagNodes[i] = tagAllocator.getTag(); + tagNodes[i] = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, tagNodes[i]); } EXPECT_EQ(1u, tagAllocator.getGraphicsAllocationsCount()); EXPECT_EQ(1u, tagAllocator.getTagPoolCount()); - TagNode *tagNode = tagAllocator.getTag(); + TagNode *tagNode = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, tagNode); EXPECT_EQ(2u, tagAllocator.getGraphicsAllocationsCount()); @@ -188,7 +229,7 @@ TEST_F(TagAllocatorTest, givenInputTagCountWhenCreatingAllocatorThenRequestedNum public: using MockMemoryManager::MockMemoryManager; GraphicsAllocation *allocateGraphicsMemoryWithAlignment(const AllocationData &allocationData) override { - return new MemoryAllocation(0, TimestampPacketStorage::getAllocationType(), nullptr, nullptr, 0, MemoryConstants::pageSize, + return new MemoryAllocation(0, TimestampPackets::getAllocationType(), nullptr, nullptr, 0, MemoryConstants::pageSize, 1, MemoryPool::System4KBPages, false, false, mockMaxOsContextCount); } }; @@ -196,7 +237,7 @@ TEST_F(TagAllocatorTest, givenInputTagCountWhenCreatingAllocatorThenRequestedNum auto mockMemoryManager = std::make_unique(true, true, *executionEnvironment); const size_t tagsCount = 3; - MockTagAllocator tagAllocator(mockMemoryManager.get(), tagsCount, 1, deviceBitfield); + MockTagAllocator> tagAllocator(mockMemoryManager.get(), tagsCount, 1, deviceBitfield); size_t nodesFound = 0; auto head = tagAllocator.freeTags.peekHead(); @@ -219,13 +260,13 @@ TEST_F(TagAllocatorTest, GivenSpecificOrderWhenReturningTagsThenFreeListIsUpdate TagNode *tagNodes[4]; for (int i = 0; i < 4; i++) { - tagNodes[i] = tagAllocator.getTag(); + tagNodes[i] = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, tagNodes[i]); } EXPECT_EQ(1u, tagAllocator.getGraphicsAllocationsCount()); EXPECT_EQ(1u, tagAllocator.getTagPoolCount()); - TagNode *tagNode2 = tagAllocator.getTag(); + TagNode *tagNode2 = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, tagNode2); EXPECT_EQ(2u, tagAllocator.getGraphicsAllocationsCount()); EXPECT_EQ(2u, tagAllocator.getTagPoolCount()); @@ -263,10 +304,10 @@ TEST_F(TagAllocatorTest, WhenGettingTagsFromTwoPoolsThenTagsAreDifferent) { TagNode *tagNode1, *tagNode2; - tagNode1 = tagAllocator.getTag(); + tagNode1 = static_cast *>(tagAllocator.getTag()); ASSERT_NE(nullptr, tagNode1); - tagNode2 = tagAllocator.getTag(); + tagNode2 = static_cast *>(tagAllocator.getTag()); ASSERT_NE(nullptr, tagNode2); EXPECT_EQ(2u, tagAllocator.getGraphicsAllocationsCount()); @@ -286,11 +327,11 @@ TEST_F(TagAllocatorTest, WhenCleaningUpResourcesThenAllResourcesAreReleased) { TagNode *tagNode1, *tagNode2; // Allocate first Pool - tagNode1 = tagAllocator.getTag(); + tagNode1 = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, tagNode1); // Allocate second Pool - tagNode2 = tagAllocator.getTag(); + tagNode2 = static_cast *>(tagAllocator.getTag()); ASSERT_NE(nullptr, tagNode2); // Two pools should have different gfxAllocations @@ -312,7 +353,7 @@ TEST_F(TagAllocatorTest, whenNewTagIsTakenThenItIsInitialized) { tagAllocator.getFreeTagsHead()->tagForCpuAccess->end = 4; tagAllocator.getFreeTagsHead()->setProfilingCapable(false); - auto node = tagAllocator.getTag(); + auto node = static_cast *>(tagAllocator.getTag()); EXPECT_EQ(1u, node->tagForCpuAccess->start); EXPECT_EQ(2u, node->tagForCpuAccess->end); EXPECT_TRUE(node->isProfilingCapable()); @@ -337,10 +378,10 @@ TEST_F(TagAllocatorTest, givenMultipleReferencesOnTagWhenReleasingThenReturnWhen } TEST_F(TagAllocatorTest, givenNotReadyTagWhenReturnedThenMoveToDeferredList) { - MockTagAllocator tagAllocator(memoryManager, 1, 1, deviceBitfield); - auto node = tagAllocator.getTag(); + MockTagAllocator tagAllocator(memoryManager, 1, 1, deviceBitfield); + auto node = static_cast *>(tagAllocator.getTag()); - node->tagForCpuAccess->release = false; + node->tagForCpuAccess->setToNonReadyState(); EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty()); tagAllocator.returnTag(node); EXPECT_FALSE(tagAllocator.deferredTags.peekIsEmpty()); @@ -379,10 +420,10 @@ TEST_F(TagAllocatorTest, givenTagAllocatorWhenDisabledCompletionCheckThenNodeInh } TEST_F(TagAllocatorTest, givenReadyTagWhenReturnedThenMoveToFreeList) { - MockTagAllocator tagAllocator(memoryManager, 1, 1, deviceBitfield); - auto node = tagAllocator.getTag(); + MockTagAllocator tagAllocator(memoryManager, 1, 1, deviceBitfield); + auto node = static_cast *>(tagAllocator.getTag()); - node->tagForCpuAccess->release = true; + node->tagForCpuAccess->setTagToReadyState(); EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty()); tagAllocator.returnTag(node); EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty()); @@ -390,25 +431,25 @@ TEST_F(TagAllocatorTest, givenReadyTagWhenReturnedThenMoveToFreeList) { } TEST_F(TagAllocatorTest, givenEmptyFreeListWhenAskingForNewTagThenTryToReleaseDeferredListFirst) { - MockTagAllocator tagAllocator(memoryManager, 1, 1, deviceBitfield); - auto node = tagAllocator.getTag(); + MockTagAllocator tagAllocator(memoryManager, 1, 1, deviceBitfield); + auto node = static_cast *>(tagAllocator.getTag()); - node->tagForCpuAccess->release = false; + node->tagForCpuAccess->setToNonReadyState(); tagAllocator.returnTag(node); - node->tagForCpuAccess->release = false; + node->tagForCpuAccess->setToNonReadyState(); EXPECT_TRUE(tagAllocator.freeTags.peekIsEmpty()); - node = tagAllocator.getTag(); + node = static_cast *>(tagAllocator.getTag()); EXPECT_NE(nullptr, node); EXPECT_TRUE(tagAllocator.freeTags.peekIsEmpty()); // empty again - new pool wasnt allocated } TEST_F(TagAllocatorTest, givenTagsOnDeferredListWhenReleasingItThenMoveReadyTagsToFreePool) { - MockTagAllocator tagAllocator(memoryManager, 2, 1, deviceBitfield); // pool with 2 tags - auto node1 = tagAllocator.getTag(); - auto node2 = tagAllocator.getTag(); + MockTagAllocator tagAllocator(memoryManager, 2, 1, deviceBitfield); // pool with 2 tags + auto node1 = static_cast *>(tagAllocator.getTag()); + auto node2 = static_cast *>(tagAllocator.getTag()); - node1->tagForCpuAccess->release = false; - node2->tagForCpuAccess->release = false; + node1->tagForCpuAccess->setToNonReadyState(); + node2->tagForCpuAccess->setToNonReadyState(); tagAllocator.returnTag(node1); tagAllocator.returnTag(node2); @@ -416,19 +457,19 @@ TEST_F(TagAllocatorTest, givenTagsOnDeferredListWhenReleasingItThenMoveReadyTags EXPECT_FALSE(tagAllocator.deferredTags.peekIsEmpty()); EXPECT_TRUE(tagAllocator.freeTags.peekIsEmpty()); - node1->tagForCpuAccess->release = true; + node1->tagForCpuAccess->setTagToReadyState(); tagAllocator.releaseDeferredTags(); EXPECT_FALSE(tagAllocator.deferredTags.peekIsEmpty()); EXPECT_FALSE(tagAllocator.freeTags.peekIsEmpty()); - node2->tagForCpuAccess->release = true; + node2->tagForCpuAccess->setTagToReadyState(); tagAllocator.releaseDeferredTags(); EXPECT_TRUE(tagAllocator.deferredTags.peekIsEmpty()); EXPECT_FALSE(tagAllocator.freeTags.peekIsEmpty()); } TEST_F(TagAllocatorTest, givenTagAllocatorWhenGraphicsAllocationIsCreatedThenSetValidllocationType) { - TagAllocator timestampPacketAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(TimestampPacketStorage), false, mockDeviceBitfield); + TagAllocator> timestampPacketAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(TimestampPackets), false, mockDeviceBitfield); TagAllocator hwTimeStampsAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(HwTimeStamps), false, mockDeviceBitfield); TagAllocator hwPerfCounterAllocator(mockRootDeviceIndex, memoryManager, 1, 1, sizeof(HwPerfCounter), false, mockDeviceBitfield); @@ -440,3 +481,53 @@ TEST_F(TagAllocatorTest, givenTagAllocatorWhenGraphicsAllocationIsCreatedThenSet EXPECT_EQ(GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER, hwTimeStampsTag->getBaseGraphicsAllocation()->getAllocationType()); EXPECT_EQ(GraphicsAllocation::AllocationType::PROFILING_TAG_BUFFER, hwPerfCounterTag->getBaseGraphicsAllocation()->getAllocationType()); } + +TEST_F(TagAllocatorTest, givenNotSupportedTagTypeWhenCallingMethodThenAbortOrReturnInitialValue) { + + { + TagNode perfCounterNode = {}; + + EXPECT_ANY_THROW(perfCounterNode.getGlobalStartOffset()); + EXPECT_ANY_THROW(perfCounterNode.getContextStartOffset()); + EXPECT_ANY_THROW(perfCounterNode.getContextEndOffset()); + EXPECT_ANY_THROW(perfCounterNode.getGlobalEndOffset()); + EXPECT_ANY_THROW(perfCounterNode.getImplicitGpuDependenciesCountOffset()); + EXPECT_ANY_THROW(perfCounterNode.getContextStartValue(0)); + EXPECT_ANY_THROW(perfCounterNode.getGlobalStartValue(0)); + EXPECT_ANY_THROW(perfCounterNode.getContextEndValue(0)); + EXPECT_ANY_THROW(perfCounterNode.getGlobalEndValue(0)); + EXPECT_ANY_THROW(perfCounterNode.getContextCompleteRef()); + EXPECT_ANY_THROW(perfCounterNode.getGlobalEndRef()); + EXPECT_ANY_THROW(perfCounterNode.setPacketsUsed(0)); + EXPECT_ANY_THROW(perfCounterNode.getPacketsUsed()); + EXPECT_EQ(0u, perfCounterNode.getImplicitGpuDependenciesCount()); + EXPECT_ANY_THROW(perfCounterNode.getSinglePacketSize()); + EXPECT_ANY_THROW(perfCounterNode.assignDataToAllTimestamps(0, nullptr)); + EXPECT_TRUE(perfCounterNode.isCompleted()); + } + + { + TagNode hwTimestampNode = {}; + + EXPECT_ANY_THROW(hwTimestampNode.getGlobalStartOffset()); + EXPECT_ANY_THROW(hwTimestampNode.getContextStartOffset()); + EXPECT_ANY_THROW(hwTimestampNode.getContextEndOffset()); + EXPECT_ANY_THROW(hwTimestampNode.getGlobalEndOffset()); + EXPECT_ANY_THROW(hwTimestampNode.getImplicitGpuDependenciesCountOffset()); + EXPECT_ANY_THROW(hwTimestampNode.setPacketsUsed(0)); + EXPECT_ANY_THROW(hwTimestampNode.getPacketsUsed()); + EXPECT_EQ(0u, hwTimestampNode.getImplicitGpuDependenciesCount()); + EXPECT_ANY_THROW(hwTimestampNode.getSinglePacketSize()); + EXPECT_ANY_THROW(hwTimestampNode.assignDataToAllTimestamps(0, nullptr)); + EXPECT_TRUE(hwTimestampNode.isCompleted()); + EXPECT_ANY_THROW(hwTimestampNode.getQueryHandleRef()); + } + + { + TagNode> timestampPacketsNode = {}; + + EXPECT_ANY_THROW(timestampPacketsNode.getContextCompleteRef()); + EXPECT_ANY_THROW(timestampPacketsNode.getGlobalEndRef()); + EXPECT_ANY_THROW(timestampPacketsNode.getQueryHandleRef()); + } +} diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 4d4d0da9f3..313eb49f23 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -606,15 +606,15 @@ bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surfa return true; } -TagAllocator *CommandStreamReceiver::getEventTsAllocator() { +TagAllocatorBase *CommandStreamReceiver::getEventTsAllocator() { if (profilingTimeStampAllocator.get() == nullptr) { - profilingTimeStampAllocator = std::make_unique>( - rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false, osContext->getDeviceBitfield()); + profilingTimeStampAllocator = std::make_unique>(rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, + sizeof(HwTimeStamps), false, osContext->getDeviceBitfield()); } return profilingTimeStampAllocator.get(); } -TagAllocator *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) { +TagAllocatorBase *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) { if (perfCounterAllocator.get() == nullptr) { perfCounterAllocator = std::make_unique>( rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, tagSize, false, osContext->getDeviceBitfield()); @@ -622,15 +622,15 @@ TagAllocator *CommandStreamReceiver::getEventPerfCountAllocator(c return perfCounterAllocator.get(); } -TagAllocator *CommandStreamReceiver::getTimestampPacketAllocator() { +TagAllocatorBase *CommandStreamReceiver::getTimestampPacketAllocator() { if (timestampPacketAllocator.get() == nullptr) { // dont release nodes in aub/tbx mode, to avoid removing semaphores optimization or reusing returned tags bool doNotReleaseNodes = (getType() > CommandStreamReceiverType::CSR_HW) || DebugManager.flags.DisableTimestampPacketOptimizations.get(); - timestampPacketAllocator = std::make_unique>( + timestampPacketAllocator = std::make_unique>>( rootDeviceIndex, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize * 4, - sizeof(TimestampPacketStorage), doNotReleaseNodes, osContext->getDeviceBitfield()); + sizeof(NEO::TimestampPackets), doNotReleaseNodes, osContext->getDeviceBitfield()); } return timestampPacketAllocator.get(); } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index b4c9c0ffa7..61156eb16d 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -43,8 +43,9 @@ class MultiGraphicsAllocation; class OsContext; class OSInterface; class ScratchSpaceController; -struct HwPerfCounter; -struct HwTimeStamps; +class HwPerfCounter; +class HwTimeStamps; +class TagAllocatorBase; template class TimestampPackets; @@ -192,9 +193,9 @@ class CommandStreamReceiver { virtual void setupContext(OsContext &osContext) { this->osContext = &osContext; } OsContext &getOsContext() const { return *osContext; } - TagAllocator *getEventTsAllocator(); - TagAllocator *getEventPerfCountAllocator(const uint32_t tagSize); - TagAllocator *getTimestampPacketAllocator(); + TagAllocatorBase *getEventTsAllocator(); + TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize); + TagAllocatorBase *getTimestampPacketAllocator(); virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation); @@ -261,9 +262,9 @@ class CommandStreamReceiver { std::unique_ptr internalAllocationStorage; std::unique_ptr kmdNotifyHelper; std::unique_ptr scratchSpaceController; - std::unique_ptr> profilingTimeStampAllocator; - std::unique_ptr> perfCounterAllocator; - std::unique_ptr> timestampPacketAllocator; + std::unique_ptr profilingTimeStampAllocator; + std::unique_ptr perfCounterAllocator; + std::unique_ptr timestampPacketAllocator; std::unique_ptr userPauseConfirmation; ResidencyContainer residencyAllocations; diff --git a/shared/source/helpers/blit_commands_helper.h b/shared/source/helpers/blit_commands_helper.h index 617e21ec6b..515cbb1968 100644 --- a/shared/source/helpers/blit_commands_helper.h +++ b/shared/source/helpers/blit_commands_helper.h @@ -26,16 +26,17 @@ class LinearStream; struct RootDeviceEnvironment; template -struct TagNode; +class TagNode; template class TimestampPackets; +class TagNodeBase; + struct BlitProperties; struct HardwareInfo; struct TimestampPacketDependencies; using BlitPropertiesContainer = StackVec; -using TimestampPacketStorage = TimestampPackets; struct BlitProperties { static BlitProperties constructPropertiesForReadWriteBuffer(BlitterConstants::BlitDirection blitDirection, @@ -60,7 +61,7 @@ struct BlitProperties { TimestampPacketContainer &kernelTimestamps, const CsrDependencies &depsFromEvents, CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr); - TagNode *outputTimestampPacket = nullptr; + TagNodeBase *outputTimestampPacket = nullptr; BlitterConstants::BlitDirection blitDirection; CsrDependencies csrDependencies; AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None; diff --git a/shared/source/helpers/common_types.h b/shared/source/helpers/common_types.h index da0922f7dd..eee4d5b664 100644 --- a/shared/source/helpers/common_types.h +++ b/shared/source/helpers/common_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -23,4 +23,13 @@ enum class DebugPauseState : uint32_t { hasUserEndConfirmation, terminate }; + +class TagTypeBase { +}; + +enum class TagNodeType { + TimestampPacket, + HwTimeStamps, + HwPerfCounter +}; } // namespace NEO diff --git a/shared/source/helpers/timestamp_packet.cpp b/shared/source/helpers/timestamp_packet.cpp index d51ba29355..abd12b8977 100644 --- a/shared/source/helpers/timestamp_packet.cpp +++ b/shared/source/helpers/timestamp_packet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,7 +13,7 @@ using namespace NEO; -void TimestampPacketContainer::add(Node *timestampPacketNode) { +void TimestampPacketContainer::add(TagNodeBase *timestampPacketNode) { timestampPacketNodes.push_back(timestampPacketNode); } @@ -28,7 +28,7 @@ void TimestampPacketContainer::swapNodes(TimestampPacketContainer ×tampPack } void TimestampPacketContainer::resolveDependencies(bool clearAllDependencies) { - std::vector pendingNodes; + std::vector pendingNodes; for (auto node : timestampPacketNodes) { if (node->canBeReleased() || clearAllDependencies) { diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index 724af11825..52cf590c13 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -31,7 +31,7 @@ constexpr uint32_t preferredPacketCount = 16u; #pragma pack(1) template -class TimestampPackets { +class TimestampPackets : public TagTypeBase { public: struct Packet { TSize contextStart = 1u; @@ -40,10 +40,14 @@ class TimestampPackets { TSize globalEnd = 1u; }; - static GraphicsAllocation::AllocationType getAllocationType() { + static constexpr GraphicsAllocation::AllocationType getAllocationType() { return GraphicsAllocation::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER; } + static constexpr TagNodeType getTagNodeType() { return TagNodeType::TimestampPacket; } + + size_t getSinglePacketSize() const { return sizeof(Packet); } + bool isCompleted() const { if (DebugManager.flags.DisableAtomicForPostSyncs.get()) { return false; @@ -96,29 +100,25 @@ class TimestampPackets { }; #pragma pack() -using TimestampPacketStorage = TimestampPackets; - -static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPacketStorage), +static_assert(((4 * TimestampPacketSizeControl::preferredPacketCount + 2) * sizeof(uint32_t)) == sizeof(TimestampPackets), "This structure is consumed by GPU and has to follow specific restrictions for padding and size"); class TimestampPacketContainer : public NonCopyableClass { public: - using Node = TagNode; - TimestampPacketContainer() = default; TimestampPacketContainer(TimestampPacketContainer &&) = default; TimestampPacketContainer &operator=(TimestampPacketContainer &&) = default; MOCKABLE_VIRTUAL ~TimestampPacketContainer(); - const std::vector &peekNodes() const { return timestampPacketNodes; } - void add(Node *timestampPacketNode); + const std::vector &peekNodes() const { return timestampPacketNodes; } + void add(TagNodeBase *timestampPacketNode); void swapNodes(TimestampPacketContainer ×tampPacketContainer); void assignAndIncrementNodesRefCounts(const TimestampPacketContainer &inputTimestampPacketContainer); void resolveDependencies(bool clearAllDependencies); void makeResident(CommandStreamReceiver &commandStreamReceiver); protected: - std::vector timestampPacketNodes; + std::vector timestampPacketNodes; }; struct TimestampPacketDependencies : public NonCopyableClass { @@ -130,27 +130,27 @@ struct TimestampPacketDependencies : public NonCopyableClass { }; struct TimestampPacketHelper { - static uint64_t getContextEndGpuAddress(const TagNode ×tampPacketNode) { - return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getContextEndOffset(); + static uint64_t getContextEndGpuAddress(const TagNodeBase ×tampPacketNode) { + return timestampPacketNode.getGpuAddress() + timestampPacketNode.getContextEndOffset(); } - static uint64_t getContextStartGpuAddress(const TagNode ×tampPacketNode) { - return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getContextStartOffset(); + static uint64_t getContextStartGpuAddress(const TagNodeBase ×tampPacketNode) { + return timestampPacketNode.getGpuAddress() + timestampPacketNode.getContextStartOffset(); } - static uint64_t getGlobalEndGpuAddress(const TagNode ×tampPacketNode) { - return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getGlobalEndOffset(); + static uint64_t getGlobalEndGpuAddress(const TagNodeBase ×tampPacketNode) { + return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalEndOffset(); } - static uint64_t getGlobalStartGpuAddress(const TagNode ×tampPacketNode) { - return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getGlobalStartOffset(); + static uint64_t getGlobalStartGpuAddress(const TagNodeBase ×tampPacketNode) { + return timestampPacketNode.getGpuAddress() + timestampPacketNode.getGlobalStartOffset(); } - static uint64_t getGpuDependenciesCountGpuAddress(const TagNode ×tampPacketNode) { - return timestampPacketNode.getGpuAddress() + timestampPacketNode.tagForCpuAccess->getImplicitGpuDependenciesCountOffset(); + static uint64_t getGpuDependenciesCountGpuAddress(const TagNodeBase ×tampPacketNode) { + return timestampPacketNode.getGpuAddress() + timestampPacketNode.getImplicitGpuDependenciesCountOffset(); } static void overrideSupportedDevicesCount(uint32_t &numSupportedDevices); template - static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNode ×tampPacketNode, uint32_t numSupportedDevices) { + static void programSemaphoreWithImplicitDependency(LinearStream &cmdStream, TagNodeBase ×tampPacketNode, uint32_t numSupportedDevices) { using MI_ATOMIC = typename GfxFamily::MI_ATOMIC; using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; @@ -158,8 +158,8 @@ struct TimestampPacketHelper { auto compareAddress = getContextEndGpuAddress(timestampPacketNode); auto dependenciesCountAddress = getGpuDependenciesCountGpuAddress(timestampPacketNode); - for (uint32_t packetId = 0; packetId < timestampPacketNode.tagForCpuAccess->getPacketsUsed(); packetId++) { - uint64_t compareOffset = packetId * sizeof(TimestampPacketStorage::Packet); + for (uint32_t packetId = 0; packetId < timestampPacketNode.getPacketsUsed(); packetId++) { + uint64_t compareOffset = packetId * timestampPacketNode.getSinglePacketSize(); EncodeSempahore::addMiSemaphoreWaitCommand(cmdStream, compareAddress + compareOffset, 1, COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); } @@ -231,8 +231,8 @@ struct TimestampPacketHelper { } template - static size_t getRequiredCmdStreamSizeForNodeDependency(TagNode ×tampPacketNode) { - size_t totalMiSemaphoreWaitSize = timestampPacketNode.tagForCpuAccess->getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); + static size_t getRequiredCmdStreamSizeForNodeDependency(TagNodeBase ×tampPacketNode) { + size_t totalMiSemaphoreWaitSize = timestampPacketNode.getPacketsUsed() * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); return totalMiSemaphoreWaitSize + sizeof(typename GfxFamily::MI_ATOMIC); } diff --git a/shared/source/utilities/CMakeLists.txt b/shared/source/utilities/CMakeLists.txt index 148546fdd1..16a2b5a2d5 100644 --- a/shared/source/utilities/CMakeLists.txt +++ b/shared/source/utilities/CMakeLists.txt @@ -34,7 +34,9 @@ set(NEO_CORE_UTILITIES ${CMAKE_CURRENT_SOURCE_DIR}/software_tags_manager.h ${CMAKE_CURRENT_SOURCE_DIR}/spinlock.h ${CMAKE_CURRENT_SOURCE_DIR}/stackvec.h + ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.h + ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.inl ${CMAKE_CURRENT_SOURCE_DIR}/time_measure_wrapper.h ${CMAKE_CURRENT_SOURCE_DIR}/timer_util.h ) diff --git a/shared/source/utilities/tag_allocator.cpp b/shared/source/utilities/tag_allocator.cpp new file mode 100644 index 0000000000..106470c3f7 --- /dev/null +++ b/shared/source/utilities/tag_allocator.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/utilities/tag_allocator.h" + +namespace NEO { + +TagAllocatorBase::TagAllocatorBase(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, DeviceBitfield deviceBitfield) + : deviceBitfield(deviceBitfield), rootDeviceIndex(rootDeviceIndex), memoryManager(memMngr), tagCount(tagCount), tagSize(tagSize), doNotReleaseNodes(doNotReleaseNodes) { + + this->tagSize = alignUp(tagSize, tagAlignment); +} + +void TagAllocatorBase::cleanUpResources() { + for (auto gfxAllocation : gfxAllocations) { + memoryManager->freeGraphicsMemory(gfxAllocation); + } + gfxAllocations.clear(); +} + +void TagNodeBase::returnTag() { + allocator->returnTag(this); +} + +bool TagNodeBase::canBeReleased() const { + return (!doNotReleaseNodes) && + (isCompleted()) && + (getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount()); +} + +} // namespace NEO diff --git a/shared/source/utilities/tag_allocator.h b/shared/source/utilities/tag_allocator.h index 8275ad7386..a9b804f339 100644 --- a/shared/source/utilities/tag_allocator.h +++ b/shared/source/utilities/tag_allocator.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2020 Intel Corporation + * Copyright (C) 2017-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace NEO { @@ -23,28 +24,31 @@ template class TagAllocator; template -struct TagNode : public IDNode>, NonCopyableOrMovableClass { +class TagNode; + +class TagAllocatorBase; + +class TagNodeBase : public NonCopyableOrMovableClass { public: - TagType *tagForCpuAccess; + virtual ~TagNodeBase() = default; GraphicsAllocation *getBaseGraphicsAllocation() const { return gfxAllocation; } + uint64_t getGpuAddress() const { return gpuAddress; } void incRefCount() { refCount++; } - MOCKABLE_VIRTUAL void returnTag() { - allocator->returnTag(this); - } + uint32_t refCountFetchSub(uint32_t value) { return refCount.fetch_sub(value); } - bool canBeReleased() const { - return (!doNotReleaseNodes) && - (tagForCpuAccess->isCompleted()) && - (tagForCpuAccess->getImplicitGpuDependenciesCount() == getImplicitCpuDependenciesCount()); - } + MOCKABLE_VIRTUAL void returnTag(); - void setDoNotReleaseNodes(bool doNotRelease) { - doNotReleaseNodes = doNotRelease; - } + virtual void initialize() = 0; + + bool canBeReleased() const; + + virtual void *getCpuBase() const = 0; + + void setDoNotReleaseNodes(bool doNotRelease) { doNotReleaseNodes = doNotRelease; } void setProfilingCapable(bool capable) { profilingCapable = capable; } @@ -52,18 +56,42 @@ struct TagNode : public IDNode>, NonCopyableOrMovableClass { void incImplicitCpuDependenciesCount() { implicitCpuDependenciesCount++; } - void initialize() { - tagForCpuAccess->initialize(); - implicitCpuDependenciesCount.store(0); - setProfilingCapable(true); - } - uint32_t getImplicitCpuDependenciesCount() const { return implicitCpuDependenciesCount.load(); } - const TagAllocator *getAllocator() const { return allocator; } + const TagAllocatorBase *getAllocator() const { return allocator; } + + // TagType specific calls + virtual bool isCompleted() const = 0; + virtual void assignDataToAllTimestamps(uint32_t packetIndex, void *source) = 0; + + virtual size_t getGlobalStartOffset() const = 0; + virtual size_t getContextStartOffset() const = 0; + virtual size_t getContextEndOffset() const = 0; + virtual size_t getGlobalEndOffset() const = 0; + virtual size_t getImplicitGpuDependenciesCountOffset() const = 0; + + virtual uint64_t getContextStartValue(uint32_t packetIndex) const = 0; + virtual uint64_t getGlobalStartValue(uint32_t packetIndex) const = 0; + virtual uint64_t getContextEndValue(uint32_t packetIndex) const = 0; + virtual uint64_t getGlobalEndValue(uint32_t packetIndex) const = 0; + + virtual uint64_t &getGlobalEndRef() const = 0; + virtual uint64_t &getContextCompleteRef() const = 0; + + virtual void setPacketsUsed(uint32_t used) = 0; + virtual uint32_t getPacketsUsed() const = 0; + + virtual size_t getSinglePacketSize() const = 0; + + virtual uint32_t getImplicitGpuDependenciesCount() const = 0; + + virtual MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const = 0; protected: - TagAllocator *allocator = nullptr; + TagNodeBase() = default; + + TagAllocatorBase *allocator = nullptr; + GraphicsAllocation *gfxAllocation = nullptr; uint64_t gpuAddress = 0; std::atomic refCount{0}; @@ -71,71 +99,78 @@ struct TagNode : public IDNode>, NonCopyableOrMovableClass { bool doNotReleaseNodes = false; bool profilingCapable = true; - template + template friend class TagAllocator; }; template -class TagAllocator { +class TagNode : public TagNodeBase, public IDNode> { + static_assert(!std::is_polymorphic::value, + "This structure is consumed by GPU and has to follow specific restrictions for padding and size"); + public: - using NodeType = TagNode; + TagType *tagForCpuAccess; - TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, - size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, - DeviceBitfield deviceBitfield) : deviceBitfield(deviceBitfield), - rootDeviceIndex(rootDeviceIndex), - memoryManager(memMngr), - tagCount(tagCount), - doNotReleaseNodes(doNotReleaseNodes) { - - this->tagSize = alignUp(tagSize, tagAlignment); - populateFreeTags(); + void initialize() override { + tagForCpuAccess->initialize(); + implicitCpuDependenciesCount.store(0); + setProfilingCapable(true); } - MOCKABLE_VIRTUAL ~TagAllocator() { - cleanUpResources(); - } + void *getCpuBase() const override { return tagForCpuAccess; } - void cleanUpResources() { - for (auto gfxAllocation : gfxAllocations) { - memoryManager->freeGraphicsMemory(gfxAllocation); - } - gfxAllocations.clear(); - } + void assignDataToAllTimestamps(uint32_t packetIndex, void *source) override; - NodeType *getTag() { - if (freeTags.peekIsEmpty()) { - releaseDeferredTags(); - } - NodeType *node = freeTags.removeFrontOne().release(); - if (!node) { - std::unique_lock lock(allocatorMutex); - populateFreeTags(); - node = freeTags.removeFrontOne().release(); - } - usedTags.pushFrontOne(*node); - node->incRefCount(); - node->initialize(); - return node; - } + bool isCompleted() const override; - MOCKABLE_VIRTUAL void returnTag(NodeType *node) { - if (node->refCount.fetch_sub(1) == 1) { - if (node->canBeReleased()) { - returnTagToFreePool(node); - } else { - returnTagToDeferredPool(node); - } - } - } + size_t getGlobalStartOffset() const override; + size_t getContextStartOffset() const override; + size_t getContextEndOffset() const override; + size_t getGlobalEndOffset() const override; + size_t getImplicitGpuDependenciesCountOffset() const override; + + uint64_t getContextStartValue(uint32_t packetIndex) const override; + uint64_t getGlobalStartValue(uint32_t packetIndex) const override; + uint64_t getContextEndValue(uint32_t packetIndex) const override; + uint64_t getGlobalEndValue(uint32_t packetIndex) const override; + + uint64_t &getGlobalEndRef() const override; + uint64_t &getContextCompleteRef() const override; + + void setPacketsUsed(uint32_t used) override; + uint32_t getPacketsUsed() const override; + + size_t getSinglePacketSize() const override; + + uint32_t getImplicitGpuDependenciesCount() const override; + + MetricsLibraryApi::QueryHandle_1_0 &getQueryHandleRef() const override; +}; + +class TagAllocatorBase { + public: + virtual ~TagAllocatorBase() { cleanUpResources(); }; + + virtual void returnTag(TagNodeBase *node) = 0; + + virtual TagNodeBase *getTag() = 0; protected: - IDList freeTags; - IDList usedTags; - IDList deferredTags; - std::vector gfxAllocations; - std::vector> tagPoolMemory; + TagAllocatorBase() = delete; + TagAllocatorBase(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, + size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, + DeviceBitfield deviceBitfield); + + virtual void returnTagToFreePool(TagNodeBase *node) = 0; + + virtual void returnTagToDeferredPool(TagNodeBase *node) = 0; + + virtual void releaseDeferredTags() = 0; + + void cleanUpResources(); + + std::vector gfxAllocations; const DeviceBitfield deviceBitfield; const uint32_t rootDeviceIndex; MemoryManager *memoryManager; @@ -144,66 +179,38 @@ class TagAllocator { bool doNotReleaseNodes = false; std::mutex allocatorMutex; +}; - MOCKABLE_VIRTUAL void returnTagToFreePool(NodeType *node) { - NodeType *usedNode = usedTags.removeOne(*node).release(); - DEBUG_BREAK_IF(usedNode == nullptr); - UNUSED_VARIABLE(usedNode); - freeTags.pushFrontOne(*node); - } +template +class TagAllocator : public TagAllocatorBase { + public: + using NodeType = TagNode; - void returnTagToDeferredPool(NodeType *node) { - NodeType *usedNode = usedTags.removeOne(*node).release(); - DEBUG_BREAK_IF(!usedNode); - deferredTags.pushFrontOne(*usedNode); - } + TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, + size_t tagAlignment, size_t tagSize, bool doNotReleaseNodes, + DeviceBitfield deviceBitfield); - void populateFreeTags() { - size_t allocationSizeRequired = tagCount * tagSize; + TagNodeBase *getTag() override; - auto allocationType = TagType::getAllocationType(); - AllocationProperties allocationProperties{rootDeviceIndex, allocationSizeRequired, allocationType, deviceBitfield}; - GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties); - gfxAllocations.push_back(graphicsAllocation); + void returnTag(TagNodeBase *node) override; - auto nodesMemory = std::make_unique(tagCount); + protected: + TagAllocator() = delete; - for (size_t i = 0; i < tagCount; ++i) { - auto tagOffset = i * tagSize; + void returnTagToFreePool(TagNodeBase *node) override; - nodesMemory[i].allocator = this; - nodesMemory[i].gfxAllocation = graphicsAllocation; - nodesMemory[i].tagForCpuAccess = reinterpret_cast(ptrOffset(graphicsAllocation->getUnderlyingBuffer(), tagOffset)); - nodesMemory[i].gpuAddress = graphicsAllocation->getGpuAddress() + tagOffset; - nodesMemory[i].setDoNotReleaseNodes(doNotReleaseNodes); + void returnTagToDeferredPool(TagNodeBase *node) override; - freeTags.pushTailOne(nodesMemory[i]); - } + void releaseDeferredTags() override; - tagPoolMemory.push_back(std::move(nodesMemory)); - } + void populateFreeTags(); - void releaseDeferredTags() { - IDList pendingFreeTags; - IDList pendingDeferredTags; - auto currentNode = deferredTags.detachNodes(); + IDList freeTags; + IDList usedTags; + IDList deferredTags; - while (currentNode != nullptr) { - auto nextNode = currentNode->next; - if (currentNode->canBeReleased()) { - pendingFreeTags.pushFrontOne(*currentNode); - } else { - pendingDeferredTags.pushFrontOne(*currentNode); - } - currentNode = nextNode; - } - - if (!pendingFreeTags.peekIsEmpty()) { - freeTags.splice(*pendingFreeTags.detachNodes()); - } - if (!pendingDeferredTags.peekIsEmpty()) { - deferredTags.splice(*pendingDeferredTags.detachNodes()); - } - } + std::vector> tagPoolMemory; }; } // namespace NEO + +#include "shared/source/utilities/tag_allocator.inl" diff --git a/shared/source/utilities/tag_allocator.inl b/shared/source/utilities/tag_allocator.inl new file mode 100644 index 0000000000..d81371f0f8 --- /dev/null +++ b/shared/source/utilities/tag_allocator.inl @@ -0,0 +1,282 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/utilities/tag_allocator.h" + +namespace NEO { +template +TagAllocator::TagAllocator(uint32_t rootDeviceIndex, MemoryManager *memMngr, size_t tagCount, size_t tagAlignment, + size_t tagSize, bool doNotReleaseNodes, DeviceBitfield deviceBitfield) + : TagAllocatorBase(rootDeviceIndex, memMngr, tagCount, tagAlignment, tagSize, doNotReleaseNodes, deviceBitfield) { + + populateFreeTags(); +} + +template +TagNodeBase *TagAllocator::getTag() { + if (freeTags.peekIsEmpty()) { + releaseDeferredTags(); + } + auto node = freeTags.removeFrontOne().release(); + if (!node) { + std::unique_lock lock(allocatorMutex); + populateFreeTags(); + node = freeTags.removeFrontOne().release(); + } + usedTags.pushFrontOne(*node); + node->incRefCount(); + node->initialize(); + return node; +} + +template +void TagAllocator::returnTagToFreePool(TagNodeBase *node) { + auto nodeT = static_cast(node); + auto usedNode = usedTags.removeOne(*nodeT).release(); + DEBUG_BREAK_IF(usedNode == nullptr); + UNUSED_VARIABLE(usedNode); + freeTags.pushFrontOne(*nodeT); +} + +template +void TagAllocator::returnTagToDeferredPool(TagNodeBase *node) { + auto nodeT = static_cast(node); + auto usedNode = usedTags.removeOne(*nodeT).release(); + DEBUG_BREAK_IF(!usedNode); + deferredTags.pushFrontOne(*usedNode); +} + +template +void TagAllocator::releaseDeferredTags() { + IDList pendingFreeTags; + IDList pendingDeferredTags; + auto currentNode = deferredTags.detachNodes(); + + while (currentNode != nullptr) { + auto nextNode = currentNode->next; + if (currentNode->canBeReleased()) { + pendingFreeTags.pushFrontOne(*currentNode); + } else { + pendingDeferredTags.pushFrontOne(*currentNode); + } + currentNode = nextNode; + } + + if (!pendingFreeTags.peekIsEmpty()) { + freeTags.splice(*pendingFreeTags.detachNodes()); + } + if (!pendingDeferredTags.peekIsEmpty()) { + deferredTags.splice(*pendingDeferredTags.detachNodes()); + } +} + +template +void TagAllocator::populateFreeTags() { + size_t allocationSizeRequired = tagCount * tagSize; + + AllocationProperties allocationProperties{rootDeviceIndex, allocationSizeRequired, TagType::getAllocationType(), deviceBitfield}; + GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties); + gfxAllocations.push_back(graphicsAllocation); + + auto nodesMemory = std::make_unique(tagCount); + + for (size_t i = 0; i < tagCount; ++i) { + auto tagOffset = i * tagSize; + + nodesMemory[i].allocator = this; + nodesMemory[i].gfxAllocation = graphicsAllocation; + nodesMemory[i].tagForCpuAccess = reinterpret_cast(ptrOffset(graphicsAllocation->getUnderlyingBuffer(), tagOffset)); + nodesMemory[i].gpuAddress = graphicsAllocation->getGpuAddress() + tagOffset; + nodesMemory[i].setDoNotReleaseNodes(doNotReleaseNodes); + + freeTags.pushTailOne(nodesMemory[i]); + } + + tagPoolMemory.push_back(std::move(nodesMemory)); +} + +template +void TagAllocator::returnTag(TagNodeBase *node) { + if (node->refCountFetchSub(1) == 1) { + if (node->canBeReleased()) { + returnTagToFreePool(node); + } else { + returnTagToDeferredPool(node); + } + } +} + +template +size_t TagNode::getGlobalStartOffset() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getGlobalStartOffset(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +size_t TagNode::getContextStartOffset() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getContextStartOffset(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +size_t TagNode::getContextEndOffset() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getContextEndOffset(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +size_t TagNode::getGlobalEndOffset() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getGlobalEndOffset(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +size_t TagNode::getImplicitGpuDependenciesCountOffset() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getImplicitGpuDependenciesCountOffset(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +uint64_t TagNode::getContextStartValue(uint32_t packetIndex) const { + if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) { + return tagForCpuAccess->getContextStartValue(packetIndex); + } else { + UNUSED_VARIABLE(packetIndex); + UNRECOVERABLE_IF(true); + } +} + +template +uint64_t TagNode::getGlobalStartValue(uint32_t packetIndex) const { + if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) { + return tagForCpuAccess->getGlobalStartValue(packetIndex); + } else { + UNUSED_VARIABLE(packetIndex); + UNRECOVERABLE_IF(true); + } +} + +template +uint64_t TagNode::getContextEndValue(uint32_t packetIndex) const { + if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) { + return tagForCpuAccess->getContextEndValue(packetIndex); + } else { + UNUSED_VARIABLE(packetIndex); + UNRECOVERABLE_IF(true); + } +} + +template +uint64_t TagNode::getGlobalEndValue(uint32_t packetIndex) const { + if constexpr (TagType::getTagNodeType() != TagNodeType::HwPerfCounter) { + return tagForCpuAccess->getGlobalEndValue(packetIndex); + } else { + UNUSED_VARIABLE(packetIndex); + UNRECOVERABLE_IF(true); + } +} + +template +uint64_t &TagNode::getContextCompleteRef() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::HwTimeStamps) { + return tagForCpuAccess->ContextCompleteTS; + } else { + UNRECOVERABLE_IF(true); + } +} + +template +uint64_t &TagNode::getGlobalEndRef() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::HwTimeStamps) { + return tagForCpuAccess->GlobalEndTS; + } else { + UNRECOVERABLE_IF(true); + } +} + +template +void TagNode::setPacketsUsed(uint32_t used) { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->setPacketsUsed(used); + } else { + UNUSED_VARIABLE(used); + UNRECOVERABLE_IF(true); + } +} + +template +uint32_t TagNode::getPacketsUsed() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getPacketsUsed(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +uint32_t TagNode::getImplicitGpuDependenciesCount() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getImplicitGpuDependenciesCount(); + } else { + return 0; + } +} + +template +size_t TagNode::getSinglePacketSize() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->getSinglePacketSize(); + } else { + UNRECOVERABLE_IF(true); + } +} + +template +void TagNode::assignDataToAllTimestamps(uint32_t packetIndex, void *source) { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->assignDataToAllTimestamps(packetIndex, source); + } else { + UNUSED_VARIABLE(packetIndex); + UNUSED_VARIABLE(source); + UNRECOVERABLE_IF(true); + } +} + +template +bool TagNode::isCompleted() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { + return tagForCpuAccess->isCompleted(); + } else { + return true; + } +} + +template +MetricsLibraryApi::QueryHandle_1_0 &TagNode::getQueryHandleRef() const { + if constexpr (TagType::getTagNodeType() == TagNodeType::HwPerfCounter) { + return tagForCpuAccess->query.handle; + } else { + UNRECOVERABLE_IF(true); + } +} + +} // namespace NEO