From 44a6d70cedfb27bf9e6744fd86d8686339a8a193 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Tue, 29 Sep 2020 14:35:23 +0200 Subject: [PATCH] Dont use Packets without profiling data to calculate kernel duration Change-Id: I710348835f8884a3b244502f53ff4e4980441654 Signed-off-by: Bartosz Dunajski --- opencl/source/command_queue/enqueue_common.h | 1 + .../command_queue/hardware_interface_base.inl | 1 + opencl/source/event/event.cpp | 7 ++++- .../unit_test/profiling/profiling_tests.cpp | 28 ++++++++++++++++++- .../utilities/tag_allocator_tests.cpp | 2 ++ shared/source/utilities/tag_allocator.h | 6 ++++ 6 files changed, 43 insertions(+), 2 deletions(-) diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 9cd1ca3695..2cad17dadb 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -519,6 +519,7 @@ void CommandQueueHw::processDispatchForCacheFlush(Surface **surfaces, uint64_t postSyncAddress = 0; if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0); + timestampPacketNodeForPostSync->setProfilingCapable(false); postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync); } diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index b536394b30..b10e200183 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -143,6 +143,7 @@ void HardwareInterface::dispatchWalker( uint64_t postSyncAddress = 0; if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex); + timestampPacketNodeForPostSync->setProfilingCapable(false); postSyncAddress = 
TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync); } HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress); diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index e451af6e55..38dfa3b7ff 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -256,9 +256,11 @@ bool Event::calcProfilingData() { if (DebugManager.flags.PrintTimestampPacketContents.get()) { for (auto i = 0u; i < timestamps.size(); i++) { + std::cout << "Timestamp " << i << ", " + << "profiling capable: " << timestamps[i]->isProfilingCapable() << ", "; for (auto j = 0u; j < timestamps[i]->tagForCpuAccess->packetsUsed; j++) { const auto &packet = timestamps[i]->tagForCpuAccess->packets[j]; - std::cout << "Timestamp " << i << ", packet " << j << ": " + std::cout << "packet " << j << ": " << "global start: " << packet.globalStart << ", " << "global end: " << packet.globalEnd << ", " << "context start: " << packet.contextStart << ", " @@ -271,6 +273,9 @@ bool Event::calcProfilingData() { uint64_t globalEndTS = timestamps[0]->tagForCpuAccess->packets[0].globalEnd; for (const auto &timestamp : timestamps) { + if (!timestamp->isProfilingCapable()) { + continue; + } for (auto i = 0u; i < timestamp->tagForCpuAccess->packetsUsed; ++i) { const auto &packet = timestamp->tagForCpuAccess->packets[i]; if (globalStartTS > packet.globalStart) { diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index c3c137f7a9..dbd6f694c9 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -1123,6 +1123,30 @@ TEST_F(ProfilingTimestampPacketsTest, givenMultiOsContextCapableSetToTrueWhenCal EXPECT_EQ(350u, ev->getEndTimeStamp()); } +TEST_F(ProfilingTimestampPacketsTest, givenTimestampPacketWithoutProfilingDataWhenCalculatingThenDontUseThatPacket) { + int 
globalStart0 = 20; + int globalEnd0 = 51; + int contextStart0 = 21; + int contextEnd0 = 50; + + int globalStart1 = globalStart0 - 1; + int globalEnd1 = globalEnd0 + 1; + int contextStart1 = contextStart0 - 1; + int contextEnd1 = contextEnd0 + 1; + + addTimestampNodeMultiOsContext(&globalStart0, &globalEnd0, &contextStart0, &contextEnd0, 1); + addTimestampNodeMultiOsContext(&globalStart1, &globalEnd1, &contextStart1, &contextEnd1, 1); + auto &device = reinterpret_cast(cmdQ->getDevice()); + auto &csr = device.getUltCommandStreamReceiver(); + csr.multiOsContextCapable = true; + + ev->timestampPacketContainer->peekNodes()[1]->setProfilingCapable(false); + + ev->calcProfilingData(); + EXPECT_EQ(static_cast(globalStart0), ev->getStartTimeStamp()); + EXPECT_EQ(static_cast(globalEnd0), ev->getEndTimeStamp()); +} + TEST_F(ProfilingTimestampPacketsTest, givenPrintTimestampPacketContentsSetWhenCalcProfilingDataThenTimeStampsArePrinted) { DebugManagerStateRestore restorer; DebugManager.flags.PrintTimestampPacketContents.set(true); @@ -1148,8 +1172,10 @@ TEST_F(ProfilingTimestampPacketsTest, givenPrintTimestampPacketContentsSetWhenCa std::string output = testing::internal::GetCapturedStdout(); std::stringstream expected; + + expected << "Timestamp 0, profiling capable: " << ev->timestampPacketContainer->peekNodes()[0]->isProfilingCapable() << ", "; for (int i = 0; i < 16; i++) { - expected << "Timestamp 0, packet " << i << ": " + expected << "packet " << i << ": " << "global start: " << globalStart[i] << ", " << "global end: " << globalEnd[i] << ", " << "context start: " << contextStart[i] << ", " diff --git a/opencl/test/unit_test/utilities/tag_allocator_tests.cpp b/opencl/test/unit_test/utilities/tag_allocator_tests.cpp index 37c7c708cc..4da1ee3cea 100644 --- a/opencl/test/unit_test/utilities/tag_allocator_tests.cpp +++ b/opencl/test/unit_test/utilities/tag_allocator_tests.cpp @@ -310,10 +310,12 @@ TEST_F(TagAllocatorTest, whenNewTagIsTakenThenItIsInitialized) { 
MockTagAllocator tagAllocator(memoryManager, 1, 2, deviceBitfield); tagAllocator.getFreeTagsHead()->tagForCpuAccess->start = 3; tagAllocator.getFreeTagsHead()->tagForCpuAccess->end = 4; + tagAllocator.getFreeTagsHead()->setProfilingCapable(false); auto node = tagAllocator.getTag(); EXPECT_EQ(1u, node->tagForCpuAccess->start); EXPECT_EQ(2u, node->tagForCpuAccess->end); + EXPECT_TRUE(node->isProfilingCapable()); } TEST_F(TagAllocatorTest, givenMultipleReferencesOnTagWhenReleasingThenReturnWhenAllRefCountsAreReleased) { diff --git a/shared/source/utilities/tag_allocator.h b/shared/source/utilities/tag_allocator.h index f1471ea831..8275ad7386 100644 --- a/shared/source/utilities/tag_allocator.h +++ b/shared/source/utilities/tag_allocator.h @@ -46,11 +46,16 @@ struct TagNode : public IDNode>, NonCopyableOrMovableClass { doNotReleaseNodes = doNotRelease; } + void setProfilingCapable(bool capable) { profilingCapable = capable; } + + bool isProfilingCapable() const { return profilingCapable; } + void incImplicitCpuDependenciesCount() { implicitCpuDependenciesCount++; } void initialize() { tagForCpuAccess->initialize(); implicitCpuDependenciesCount.store(0); + setProfilingCapable(true); } uint32_t getImplicitCpuDependenciesCount() const { return implicitCpuDependenciesCount.load(); } @@ -64,6 +69,7 @@ struct TagNode : public IDNode>, NonCopyableOrMovableClass { std::atomic refCount{0}; std::atomic implicitCpuDependenciesCount{0}; bool doNotReleaseNodes = false; + bool profilingCapable = true; template friend class TagAllocator;