From 7cac94b597ed6176e7375424c16b57ba0bbc631e Mon Sep 17 00:00:00 2001 From: Piotr Zdunowski Date: Thu, 20 May 2021 15:07:00 +0000 Subject: [PATCH] Fix profiling for marker commands. Resolves: NEO-4867 Signed-off-by: Piotr Zdunowski --- .../source/command_queue/command_queue_hw.h | 8 +++ .../command_queue/command_queue_hw_base.inl | 2 +- opencl/source/command_queue/enqueue_common.h | 55 +++++++++++++++++-- opencl/source/helpers/enqueue_properties.h | 10 +++- .../enqueue_command_without_kernel_tests.cpp | 24 ++++---- .../helpers/timestamp_packet_tests.cpp | 16 ++++++ .../unit_test/profiling/profiling_tests.cpp | 50 ++++++++++++++++- 7 files changed, 146 insertions(+), 19 deletions(-) diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 6605384f49..f5ddce7418 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -408,6 +408,14 @@ class CommandQueueHw : public CommandQueue { size_t numSurfaces, LinearStream *commandStream, CsrDependencies &csrDeps); + void processDispatchForMarker(CommandQueue &commandQueue, + LinearStream *commandStream, + EventsRequest &eventsRequest, + CsrDependencies &csrDeps); + void processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue, + LinearStream *commandStream, + EventsRequest &eventsRequest, + CsrDependencies &csrDeps); BlitProperties processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo, TimestampPacketDependencies &timestampPacketDependencies, const EventsRequest &eventsRequest, diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index a80028491b..4f2b1a5f46 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -182,7 +182,7 @@ void CommandQueueHw::setupEvent(EventBuilder &eventBuilder, cl_event *ou 
getDevice().getOSTime()->getCpuGpuTime(&queueTimeStamp); eventObj->setQueueTimeStamp(&queueTimeStamp); - if (isCommandWithoutKernel(cmdType)) { + if (isCommandWithoutKernel(cmdType) && cmdType != CL_COMMAND_MARKER) { eventObj->setCPUProfilingPath(true); eventObj->setQueueTimeStamp(); } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 970955c5f5..037c0a5ffe 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -151,7 +151,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, const cl_event *eventWaitList, cl_event *event) { if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) { - enqueueHandler(surfacesForResidency, numSurfaceForResidency, blocking, multiDispatchInfo, + enqueueHandler(nullptr, 0, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event); if (event) { castToObjectOrAbort(*event)->setCmdType(commandType); @@ -171,6 +171,8 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, EventBuilder eventBuilder; setupEvent(eventBuilder, event, commandType); + bool isMarkerWithProfiling = (CL_COMMAND_MARKER == commandType) && (eventBuilder.getEvent() && eventBuilder.getEvent()->isProfilingEnabled()); + std::unique_ptr blockedCommandsData; std::unique_ptr printfHandler; TakeOwnershipWrapper> queueOwnership(*this); @@ -208,7 +210,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); size_t nodesCount = 0u; - if (isCacheFlushCommand(commandType)) { + if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) { nodesCount = 1; } else if (!multiDispatchInfo.empty()) { nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo); @@ -266,18 +268,24 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } } } + if (flushDependenciesForNonKernelCommand) { 
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer( commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices()); } + + if (isMarkerWithProfiling) { + processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps); + } + } else if (isMarkerWithProfiling) { + processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps); } CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; - const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType), - flushDependenciesForNonKernelCommand, &blitPropertiesContainer); + flushDependenciesForNonKernelCommand, isMarkerWithProfiling, &blitPropertiesContainer); if (!blockQueue) { if (parentKernel) { @@ -567,6 +575,42 @@ void CommandQueueHw::processDispatchForCacheFlush(Surface **surfaces, submitCacheFlush(surfaces, numSurfaces, commandStream, postSyncAddress); } +template +void CommandQueueHw::processDispatchForMarker(CommandQueue &commandQueue, + LinearStream *commandStream, + EventsRequest &eventsRequest, + CsrDependencies &csrDeps) { + auto event = castToObjectOrAbort(*eventsRequest.outEvent); + + TagNodeBase *hwTimeStamps = nullptr; + TagNodeBase *hwPerfCounter = nullptr; + + hwTimeStamps = event->getHwTimeStampNode(); + + HardwareInterface::dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); + HardwareInterface::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); +} + +template +void CommandQueueHw::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue, + LinearStream *commandStream, + EventsRequest &eventsRequest, + CsrDependencies &csrDeps) { + auto currentTimestampPacketNode = commandQueue.getTimestampPacketContainer()->peekNodes().at(0); + + auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(*currentTimestampPacketNode); + auto 
timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(*currentTimestampPacketNode); + + EncodeStoreMMIO::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress); + EncodeStoreMMIO::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress); + + auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*currentTimestampPacketNode); + auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(*currentTimestampPacketNode); + + EncodeStoreMMIO::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress); + EncodeStoreMMIO::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress); +} + template void CommandQueueHw::processDeviceEnqueue(DeviceQueueHw *devQueueHw, const MultiDispatchInfo &multiDispatchInfo, @@ -992,6 +1036,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp); eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp); + eventBuilder.getEvent()->setStartTimeStamp(); } if (flushGpgpuCsr) { @@ -1144,7 +1189,7 @@ void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; - const EnqueueProperties enqueueProperties(true, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); if (!blockQueue) { completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, taskLevel, csrDeps); diff --git a/opencl/source/helpers/enqueue_properties.h b/opencl/source/helpers/enqueue_properties.h index 7f41d1494e..3b11f8ffd3 100644 --- a/opencl/source/helpers/enqueue_properties.h +++ 
b/opencl/source/helpers/enqueue_properties.h @@ -18,10 +18,11 @@ struct EnqueueProperties { EnqueueWithoutSubmission, DependencyResolveOnGpu, GpuKernel, + ProfilingOnly }; EnqueueProperties() = delete; - EnqueueProperties(bool blitEnqueue, bool hasKernels, bool isCacheFlushCmd, bool flushDependenciesOnly, + EnqueueProperties(bool blitEnqueue, bool hasKernels, bool isCacheFlushCmd, bool flushDependenciesOnly, bool isMarkerWithEvent, const BlitPropertiesContainer *blitPropertiesContainer) { if (blitEnqueue) { operation = Operation::Blit; @@ -45,12 +46,17 @@ struct EnqueueProperties { return; } + if (isMarkerWithEvent) { + operation = Operation::ProfilingOnly; + return; + } + operation = Operation::EnqueueWithoutSubmission; } bool isFlushWithoutKernelRequired() const { return (operation == Operation::Blit) || (operation == Operation::ExplicitCacheFlush) || - (operation == Operation::DependencyResolveOnGpu); + (operation == Operation::DependencyResolveOnGpu) || (operation == EnqueueProperties::Operation::ProfilingOnly); } const BlitPropertiesContainer *blitPropertiesContainer = nullptr; diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index e9eca5c159..2e1fb5a941 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -50,8 +50,9 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelWhenCommandEnqueuedT Surface *surfaces[] = {surface.get()}; auto blocking = true; TimestampPacketDependencies timestampPacketDependencies; + CsrDependencies csrDeps; - EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, 
timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps); @@ -88,8 +89,9 @@ HWTEST_F(EnqueueHandlerTimestampEnabledTest, givenProflingAndTimeStampPacketsEna Surface *surfaces[] = {surface.get()}; auto blocking = true; TimestampPacketDependencies timestampPacketDependencies; + CsrDependencies csrDeps; - EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); EXPECT_EQ(ev->submitTimeStamp.CPUTimeinNS, 0u); EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u); @@ -119,8 +121,9 @@ HWTEST_F(EnqueueHandlerTimestampDisabledTest, givenProflingEnabledTimeStampPacke Surface *surfaces[] = {surface.get()}; auto blocking = true; TimestampPacketDependencies timestampPacketDependencies; + CsrDependencies csrDeps; - EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); EXPECT_EQ(ev->submitTimeStamp.CPUTimeinNS, 0u); EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u); @@ -148,7 +151,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - const EnqueueProperties enqueuePropertiesForDependencyFlush(false, false, false, true, nullptr); + const EnqueueProperties enqueuePropertiesForDependencyFlush(false, false, false, true, false, nullptr); auto blockedCommandsData = std::unique_ptr(blockedCommandsDataForDependencyFlush); Surface *surfaces[] = {nullptr}; @@ -177,7 +180,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl blitProperties.dstAllocation = reinterpret_cast(0x56789); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); - const EnqueueProperties enqueuePropertiesForBlitEnqueue(true, false, false, false, &blitPropertiesContainer); + const EnqueueProperties 
enqueuePropertiesForBlitEnqueue(true, false, false, false, false, &blitPropertiesContainer); auto blockedCommandsData = std::unique_ptr(blockedCommandsDataForBlitEnqueue); Surface *surfaces[] = {nullptr}; @@ -202,7 +205,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispa EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps); @@ -228,7 +231,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectThrot EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); bool blocking = true; mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, @@ -270,7 +273,7 @@ HWTEST_F(DispatchFlagsTests, givenBlitEnqueueWhenDispatchingCommandsWithoutKerne BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); - EnqueueProperties enqueueProperties(true, false, false, false, &blitPropertiesContainer); + EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps); @@ -311,8 +314,9 @@ HWTEST_F(DispatchFlagsTests, givenN1EnabledWhenDispatchingWithoutKernelThenAllow eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false); BlitPropertiesContainer blitPropertiesContainer; 
blitPropertiesContainer.push_back(blitProperties); - EnqueueProperties enqueueProperties(true, false, false, false, &blitPropertiesContainer); + CsrDependencies csrDeps; + EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); mockCsr->nTo1SubmissionModelEnabled = false; mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocked, enqueueProperties, timestampPacketDependencies, @@ -336,7 +340,7 @@ HWTEST_F(DispatchFlagsTests, givenMockKernelWhenSettingAdditionalKernelExecInfoT EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()})); auto blockedCommandsData = std::make_unique(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp index e892f70b33..530f515a0b 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -1905,10 +1905,26 @@ HWTEST_F(TimestampPacketTests, givenEmptyWaitlistAndNoOutputEventWhenEnqueueingM auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); cmdQ->enqueueMarkerWithWaitList(0, nullptr, nullptr); + EXPECT_EQ(0u, cmdQ->timestampPacketContainer->peekNodes().size()); EXPECT_FALSE(csr.stallingPipeControlOnNextFlushRequired); } +HWTEST_F(TimestampPacketTests, givenEmptyWaitlistAndEventWhenEnqueueingMarkerWithProfilingEnabledThenObtainNewNode) { + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = 
true; + + auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + cmdQ->setProfilingEnabled(); + + cl_event event; + cmdQ->enqueueMarkerWithWaitList(0, nullptr, &event); + + EXPECT_EQ(1u, cmdQ->timestampPacketContainer->peekNodes().size()); + + clReleaseEvent(event); +} + HWTEST_F(TimestampPacketTests, whenEnqueueingBarrierThenRequestPipeControlOnCsrFlush) { auto &csr = device->getUltCommandStreamReceiver(); csr.timestampPacketWriteEnabled = true; diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index 9da56d228b..5174704dbf 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -381,9 +381,37 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueBlockedWithProfilin pCmdQ->isQueueBlocked(); } +HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProflingWhenMarkerIsDispatchedThenPipeControlIsPresentInCS) { + typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; + + cl_event event; + + static_cast *>(pCmdQ)->enqueueMarkerWithWaitList( + 0, + nullptr, + &event); + + parseCommands(*pCmdQ); + + // Check PIPE_CONTROLs + auto itorFirstPC = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorFirstPC); + auto pFirstPC = genCmdCast(*itorFirstPC); + ASSERT_NE(nullptr, pFirstPC); + + auto itorSecondPC = find(itorFirstPC, cmdList.end()); + ASSERT_NE(cmdList.end(), itorSecondPC); + auto pSecondPC = genCmdCast(*itorSecondPC); + ASSERT_NE(nullptr, pSecondPC); + + EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); + + clReleaseEvent(event); +} + HWTEST_F(ProfilingTests, givenNonKernelEnqueueWhenNonBlockedEnqueueThenSetCpuPath) { cl_event event; - pCmdQ->enqueueMarkerWithWaitList(0, nullptr, &event); + pCmdQ->enqueueBarrierWithWaitList(0, nullptr, &event); auto eventObj = static_cast(event); EXPECT_TRUE(eventObj->isCPUProfilingPath() == CL_TRUE); pCmdQ->finish(); @@ 
-407,6 +435,26 @@ HWTEST_F(ProfilingTests, givenNonKernelEnqueueWhenNonBlockedEnqueueThenSetCpuPat eventObj->release(); } +HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath) { + cl_event event; + pCmdQ->enqueueMarkerWithWaitList(0, nullptr, &event); + auto eventObj = static_cast(event); + EXPECT_TRUE(eventObj->isCPUProfilingPath() == CL_FALSE); + pCmdQ->finish(); + + uint64_t queued, submit; + cl_int retVal; + + retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_LT(0u, queued); + EXPECT_LT(queued, submit); + eventObj->release(); +} + template struct MockTagNode : public TagNode { public: