diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 0de76826fa..a1b48c2ed6 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -780,12 +780,12 @@ bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *r return blitEnqueueImageAllowed; } -bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const { +bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const { if (!blockedQueue) { return false; } - if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) { + if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) { return true; } diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 23cc181efe..d64336ecf6 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -342,7 +342,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest); virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){}; - bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const; + bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const; MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, bool blitEnqueue); void storeProperties(const cl_queue_properties *properties); diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 58dc8c42aa..347d6420cd 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -461,7 +461,7 @@ class CommandQueueHw : public CommandQueue { bool profilingRequired = (this->isProfilingEnabled() && eventsRequest.outEvent); bool perfCountersRequired = (this->isPerfCountersEnabled() && eventsRequest.outEvent); - if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) { + if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue, isMarkerWithProfiling)) { constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize; constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize; commandStream = new LinearStream(); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index e1441aa5fb..efa910773d 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -812,7 +812,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( TimeStampData submitTimeStamp = {}; if (isProfilingEnabled() && eventBuilder.getEvent()) { - this->getDevice().getOSTime()->getCpuTime(&submitTimeStamp.CPUTimeinNS); + this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp); eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp); auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode(); diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index aee3725334..6026860eeb 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -160,21 +160,19 @@ cl_int Event::getEventProfilingInfo(cl_profiling_info paramName, return CL_PROFILING_INFO_NOT_AVAILABLE; } + uint64_t timestamp = 0u; + // if paramValue is NULL, it is ignored switch (paramName) { case CL_PROFILING_COMMAND_QUEUED: - src = &queueTimeStamp.CPUTimeinNS; - if (DebugManager.flags.ReturnRawGpuTimestamps.get()) { - src = &queueTimeStamp.GPUTimeStamp; - } + timestamp = getTimeInNSFromTimestampData(queueTimeStamp); + src = ×tamp; srcSize = sizeof(cl_ulong); break; case CL_PROFILING_COMMAND_SUBMIT: - src = &submitTimeStamp.CPUTimeinNS; - if (DebugManager.flags.ReturnRawGpuTimestamps.get()) { - src = &submitTimeStamp.GPUTimeStamp; - } + timestamp = getTimeInNSFromTimestampData(submitTimeStamp); + src = ×tamp; srcSize = sizeof(cl_ulong); break; @@ -249,6 +247,26 @@ cl_ulong Event::getDelta(cl_ulong startTime, return Delta; } +uint64_t Event::getTimeInNSFromTimestampData(const TimeStampData ×tamp) const { + if (isCPUProfilingPath()) { + return timestamp.CPUTimeinNS; + } + + if (DebugManager.flags.ReturnRawGpuTimestamps.get()) { + return timestamp.GPUTimeStamp; + } + + if (cmdQueue && DebugManager.flags.EnableDeviceBasedTimestamps.get()) { + auto &device = cmdQueue->getDevice(); + auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily); + double resolution = device.getDeviceInfo().profilingTimerResolution; + + return hwHelper.getGpuTimeStampInNS(timestamp.GPUTimeStamp, resolution); + } + + return timestamp.CPUTimeinNS; +} + bool Event::calcProfilingData() { if (!dataCalculated && !profilingCpuPath) { if (timestampPacketContainer && timestampPacketContainer->peekNodes().size() > 0) { @@ -294,23 +312,29 @@ bool Event::calcProfilingData() { } void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS) { - uint64_t gpuDuration = 0; uint64_t cpuDuration = 0; uint64_t gpuCompleteDuration = 0; uint64_t cpuCompleteDuration = 0; - auto &hwHelper = HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily); - auto frequency = cmdQueue->getDevice().getDeviceInfo().profilingTimerResolution; - auto gpuTimeStamp = queueTimeStamp.GPUTimeStamp; + auto &device = this->cmdQueue->getDevice(); + auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily); + auto frequency = device.getDeviceInfo().profilingTimerResolution; + auto gpuQueueTimeStamp = hwHelper.getGpuTimeStampInNS(queueTimeStamp.GPUTimeStamp, frequency); - int64_t c0 = queueTimeStamp.CPUTimeinNS - hwHelper.getGpuTimeStampInNS(gpuTimeStamp, frequency); - - startTimeStamp = static_cast(globalStartTS * frequency) + c0; - if (startTimeStamp < queueTimeStamp.CPUTimeinNS) { - c0 += static_cast((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency); + if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) { + startTimeStamp = static_cast(globalStartTS * frequency); + if (startTimeStamp < gpuQueueTimeStamp) { + startTimeStamp += static_cast((1ULL << hwHelper.getGlobalTimeStampBits()) * frequency); + } + } else { + int64_t c0 = queueTimeStamp.CPUTimeinNS - gpuQueueTimeStamp; startTimeStamp = static_cast(globalStartTS * frequency) + c0; + if (startTimeStamp < queueTimeStamp.CPUTimeinNS) { + c0 += static_cast((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency); + startTimeStamp = static_cast(globalStartTS * frequency) + c0; + } } /* calculation based on equation diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 06d47c3f67..eb17147f37 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -326,6 +326,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { } } + uint64_t getTimeInNSFromTimestampData(const TimeStampData ×tamp) const; bool calcProfilingData(); MOCKABLE_VIRTUAL void calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS); MOCKABLE_VIRTUAL void synchronizeTaskCount() { diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 9027425501..8c8b891e81 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -390,14 +390,14 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext false); //memoryMigrationRequired - UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled() && commandQueue.getContext().getRootDeviceIndices().size() == 1); - if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); } - eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); - makeTimestampPacketsResident(commandStreamReceiver); + if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) { + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); + makeTimestampPacketsResident(commandStreamReceiver); + } gtpinNotifyPreFlushTask(&commandQueue); diff --git a/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp index e162a8d259..0b7de22ed6 100644 --- a/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp @@ -7,6 +7,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" #include "opencl/source/event/user_event.h" #include "opencl/test/unit_test/command_queue/command_enqueue_fixture.h" @@ -252,3 +253,31 @@ HWTEST_F(MarkerTest, givenMarkerCallFollowingNdrangeCallInBatchedModeWhenWaitFor clReleaseEvent(eventFromMarker); clReleaseEvent(eventFromNdr); } + +struct MarkerWithProfilingTest : public MarkerTest { + void SetUp() override { + dbgRestore = std::make_unique(); + DebugManager.flags.EnableTimestampPacket.set(0); + MarkerTest::SetUp(); + } + + void TearDown() override { + MarkerTest::TearDown(); + dbgRestore.reset(nullptr); + } + + std::unique_ptr dbgRestore; +}; + +struct WhiteBoxCommandQueue : public CommandQueue { + using CommandQueue::isBlockedCommandStreamRequired; +}; + +HWTEST_F(MarkerWithProfilingTest, givenMarkerWithProfilingAndBlockedEnqueueThenBlockedCommandStreamIsRequired) { + auto cmdQueueWB = static_cast(pCmdQ); + EventsRequest eventsRequest(0, nullptr, nullptr); + + bool ret = cmdQueueWB->isBlockedCommandStreamRequired(CL_COMMAND_MARKER, eventsRequest, true, true); + + EXPECT_TRUE(ret); +} diff --git a/opencl/test/unit_test/device/device_timers_tests.cpp b/opencl/test/unit_test/device/device_timers_tests.cpp index fab794fa19..3c2ca3db9e 100644 --- a/opencl/test/unit_test/device/device_timers_tests.cpp +++ b/opencl/test/unit_test/device/device_timers_tests.cpp @@ -5,6 +5,8 @@ * */ +#include "shared/test/common/helpers/debug_manager_state_restore.h" + #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" #include "opencl/test/unit_test/mocks/mock_ostime.h" @@ -111,4 +113,106 @@ TEST(MockOSTime, GivenNullWhenSettingOsTimeThenResolutionIsZero) { delete mDev; } + +TEST(MockOSTime, givenDeviceTimestampBaseNotEnabledWhenGetDeviceAndHostTimerThenCpuTimestampIsReturned) { + auto mockDevice = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(nullptr)); + mockDevice->setOSTime(new MockOSTimeWithConstTimestamp()); + + uint64_t deviceTS = 0u, hostTS = 0u; + mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS); + + EXPECT_EQ(deviceTS, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS); + EXPECT_EQ(deviceTS, hostTS); +} + +TEST(MockOSTime, givenDeviceTimestampBaseEnabledWhenGetDeviceAndHostTimerThenGpuTimestampIsReturned) { + DebugManagerStateRestore dbgRestorer; + DebugManager.flags.EnableDeviceBasedTimestamps.set(true); + + auto mockDevice = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(nullptr)); + mockDevice->setOSTime(new MockOSTimeWithConstTimestamp()); + + uint64_t deviceTS = 0u, hostTS = 0u; + mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS); + + EXPECT_EQ(deviceTS, MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP); + EXPECT_NE(deviceTS, hostTS); +} + +class FailingMockOSTime : public OSTime { + public: + FailingMockOSTime() { + this->deviceTime = std::make_unique(); + } + + bool getCpuTime(uint64_t *timeStamp) override { + return false; + } + + double getHostTimerResolution() const override { + return 0; + } + + uint64_t getCpuRawTimestamp() override { + return 0; + } +}; + +TEST(MockOSTime, givenFailingOSTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) { + auto mockDevice = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(nullptr)); + mockDevice->setOSTime(new FailingMockOSTime()); + + uint64_t deviceTS = 0u, hostTS = 0u; + bool retVal = mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS); + + EXPECT_FALSE(retVal); + EXPECT_EQ(deviceTS, 0u); + EXPECT_EQ(hostTS, 0u); +} + +class FailingMockDeviceTime : public DeviceTime { + public: + bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override { + return false; + } + + double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { + return 1.0; + } + + uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override { + return static_cast(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); + } +}; + +class MockOSTimeWithFailingDeviceTime : public OSTime { + public: + MockOSTimeWithFailingDeviceTime() { + this->deviceTime = std::make_unique(); + } + + bool getCpuTime(uint64_t *timeStamp) override { + return true; + } + + double getHostTimerResolution() const override { + return 0; + } + + uint64_t getCpuRawTimestamp() override { + return 0; + } +}; + +TEST(MockOSTime, givenFailingDeviceTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) { + auto mockDevice = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(nullptr)); + mockDevice->setOSTime(new MockOSTimeWithFailingDeviceTime()); + + uint64_t deviceTS = 0u, hostTS = 0u; + bool retVal = mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS); + + EXPECT_FALSE(retVal); + EXPECT_EQ(deviceTS, 0u); +} + } // namespace ULT diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 9d3e60877b..dadf427d06 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -26,6 +26,7 @@ #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_mdi.h" #include "opencl/test/unit_test/mocks/mock_memory_manager.h" +#include "opencl/test/unit_test/mocks/mock_ostime.h" #include "opencl/test/unit_test/mocks/mock_platform.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/os_interface/mock_performance_counters.h" @@ -700,6 +701,90 @@ TEST_F(InternalsEventTest, GivenProfilingWhenUserEventCreatedThenProfilingNotSet EXPECT_FALSE(event.get()->isProfilingEnabled()); } +TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenGetEventProfilingInfoThenCpuTimestampIsReturned) { + pClDevice->setOSTime(new MockOSTimeWithConstTimestamp()); + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + MockCommandQueue cmdQ(mockContext, pClDevice, props, false); + MockEvent event(&cmdQ, CL_COMMAND_MARKER, 0, 0); + + event.setCommand(std::unique_ptr(new CommandWithoutKernel(cmdQ))); + + event.submitCommand(false); + uint64_t submitTime = 0ULL; + event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, 0); + + EXPECT_EQ(submitTime, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS); +} + +TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenGetEventProfilingInfoThenGpuTimestampIsReturned) { + DebugManagerStateRestore dbgRestorer; + DebugManager.flags.EnableDeviceBasedTimestamps.set(true); + + pClDevice->setOSTime(new MockOSTimeWithConstTimestamp()); + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + MockCommandQueue cmdQ(mockContext, pClDevice, props, false); + MockEvent event(&cmdQ, CL_COMMAND_MARKER, 0, 0); + + event.setCommand(std::unique_ptr(new CommandWithoutKernel(cmdQ))); + + event.submitCommand(false); + uint64_t submitTime = 0ULL; + event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, 0); + + auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution; + EXPECT_EQ(submitTime, static_cast(MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP * resolution)); +} + +TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenCalculateStartTimestampThenCorrectTimeIsReturned) { + DebugManagerStateRestore dbgRestorer; + DebugManager.flags.EnableDeviceBasedTimestamps.set(true); + + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + MockCommandQueue cmdQ(mockContext, pClDevice, props, false); + MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); + + HwTimeStamps timestamp{}; + timestamp.GlobalStartTS = 2; + event.queueTimeStamp.GPUTimeStamp = 1; + TagNode timestampNode{}; + timestampNode.tagForCpuAccess = ×tamp; + event.timeStampNode = ×tampNode; + + uint64_t start; + event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr); + + auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution; + EXPECT_EQ(start, static_cast(timestamp.GlobalStartTS * resolution)); + + event.timeStampNode = nullptr; +} + +TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWhenCalculateStartTimestampThenCorrectTimeIsReturned) { + DebugManagerStateRestore dbgRestorer; + DebugManager.flags.EnableDeviceBasedTimestamps.set(true); + + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + MockCommandQueue cmdQ(mockContext, pClDevice, props, false); + MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); + + HwTimeStamps timestamp{}; + timestamp.GlobalStartTS = 1; + event.queueTimeStamp.GPUTimeStamp = 2; + TagNode timestampNode{}; + timestampNode.tagForCpuAccess = ×tamp; + event.timeStampNode = ×tampNode; + + uint64_t start = 0u; + event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr); + + auto &hwHelper = HwHelper::get(pClDevice->getHardwareInfo().platform.eRenderCoreFamily); + auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution; + auto refStartTime = static_cast(timestamp.GlobalStartTS * resolution + (1ULL << hwHelper.getGlobalTimeStampBits()) * resolution); + EXPECT_EQ(start, refStartTime); + + event.timeStampNode = nullptr; +} + TEST_F(InternalsEventTest, GivenProfilingWHENMapOperationTHENTimesSet) { const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; MockCommandQueue *pCmdQ = new MockCommandQueue(mockContext, pClDevice, props, false); diff --git a/opencl/test/unit_test/mocks/mock_ostime.h b/opencl/test/unit_test/mocks/mock_ostime.h index bf30b894c4..2a6197b4d3 100644 --- a/opencl/test/unit_test/mocks/mock_ostime.h +++ b/opencl/test/unit_test/mocks/mock_ostime.h @@ -47,4 +47,44 @@ class MockOSTime : public OSTime { return std::unique_ptr(new MockOSTime()); } }; + +class MockDeviceTimeWithConstTimestamp : public DeviceTime { + public: + static constexpr uint64_t CPU_TIME_IN_NS = 1u; + static constexpr uint64_t GPU_TIMESTAMP = 2u; + + bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override { + pGpuCpuTime->GPUTimeStamp = GPU_TIMESTAMP; + pGpuCpuTime->CPUTimeinNS = CPU_TIME_IN_NS; + return true; + } + + double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { + return 1.0; + } + + uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override { + return static_cast(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); + } +}; + +class MockOSTimeWithConstTimestamp : public OSTime { + public: + MockOSTimeWithConstTimestamp() { + this->deviceTime = std::make_unique(); + } + + bool getCpuTime(uint64_t *timeStamp) override { + *timeStamp = MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS; + return true; + } + + double getHostTimerResolution() const override { + return 0; + } + + uint64_t getCpuRawTimestamp() override { + return 0; + } +}; } // namespace NEO diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index ef578203e9..d03266ab36 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -181,7 +181,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWa clReleaseEvent(event); } -HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampDoesntHaveGPUTime) { +HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampHasGPUTime) { MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); @@ -203,8 +203,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNo auto mockEvent = static_cast *>(event); EXPECT_NE(0u, mockEvent->queueTimeStamp.GPUTimeStamp); EXPECT_NE(0u, mockEvent->queueTimeStamp.CPUTimeinNS); + EXPECT_LT(mockEvent->queueTimeStamp.GPUTimeStamp, mockEvent->submitTimeStamp.GPUTimeStamp); EXPECT_LT(mockEvent->queueTimeStamp.CPUTimeinNS, mockEvent->submitTimeStamp.CPUTimeinNS); - EXPECT_EQ(0u, mockEvent->submitTimeStamp.GPUTimeStamp); clReleaseEvent(event); } @@ -455,6 +455,71 @@ HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath) eventObj->release(); } +HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenSetGpuPath) { + cl_event event = nullptr; + cl_event userEvent = new UserEvent(); + pCmdQ->enqueueMarkerWithWaitList(1, &userEvent, &event); + + auto eventObj = static_cast(event); + EXPECT_FALSE(eventObj->isCPUProfilingPath()); + + auto userEventObj = static_cast(userEvent); + + pCmdQ->flush(); + userEventObj->setStatus(CL_COMPLETE); + Event::waitForEvents(1, &event); + + uint64_t queued = 0u, submit = 0u; + cl_int retVal; + + retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_LT(0u, queued); + EXPECT_LT(queued, submit); + + eventObj->release(); + userEventObj->release(); +} + +HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenPipeControlsArePresentInCS) { + typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; + + cl_event event = nullptr; + cl_event userEvent = new UserEvent(); + static_cast *>(pCmdQ)->enqueueMarkerWithWaitList(1, &userEvent, &event); + + auto eventObj = static_cast(event); + EXPECT_FALSE(eventObj->isCPUProfilingPath()); + + auto userEventObj = static_cast(userEvent); + + pCmdQ->flush(); + userEventObj->setStatus(CL_COMPLETE); + Event::waitForEvents(1, &event); + + parseCommands(*pCmdQ); + + // Check PIPE_CONTROLs + auto itorFirstPC = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorFirstPC); + auto pFirstPC = genCmdCast(*itorFirstPC); + ASSERT_NE(nullptr, pFirstPC); + + auto itorSecondPC = find(itorFirstPC, cmdList.end()); + ASSERT_NE(cmdList.end(), itorSecondPC); + auto pSecondPC = genCmdCast(*itorSecondPC); + ASSERT_NE(nullptr, pSecondPC); + + EXPECT_TRUE(static_cast *>(event)->calcProfilingData()); + + eventObj->release(); + userEventObj->release(); + pCmdQ->isQueueBlocked(); +} + template struct MockTagNode : public TagNode { public: diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index b09789e7b0..ecbbaba05b 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -176,6 +176,7 @@ EnableTimestampPacket = -1 AllocateSharedAllocationsWithCpuAndGpuStorage = -1 UseMaxSimdSizeToDeduceMaxWorkgroupSize = 0 ReturnRawGpuTimestamps = 0 +EnableDeviceBasedTimestamps = 0 ForcePerDssBackedBufferProgramming = 0 MaxHwThreadsPercent = 0 MinHwThreadsUnoccupied = 0 diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 4f807ab33b..7150b905fc 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UsmInitialPlacement, -1, "-1: default, 0: optimi DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.") DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger") DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.") +DECLARE_DEBUG_VARIABLE(bool, EnableDeviceBasedTimestamps, false, "Driver returns timestamps in nanoseconds based on device timer.") DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble") DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)") DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.") diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp index 2deb4d9af6..c93ba7b207 100644 --- a/shared/source/device/device.cpp +++ b/shared/source/device/device.cpp @@ -466,14 +466,18 @@ EngineControl &Device::getEngine(uint32_t index) { } bool Device::getDeviceAndHostTimer(uint64_t *deviceTimestamp, uint64_t *hostTimestamp) const { - TimeStampData queueTimeStamp; - bool retVal = getOSTime()->getCpuGpuTime(&queueTimeStamp); + bool retVal = getOSTime()->getCpuTime(hostTimestamp); if (retVal) { - uint64_t resolution = (uint64_t)getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo()); - *deviceTimestamp = queueTimeStamp.GPUTimeStamp * resolution; + TimeStampData timeStamp; + retVal = getOSTime()->getCpuGpuTime(&timeStamp); + if (retVal) { + if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) { + auto resolution = getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo()); + *deviceTimestamp = static_cast(timeStamp.GPUTimeStamp * resolution); + } else + *deviceTimestamp = *hostTimestamp; + } } - - retVal = getOSTime()->getCpuTime(hostTimestamp); return retVal; } diff --git a/shared/source/os_interface/os_time.h b/shared/source/os_interface/os_time.h index 08be00ebf2..08f87d00b3 100644 --- a/shared/source/os_interface/os_time.h +++ b/shared/source/os_interface/os_time.h @@ -16,7 +16,7 @@ class OSInterface; struct HardwareInfo; struct TimeStampData { - uint64_t GPUTimeStamp; // GPU time in ns + uint64_t GPUTimeStamp; // GPU time in counter ticks uint64_t CPUTimeinNS; // CPU time in ns };