Fix event profiling for marker commands

Related-To: NEO-5799
Signed-off-by: Konstanty Misiak <konstanty.misiak@intel.com>
2026-01-08 22:12:59 +08:00 · 2021-06-22 13:16:27 +00:00
parent 816e95443f
commit ad19eda689
16 changed files with 389 additions and 35 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -780,12 +780,12 @@ bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *r
    return blitEnqueueImageAllowed;
 }

-bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const {
+bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const {
    if (!blockedQueue) {
        return false;
    }

-    if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) {
+    if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) {
        return true;
    }

--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -342,7 +342,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest);

    virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){};
-    bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const;
+    bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const;

    MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, bool blitEnqueue);
    void storeProperties(const cl_queue_properties *properties);
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -461,7 +461,7 @@ class CommandQueueHw : public CommandQueue {
        bool profilingRequired = (this->isProfilingEnabled() && eventsRequest.outEvent);
        bool perfCountersRequired = (this->isPerfCountersEnabled() && eventsRequest.outEvent);

-        if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) {
+        if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue, isMarkerWithProfiling)) {
            constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
            constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
            commandStream = new LinearStream();
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -812,7 +812,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(

    TimeStampData submitTimeStamp = {};
    if (isProfilingEnabled() && eventBuilder.getEvent()) {
-        this->getDevice().getOSTime()->getCpuTime(&submitTimeStamp.CPUTimeinNS);
+        this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
        eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);

        auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -160,21 +160,19 @@ cl_int Event::getEventProfilingInfo(cl_profiling_info paramName,
        return CL_PROFILING_INFO_NOT_AVAILABLE;
    }

+    uint64_t timestamp = 0u;
+
    // if paramValue is NULL, it is ignored
    switch (paramName) {
    case CL_PROFILING_COMMAND_QUEUED:
-        src = &queueTimeStamp.CPUTimeinNS;
-        if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
-            src = &queueTimeStamp.GPUTimeStamp;
-        }
+        timestamp = getTimeInNSFromTimestampData(queueTimeStamp);
+        src = &timestamp;
        srcSize = sizeof(cl_ulong);
        break;

    case CL_PROFILING_COMMAND_SUBMIT:
-        src = &submitTimeStamp.CPUTimeinNS;
-        if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
-            src = &submitTimeStamp.GPUTimeStamp;
-        }
+        timestamp = getTimeInNSFromTimestampData(submitTimeStamp);
+        src = &timestamp;
        srcSize = sizeof(cl_ulong);
        break;

@@ -249,6 +247,26 @@ cl_ulong Event::getDelta(cl_ulong startTime,
    return Delta;
 }

+uint64_t Event::getTimeInNSFromTimestampData(const TimeStampData &timestamp) const { // Picks which clock of the given sample is reported to the caller.
+    if (isCPUProfilingPath()) { // CPU profiling path: always report the host clock.
+        return timestamp.CPUTimeinNS;
+    }
+
+    if (DebugManager.flags.ReturnRawGpuTimestamps.get()) { // Debug flag: return the GPU timestamp field as stored, with no conversion.
+        return timestamp.GPUTimeStamp;
+    }
+
+    if (cmdQueue && DebugManager.flags.EnableDeviceBasedTimestamps.get()) { // Device-based mode is only taken when a queue is attached, since the device is reached through it.
+        auto &device = cmdQueue->getDevice();
+        auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily);
+        double resolution = device.getDeviceInfo().profilingTimerResolution;
+
+        return hwHelper.getGpuTimeStampInNS(timestamp.GPUTimeStamp, resolution); // GPU timestamp converted to ns using the device's profiling timer resolution.
+    }
+
+    return timestamp.CPUTimeinNS; // Fallback when none of the branches above applies: host clock in ns.
+}
+
 bool Event::calcProfilingData() {
    if (!dataCalculated && !profilingCpuPath) {
        if (timestampPacketContainer && timestampPacketContainer->peekNodes().size() > 0) {
@@ -294,24 +312,30 @@ bool Event::calcProfilingData() {
 }

 void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS) {
-
    uint64_t gpuDuration = 0;
    uint64_t cpuDuration = 0;

    uint64_t gpuCompleteDuration = 0;
    uint64_t cpuCompleteDuration = 0;

-    auto &hwHelper = HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
-    auto frequency = cmdQueue->getDevice().getDeviceInfo().profilingTimerResolution;
-    auto gpuTimeStamp = queueTimeStamp.GPUTimeStamp;
-
-    int64_t c0 = queueTimeStamp.CPUTimeinNS - hwHelper.getGpuTimeStampInNS(gpuTimeStamp, frequency);
+    auto &device = this->cmdQueue->getDevice();
+    auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily);
+    auto frequency = device.getDeviceInfo().profilingTimerResolution;
+    auto gpuQueueTimeStamp = hwHelper.getGpuTimeStampInNS(queueTimeStamp.GPUTimeStamp, frequency);

+    if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
+        startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency);
+        if (startTimeStamp < gpuQueueTimeStamp) {
+            startTimeStamp += static_cast<uint64_t>((1ULL << hwHelper.getGlobalTimeStampBits()) * frequency);
+        }
+    } else {
+        int64_t c0 = queueTimeStamp.CPUTimeinNS - gpuQueueTimeStamp;
        startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
        if (startTimeStamp < queueTimeStamp.CPUTimeinNS) {
            c0 += static_cast<uint64_t>((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency);
            startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
        }
+    }

    /* calculation based on equation
       CpuTime = GpuTime * scalar + const( == c0)
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@@ -326,6 +326,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
        }
    }

+    uint64_t getTimeInNSFromTimestampData(const TimeStampData &timestamp) const;
    bool calcProfilingData();
    MOCKABLE_VIRTUAL void calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS);
    MOCKABLE_VIRTUAL void synchronizeTaskCount() {
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -390,14 +390,14 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
        commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
        false);                                                                //memoryMigrationRequired

-    UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled() && commandQueue.getContext().getRootDeviceIndices().size() == 1);
-
    if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
    }

+    if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
        makeTimestampPacketsResident(commandStreamReceiver);
+    }

    gtpinNotifyPreFlushTask(&commandQueue);

--- a/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp
@@ -7,6 +7,7 @@

 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"

 #include "opencl/source/event/user_event.h"
 #include "opencl/test/unit_test/command_queue/command_enqueue_fixture.h"
@@ -252,3 +253,31 @@ HWTEST_F(MarkerTest, givenMarkerCallFollowingNdrangeCallInBatchedModeWhenWaitFor
    clReleaseEvent(eventFromMarker);
    clReleaseEvent(eventFromNdr);
 }
+
+struct MarkerWithProfilingTest : public MarkerTest { // MarkerTest fixture that runs with the EnableTimestampPacket debug flag set to 0.
+    void SetUp() override {
+        dbgRestore = std::make_unique<DebugManagerStateRestore>(); // Created before any flag is modified; presumably restores the flags on destruction — confirm.
+        DebugManager.flags.EnableTimestampPacket.set(0); // Set before the base fixture is set up.
+        MarkerTest::SetUp();
+    }
+
+    void TearDown() override {
+        MarkerTest::TearDown();
+        dbgRestore.reset(nullptr); // Released only after the base fixture has been torn down.
+    }
+
+    std::unique_ptr<DebugManagerStateRestore> dbgRestore;
+};
+
+struct WhiteBoxCommandQueue : public CommandQueue { // Test-only subclass used to reach a CommandQueue member from test code.
+    using CommandQueue::isBlockedCommandStreamRequired; // Re-declared here, so it has public access in this struct.
+};
+
+HWTEST_F(MarkerWithProfilingTest, givenMarkerWithProfilingAndBlockedEnqueueThenBlockedCommandStreamIsRequired) { // A blocked marker with profiling must request a blocked command stream.
+    auto cmdQueueWB = static_cast<WhiteBoxCommandQueue *>(pCmdQ); // Cast only to call isBlockedCommandStreamRequired.
+    EventsRequest eventsRequest(0, nullptr, nullptr); // Empty request: zero count and null pointers.
+
+    bool ret = cmdQueueWB->isBlockedCommandStreamRequired(CL_COMMAND_MARKER, eventsRequest, true, true); // Last two arguments: blockedQueue = true, isMarkerWithProfiling = true.
+
+    EXPECT_TRUE(ret);
+}
--- a/opencl/test/unit_test/device/device_timers_tests.cpp
+++ b/opencl/test/unit_test/device/device_timers_tests.cpp
@@ -5,6 +5,8 @@
 *
 */

+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_ostime.h"

@@ -111,4 +113,106 @@ TEST(MockOSTime, GivenNullWhenSettingOsTimeThenResolutionIsZero) {

    delete mDev;
 }
+
+TEST(MockOSTime, givenDeviceTimestampBaseNotEnabledWhenGetDeviceAndHostTimerThenCpuTimestampIsReturned) { // EnableDeviceBasedTimestamps is not set here: device and host values must both be the CPU time.
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // Clock mock that reports fixed CPU and GPU values.
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_EQ(deviceTS, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS); // Device value is the mock's CPU constant, not its GPU constant.
+    EXPECT_EQ(deviceTS, hostTS);
+}
+
+TEST(MockOSTime, givenDeviceTimestampBaseEnabledWhenGetDeviceAndHostTimerThenGpuTimestampIsReturned) { // With EnableDeviceBasedTimestamps on, the device value must come from the GPU timestamp.
+    DebugManagerStateRestore dbgRestorer;
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // Clock mock that reports fixed CPU and GPU values.
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_EQ(deviceTS, MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP); // Unscaled GPU constant; presumably because the mock's dynamic resolution is 1.0 — confirm.
+    EXPECT_NE(deviceTS, hostTS); // The mock's CPU and GPU constants differ, so the two outputs must differ.
+}
+
+class FailingMockOSTime : public OSTime { // OSTime test double whose CPU time query always reports failure.
+  public:
+    FailingMockOSTime() {
+        this->deviceTime = std::make_unique<MockDeviceTime>();
+    }
+
+    bool getCpuTime(uint64_t *timeStamp) override { // Always fails and leaves *timeStamp untouched.
+        return false;
+    }
+
+    double getHostTimerResolution() const override {
+        return 0;
+    }
+
+    uint64_t getCpuRawTimestamp() override {
+        return 0;
+    }
+};
+
+TEST(MockOSTime, givenFailingOSTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) { // A failing CPU time query must make getDeviceAndHostTimer return false.
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new FailingMockOSTime());
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    bool retVal = mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_FALSE(retVal);
+    EXPECT_EQ(deviceTS, 0u); // Both outputs are expected to keep their initial zero values.
+    EXPECT_EQ(hostTS, 0u);
+}
+
+class FailingMockDeviceTime : public DeviceTime { // DeviceTime test double whose combined CPU/GPU query always reports failure.
+  public:
+    bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override { // Always fails and leaves *pGpuCpuTime untouched.
+        return false;
+    }
+
+    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
+        return 1.0;
+    }
+
+    uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
+        return static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); // 1e9 divided by the timer resolution; presumably ticks per second — confirm units.
+    }
+};
+
+class MockOSTimeWithFailingDeviceTime : public OSTime { // OSTime test double: CPU time query succeeds, device time is a FailingMockDeviceTime.
+  public:
+    MockOSTimeWithFailingDeviceTime() {
+        this->deviceTime = std::make_unique<FailingMockDeviceTime>();
+    }
+
+    bool getCpuTime(uint64_t *timeStamp) override { // Reports success without writing *timeStamp.
+        return true;
+    }
+
+    double getHostTimerResolution() const override {
+        return 0;
+    }
+
+    uint64_t getCpuRawTimestamp() override {
+        return 0;
+    }
+};
+
+TEST(MockOSTime, givenFailingDeviceTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) { // A failing device time query must make getDeviceAndHostTimer return false.
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new MockOSTimeWithFailingDeviceTime());
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    bool retVal = mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_FALSE(retVal);
+    EXPECT_EQ(deviceTS, 0u); // Only the device output is checked; hostTS is not asserted in this test.
+}
+
 } // namespace ULT
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@@ -26,6 +26,7 @@
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_mdi.h"
 #include "opencl/test/unit_test/mocks/mock_memory_manager.h"
+#include "opencl/test/unit_test/mocks/mock_ostime.h"
 #include "opencl/test/unit_test/mocks/mock_platform.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
 #include "opencl/test/unit_test/os_interface/mock_performance_counters.h"
@@ -700,6 +701,90 @@ TEST_F(InternalsEventTest, GivenProfilingWhenUserEventCreatedThenProfilingNotSet
    EXPECT_FALSE(event.get()->isProfilingEnabled());
 }

+TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenGetEventProfilingInfoThenCpuTimestampIsReturned) { // EnableDeviceBasedTimestamps is not set here: SUBMIT must report the CPU time.
+    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // Clock mock that reports fixed CPU and GPU values.
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_MARKER, 0, 0);
+
+    event.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(cmdQ)));
+
+    event.submitCommand(false); // Presumably records the submit timestamp from the mocked clock — confirm.
+    uint64_t submitTime = 0ULL;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, 0);
+
+    EXPECT_EQ(submitTime, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS);
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenGetEventProfilingInfoThenGpuTimestampIsReturned) { // With EnableDeviceBasedTimestamps on, SUBMIT must report the GPU timestamp scaled by the timer resolution.
+    DebugManagerStateRestore dbgRestorer;
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // Clock mock that reports fixed CPU and GPU values.
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_MARKER, 0, 0);
+
+    event.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(cmdQ)));
+
+    event.submitCommand(false); // Presumably records the submit timestamp from the mocked clock — confirm.
+    uint64_t submitTime = 0ULL;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, 0);
+
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    EXPECT_EQ(submitTime, static_cast<uint64_t>(MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP * resolution)); // Reference value: GPU constant times the device's profiling timer resolution.
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenCalculateStartTimestampThenCorrectTimeIsReturned) { // With EnableDeviceBasedTimestamps on, START must be the global start timestamp scaled by the timer resolution.
+    DebugManagerStateRestore dbgRestorer;
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
+
+    HwTimeStamps timestamp{};
+    timestamp.GlobalStartTS = 2; // Start timestamp is larger than the queue timestamp set below.
+    event.queueTimeStamp.GPUTimeStamp = 1;
+    TagNode<HwTimeStamps> timestampNode{};
+    timestampNode.tagForCpuAccess = &timestamp;
+    event.timeStampNode = &timestampNode; // Stack-allocated node attached for the duration of the query.
+
+    uint64_t start = 0u; // Initialized so the assertion never reads an indeterminate value if the query writes nothing.
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
+
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    EXPECT_EQ(start, static_cast<uint64_t>(timestamp.GlobalStartTS * resolution)); // Reference value has no extra offset added.
+
+    event.timeStampNode = nullptr; // Detach the stack node before the event is destroyed; presumably so the event does not try to release it — confirm.
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWhenCalculateStartTimestampThenCorrectTimeIsReturned) { // When the start timestamp is below the queue timestamp, START must include one full counter period.
+    DebugManagerStateRestore dbgRestorer;
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
+
+    HwTimeStamps timestamp{};
+    timestamp.GlobalStartTS = 1; // Start timestamp is smaller than the queue timestamp set below.
+    event.queueTimeStamp.GPUTimeStamp = 2;
+    TagNode<HwTimeStamps> timestampNode{};
+    timestampNode.tagForCpuAccess = &timestamp;
+    event.timeStampNode = &timestampNode; // Stack-allocated node attached for the duration of the query.
+
+    uint64_t start = 0u;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
+
+    auto &hwHelper = HwHelper::get(pClDevice->getHardwareInfo().platform.eRenderCoreFamily);
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto refStartTime = static_cast<uint64_t>(timestamp.GlobalStartTS * resolution + (1ULL << hwHelper.getGlobalTimeStampBits()) * resolution); // Scaled start plus 2^(global timestamp bits) ticks, also scaled.
+    EXPECT_EQ(start, refStartTime);
+
+    event.timeStampNode = nullptr; // Detach the stack node before the event is destroyed; presumably so the event does not try to release it — confirm.
+}
+
 TEST_F(InternalsEventTest, GivenProfilingWHENMapOperationTHENTimesSet) {
    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue *pCmdQ = new MockCommandQueue(mockContext, pClDevice, props, false);
--- a/opencl/test/unit_test/mocks/mock_ostime.h
+++ b/opencl/test/unit_test/mocks/mock_ostime.h
@@ -47,4 +47,44 @@ class MockOSTime : public OSTime {
        return std::unique_ptr<OSTime>(new MockOSTime());
    }
 };
+
+class MockDeviceTimeWithConstTimestamp : public DeviceTime { // DeviceTime test double that always reports the same CPU/GPU pair.
+  public:
+    static constexpr uint64_t CPU_TIME_IN_NS = 1u;
+    static constexpr uint64_t GPU_TIMESTAMP = 2u; // Different from CPU_TIME_IN_NS, so a caller can tell which field was used.
+
+    bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override { // Fills both fields with the constants above and reports success; osTime is unused.
+        pGpuCpuTime->GPUTimeStamp = GPU_TIMESTAMP;
+        pGpuCpuTime->CPUTimeinNS = CPU_TIME_IN_NS;
+        return true;
+    }
+
+    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
+        return 1.0;
+    }
+
+    uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
+        return static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); // 1e9 divided by the timer resolution; presumably ticks per second — confirm units.
+    }
+};
+
+class MockOSTimeWithConstTimestamp : public OSTime { // OSTime test double backed by MockDeviceTimeWithConstTimestamp.
+  public:
+    MockOSTimeWithConstTimestamp() {
+        this->deviceTime = std::make_unique<MockDeviceTimeWithConstTimestamp>();
+    }
+
+    bool getCpuTime(uint64_t *timeStamp) override { // Reports the same CPU constant that the device-time mock uses.
+        *timeStamp = MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS;
+        return true;
+    }
+
+    double getHostTimerResolution() const override {
+        return 0;
+    }
+
+    uint64_t getCpuRawTimestamp() override {
+        return 0;
+    }
+};
 } // namespace NEO
--- a/opencl/test/unit_test/profiling/profiling_tests.cpp
+++ b/opencl/test/unit_test/profiling/profiling_tests.cpp
@@ -181,7 +181,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWa
    clReleaseEvent(event);
 }

-HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampDoesntHaveGPUTime) {
+HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampHasGPUTime) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

@@ -203,8 +203,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNo
    auto mockEvent = static_cast<MockEvent<Event> *>(event);
    EXPECT_NE(0u, mockEvent->queueTimeStamp.GPUTimeStamp);
    EXPECT_NE(0u, mockEvent->queueTimeStamp.CPUTimeinNS);
+    EXPECT_LT(mockEvent->queueTimeStamp.GPUTimeStamp, mockEvent->submitTimeStamp.GPUTimeStamp);
    EXPECT_LT(mockEvent->queueTimeStamp.CPUTimeinNS, mockEvent->submitTimeStamp.CPUTimeinNS);
-    EXPECT_EQ(0u, mockEvent->submitTimeStamp.GPUTimeStamp);

    clReleaseEvent(event);
 }
@@ -455,6 +455,71 @@ HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath)
    eventObj->release();
 }

+HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenSetGpuPath) { // A marker blocked on a user event must not use the CPU profiling path and must report QUEUED < SUBMIT.
+    cl_event event = nullptr;
+    cl_event userEvent = new UserEvent();
+    pCmdQ->enqueueMarkerWithWaitList(1, &userEvent, &event); // The user event has not been completed yet at this point.
+
+    auto eventObj = static_cast<Event *>(event);
+    EXPECT_FALSE(eventObj->isCPUProfilingPath());
+
+    auto userEventObj = static_cast<UserEvent *>(userEvent);
+
+    pCmdQ->flush();
+    userEventObj->setStatus(CL_COMPLETE); // Completing the user event; presumably this unblocks the marker — confirm.
+    Event::waitForEvents(1, &event);
+
+    uint64_t queued = 0u, submit = 0u;
+    cl_int retVal;
+
+    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    EXPECT_LT(0u, queued); // QUEUED must be non-zero.
+    EXPECT_LT(queued, submit); // SUBMIT must be strictly later than QUEUED.
+
+    eventObj->release();
+    userEventObj->release();
+}
+
+HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenPipeControlsArePresentInCS) { // A marker blocked on a user event must leave PIPE_CONTROL commands in the parsed command stream.
+    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
+
+    cl_event event = nullptr;
+    cl_event userEvent = new UserEvent();
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueMarkerWithWaitList(1, &userEvent, &event); // The user event has not been completed yet at this point.
+
+    auto eventObj = static_cast<Event *>(event);
+    EXPECT_FALSE(eventObj->isCPUProfilingPath());
+
+    auto userEventObj = static_cast<UserEvent *>(userEvent);
+
+    pCmdQ->flush();
+    userEventObj->setStatus(CL_COMPLETE); // Completing the user event; presumably this unblocks the marker — confirm.
+    Event::waitForEvents(1, &event);
+
+    parseCommands<FamilyType>(*pCmdQ);
+
+    // Check PIPE_CONTROLs
+    auto itorFirstPC = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
+    ASSERT_NE(cmdList.end(), itorFirstPC);
+    auto pFirstPC = genCmdCast<PIPE_CONTROL *>(*itorFirstPC);
+    ASSERT_NE(nullptr, pFirstPC);
+
+    auto itorSecondPC = find<PIPE_CONTROL *>(itorFirstPC, cmdList.end()); // NOTE(review): the search starts AT itorFirstPC, so it may return the same command again — confirm whether the next position was intended.
+    ASSERT_NE(cmdList.end(), itorSecondPC);
+    auto pSecondPC = genCmdCast<PIPE_CONTROL *>(*itorSecondPC);
+    ASSERT_NE(nullptr, pSecondPC);
+
+    EXPECT_TRUE(static_cast<MockEvent<Event> *>(event)->calcProfilingData());
+
+    eventObj->release();
+    userEventObj->release();
+    pCmdQ->isQueueBlocked(); // NOTE(review): return value is discarded; presumably called for a side effect — confirm.
+}
+
 template <typename TagType>
 struct MockTagNode : public TagNode<TagType> {
  public:
--- a/opencl/test/unit_test/test_files/igdrcl.config
+++ b/opencl/test/unit_test/test_files/igdrcl.config
@@ -176,6 +176,7 @@ EnableTimestampPacket = -1
 AllocateSharedAllocationsWithCpuAndGpuStorage = -1
 UseMaxSimdSizeToDeduceMaxWorkgroupSize = 0
 ReturnRawGpuTimestamps = 0
+EnableDeviceBasedTimestamps = 0
 ForcePerDssBackedBufferProgramming = 0
 MaxHwThreadsPercent = 0
 MinHwThreadsUnoccupied = 0
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UsmInitialPlacement, -1, "-1: default, 0: optimi
 DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.")
 DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
 DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
+DECLARE_DEBUG_VARIABLE(bool, EnableDeviceBasedTimestamps, false, "Driver returns timestamps in nanoseconds based on device timer.")
 DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")
 DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)")
 DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.")
--- a/shared/source/device/device.cpp
+++ b/shared/source/device/device.cpp
@@ -466,14 +466,18 @@ EngineControl &Device::getEngine(uint32_t index) {
 }

 bool Device::getDeviceAndHostTimer(uint64_t *deviceTimestamp, uint64_t *hostTimestamp) const {
-    TimeStampData queueTimeStamp;
-    bool retVal = getOSTime()->getCpuGpuTime(&queueTimeStamp);
+    bool retVal = getOSTime()->getCpuTime(hostTimestamp); // Host time is queried first; on failure the device query below is skipped.
    if (retVal) {
-        uint64_t resolution = (uint64_t)getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
-        *deviceTimestamp = queueTimeStamp.GPUTimeStamp * resolution;
+        TimeStampData timeStamp;
+        retVal = getOSTime()->getCpuGpuTime(&timeStamp); // Queried regardless of the flag below, so a failing device query is reported in both modes.
+        if (retVal) {
+            if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
+                auto resolution = getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
+                *deviceTimestamp = static_cast<uint64_t>(timeStamp.GPUTimeStamp * resolution); // The product is computed in the resolution's type and then converted to uint64_t.
+            } else
+                *deviceTimestamp = *hostTimestamp; // Flag off: the device output repeats the host time obtained above.
+        }
    }
-
-    retVal = getOSTime()->getCpuTime(hostTimestamp);
    return retVal;
 }

--- a/shared/source/os_interface/os_time.h
+++ b/shared/source/os_interface/os_time.h
@@ -16,7 +16,7 @@ class OSInterface;
 struct HardwareInfo;

 struct TimeStampData {
-    uint64_t GPUTimeStamp; // GPU time in ns
+    uint64_t GPUTimeStamp; // GPU time in counter ticks; scale by the device timer resolution to get ns
    uint64_t CPUTimeinNS;  // CPU time in ns
 };