diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp
index 0de76826fa..a1b48c2ed6 100644
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -780,12 +780,12 @@ bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *r
     return blitEnqueueImageAllowed;
 }
 
-bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const {
+bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const {
     if (!blockedQueue) {
         return false;
     }
 
-    if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) {
+    if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) {
         return true;
     }
 
diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h
index 23cc181efe..d64336ecf6 100644
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -342,7 +342,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest);
 
     virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){};
-    bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const;
+    bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const;
 
     MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, bool blitEnqueue);
     void storeProperties(const cl_queue_properties *properties);
diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h
index 58dc8c42aa..347d6420cd 100644
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -461,7 +461,7 @@ class CommandQueueHw : public CommandQueue {
         bool profilingRequired = (this->isProfilingEnabled() && eventsRequest.outEvent);
         bool perfCountersRequired = (this->isPerfCountersEnabled() && eventsRequest.outEvent);
 
-        if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) {
+        if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue, isMarkerWithProfiling)) {
             constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
             constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
             commandStream = new LinearStream();
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index e1441aa5fb..efa910773d 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -812,7 +812,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
 
     TimeStampData submitTimeStamp = {};
     if (isProfilingEnabled() && eventBuilder.getEvent()) {
-        this->getDevice().getOSTime()->getCpuTime(&submitTimeStamp.CPUTimeinNS);
+        this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
         eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
 
         auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp
index aee3725334..6026860eeb 100644
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -160,21 +160,19 @@ cl_int Event::getEventProfilingInfo(cl_profiling_info paramName,
         return CL_PROFILING_INFO_NOT_AVAILABLE;
     }
 
+    uint64_t timestamp = 0u;
+
     // if paramValue is NULL, it is ignored
     switch (paramName) {
     case CL_PROFILING_COMMAND_QUEUED:
-        src = &queueTimeStamp.CPUTimeinNS;
-        if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
-            src = &queueTimeStamp.GPUTimeStamp;
-        }
+        timestamp = getTimeInNSFromTimestampData(queueTimeStamp);
+        src = &timestamp;
         srcSize = sizeof(cl_ulong);
         break;
 
     case CL_PROFILING_COMMAND_SUBMIT:
-        src = &submitTimeStamp.CPUTimeinNS;
-        if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
-            src = &submitTimeStamp.GPUTimeStamp;
-        }
+        timestamp = getTimeInNSFromTimestampData(submitTimeStamp);
+        src = &timestamp;
         srcSize = sizeof(cl_ulong);
         break;
 
@@ -249,6 +247,26 @@ cl_ulong Event::getDelta(cl_ulong startTime,
     return Delta;
 }
 
+uint64_t Event::getTimeInNSFromTimestampData(const TimeStampData &timestamp) const {
+    // Selects the value reported for a captured timestamp, in priority order:
+    // CPU profiling path -> raw GPU ticks (debug flag) -> GPU ticks scaled to ns (debug flag) -> CPU ns.
+    if (isCPUProfilingPath()) {
+        return timestamp.CPUTimeinNS;
+    }
+    if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
+        return timestamp.GPUTimeStamp;
+    }
+    const bool deviceBased = cmdQueue && DebugManager.flags.EnableDeviceBasedTimestamps.get();
+    if (!deviceBased) {
+        return timestamp.CPUTimeinNS;
+    }
+
+    auto &queueDevice = cmdQueue->getDevice();
+    auto &helper = HwHelper::get(queueDevice.getHardwareInfo().platform.eRenderCoreFamily);
+    const double timerResolution = queueDevice.getDeviceInfo().profilingTimerResolution;
+    return helper.getGpuTimeStampInNS(timestamp.GPUTimeStamp, timerResolution);
+}
+
 bool Event::calcProfilingData() {
     if (!dataCalculated && !profilingCpuPath) {
         if (timestampPacketContainer && timestampPacketContainer->peekNodes().size() > 0) {
@@ -294,23 +312,29 @@ bool Event::calcProfilingData() {
 }
 
 void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS) {
-
     uint64_t gpuDuration = 0;
     uint64_t cpuDuration = 0;
 
     uint64_t gpuCompleteDuration = 0;
     uint64_t cpuCompleteDuration = 0;
 
-    auto &hwHelper = HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
-    auto frequency = cmdQueue->getDevice().getDeviceInfo().profilingTimerResolution;
-    auto gpuTimeStamp = queueTimeStamp.GPUTimeStamp;
+    auto &device = this->cmdQueue->getDevice();
+    auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily);
+    auto frequency = device.getDeviceInfo().profilingTimerResolution;
+    auto gpuQueueTimeStamp = hwHelper.getGpuTimeStampInNS(queueTimeStamp.GPUTimeStamp, frequency);
 
-    int64_t c0 = queueTimeStamp.CPUTimeinNS - hwHelper.getGpuTimeStampInNS(gpuTimeStamp, frequency);
-
-    startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
-    if (startTimeStamp < queueTimeStamp.CPUTimeinNS) {
-        c0 += static_cast<uint64_t>((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency);
+    if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
+        startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency);
+        if (startTimeStamp < gpuQueueTimeStamp) {
+            startTimeStamp += static_cast<uint64_t>((1ULL << hwHelper.getGlobalTimeStampBits()) * frequency);
+        }
+    } else {
+        int64_t c0 = queueTimeStamp.CPUTimeinNS - gpuQueueTimeStamp;
         startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
+        if (startTimeStamp < queueTimeStamp.CPUTimeinNS) {
+            c0 += static_cast<uint64_t>((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency);
+            startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
+        }
     }
 
     /* calculation based on equation
diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h
index 06d47c3f67..eb17147f37 100644
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@@ -326,6 +326,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
         }
     }
 
+    uint64_t getTimeInNSFromTimestampData(const TimeStampData &timestamp) const;
     bool calcProfilingData();
     MOCKABLE_VIRTUAL void calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS);
     MOCKABLE_VIRTUAL void synchronizeTaskCount() {
diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp
index 9027425501..8c8b891e81 100644
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -390,14 +390,14 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
         commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
         false);                                                                //memoryMigrationRequired
 
-    UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled() && commandQueue.getContext().getRootDeviceIndices().size() == 1);
-
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
         eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
-    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
-    makeTimestampPacketsResident(commandStreamReceiver);
+    if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
+        eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
+        makeTimestampPacketsResident(commandStreamReceiver);
+    }
 
     gtpinNotifyPreFlushTask(&commandQueue);
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp
index e162a8d259..0b7de22ed6 100644
--- a/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_marker_tests.cpp
@@ -7,6 +7,7 @@
 
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
 
 #include "opencl/source/event/user_event.h"
 #include "opencl/test/unit_test/command_queue/command_enqueue_fixture.h"
@@ -252,3 +253,31 @@ HWTEST_F(MarkerTest, givenMarkerCallFollowingNdrangeCallInBatchedModeWhenWaitFor
     clReleaseEvent(eventFromMarker);
     clReleaseEvent(eventFromNdr);
 }
+
+struct MarkerWithProfilingTest : public MarkerTest { // MarkerTest variant that sets EnableTimestampPacket to 0 before the base fixture is set up
+    void SetUp() override {
+        dbgRestore = std::make_unique<DebugManagerStateRestore>(); // created before the flag is modified; presumably restores debug flags when destroyed — TODO confirm
+        DebugManager.flags.EnableTimestampPacket.set(0); // must precede MarkerTest::SetUp() so the base fixture sees the flag
+        MarkerTest::SetUp();
+    }
+
+    void TearDown() override {
+        MarkerTest::TearDown();
+        dbgRestore.reset(nullptr); // restorer is destroyed only after the base fixture has been torn down
+    }
+
+    std::unique_ptr<DebugManagerStateRestore> dbgRestore;
+};
+
+struct WhiteBoxCommandQueue : public CommandQueue { // test-only view of CommandQueue
+    using CommandQueue::isBlockedCommandStreamRequired; // re-exported with public access (struct default) so tests can call it
+};
+
+HWTEST_F(MarkerWithProfilingTest, givenMarkerWithProfilingAndBlockedEnqueueThenBlockedCommandStreamIsRequired) {
+    auto cmdQueueWB = static_cast<WhiteBoxCommandQueue *>(pCmdQ); // NOTE(review): cast is used only to reach the re-exported member; pCmdQ is presumably not created as a WhiteBoxCommandQueue
+    EventsRequest eventsRequest(0, nullptr, nullptr);
+
+    bool ret = cmdQueueWB->isBlockedCommandStreamRequired(CL_COMMAND_MARKER, eventsRequest, true, true); // blockedQueue = true, isMarkerWithProfiling = true
+
+    EXPECT_TRUE(ret);
+}
diff --git a/opencl/test/unit_test/device/device_timers_tests.cpp b/opencl/test/unit_test/device/device_timers_tests.cpp
index fab794fa19..3c2ca3db9e 100644
--- a/opencl/test/unit_test/device/device_timers_tests.cpp
+++ b/opencl/test/unit_test/device/device_timers_tests.cpp
@@ -5,6 +5,8 @@
  *
  */
 
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_ostime.h"
 
@@ -111,4 +113,106 @@ TEST(MockOSTime, GivenNullWhenSettingOsTimeThenResolutionIsZero) {
 
     delete mDev;
 }
+
+TEST(MockOSTime, givenDeviceTimestampBaseNotEnabledWhenGetDeviceAndHostTimerThenCpuTimestampIsReturned) {
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // mock reports constant CPU and GPU values
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS); // EnableDeviceBasedTimestamps is not modified by this test
+
+    EXPECT_EQ(deviceTS, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS); // device timestamp mirrors the host (CPU) value
+    EXPECT_EQ(deviceTS, hostTS);
+}
+
+TEST(MockOSTime, givenDeviceTimestampBaseEnabledWhenGetDeviceAndHostTimerThenGpuTimestampIsReturned) {
+    DebugManagerStateRestore dbgRestorer; // presumably restores debug flags at scope exit — TODO confirm
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new MockOSTimeWithConstTimestamp());
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_EQ(deviceTS, MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP); // NOTE(review): holds only if the effective timer resolution is 1.0 (the mock's value) — confirm
+    EXPECT_NE(deviceTS, hostTS); // the mock constants differ (GPU_TIMESTAMP = 2, CPU_TIME_IN_NS = 1)
+}
+
+class FailingMockOSTime : public OSTime { // OSTime stub whose getCpuTime() always reports failure
+  public:
+    FailingMockOSTime() {
+        this->deviceTime = std::make_unique<MockDeviceTime>();
+    }
+
+    bool getCpuTime(uint64_t *timeStamp) override {
+        return false; // fails without writing *timeStamp
+    }
+
+    double getHostTimerResolution() const override {
+        return 0;
+    }
+
+    uint64_t getCpuRawTimestamp() override {
+        return 0;
+    }
+};
+
+TEST(MockOSTime, givenFailingOSTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) {
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new FailingMockOSTime()); // getCpuTime() of this stub returns false
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    bool retVal = mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_FALSE(retVal);
+    EXPECT_EQ(deviceTS, 0u); // both outputs are expected to keep their initial values on failure
+    EXPECT_EQ(hostTS, 0u);
+}
+
+class FailingMockDeviceTime : public DeviceTime { // DeviceTime stub whose getCpuGpuTime() always reports failure
+  public:
+    bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override {
+        return false; // fails without writing *pGpuCpuTime
+    }
+
+    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
+        return 1.0;
+    }
+
+    uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
+        return static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); // 1e9 divided by the resolution; presumably ticks per second if the resolution is ns per tick — TODO confirm
+    }
+};
+
+class MockOSTimeWithFailingDeviceTime : public OSTime { // OSTime stub: CPU time succeeds, device time is a FailingMockDeviceTime
+  public:
+    MockOSTimeWithFailingDeviceTime() {
+        this->deviceTime = std::make_unique<FailingMockDeviceTime>();
+    }
+
+    bool getCpuTime(uint64_t *timeStamp) override {
+        return true; // reports success but does not write *timeStamp
+    }
+
+    double getHostTimerResolution() const override {
+        return 0;
+    }
+
+    uint64_t getCpuRawTimestamp() override {
+        return 0;
+    }
+};
+
+TEST(MockOSTime, givenFailingDeviceTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) {
+    auto mockDevice = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
+    mockDevice->setOSTime(new MockOSTimeWithFailingDeviceTime()); // CPU time succeeds, device time fails
+
+    uint64_t deviceTS = 0u, hostTS = 0u;
+    bool retVal = mockDevice->getDeviceAndHostTimer(&deviceTS, &hostTS);
+
+    EXPECT_FALSE(retVal);
+    EXPECT_EQ(deviceTS, 0u); // only the device timestamp is checked; hostTS is not asserted here
+}
+
 } // namespace ULT
diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp
index 9d3e60877b..dadf427d06 100644
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@@ -26,6 +26,7 @@
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_mdi.h"
 #include "opencl/test/unit_test/mocks/mock_memory_manager.h"
+#include "opencl/test/unit_test/mocks/mock_ostime.h"
 #include "opencl/test/unit_test/mocks/mock_platform.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
 #include "opencl/test/unit_test/os_interface/mock_performance_counters.h"
@@ -700,6 +701,90 @@ TEST_F(InternalsEventTest, GivenProfilingWhenUserEventCreatedThenProfilingNotSet
     EXPECT_FALSE(event.get()->isProfilingEnabled());
 }
 
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenGetEventProfilingInfoThenCpuTimestampIsReturned) {
+    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // mock reports constant CPU and GPU values
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_MARKER, 0, 0);
+
+    event.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(cmdQ)));
+
+    event.submitCommand(false); // presumably records the submit timestamp — TODO confirm
+    uint64_t submitTime = 0ULL;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, 0);
+
+    EXPECT_EQ(submitTime, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS); // EnableDeviceBasedTimestamps is not modified by this test: the CPU value is expected
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenGetEventProfilingInfoThenGpuTimestampIsReturned) {
+    DebugManagerStateRestore dbgRestorer; // presumably restores debug flags at scope exit — TODO confirm
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp()); // mock reports constant CPU and GPU values
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMMAND_MARKER, 0, 0);
+
+    event.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(cmdQ)));
+
+    event.submitCommand(false); // presumably records the submit timestamp — TODO confirm
+    uint64_t submitTime = 0ULL;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, 0);
+
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    EXPECT_EQ(submitTime, static_cast<uint64_t>(MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP * resolution)); // GPU ticks scaled by the device profiling timer resolution
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenCalculateStartTimestampThenCorrectTimeIsReturned) {
+    DebugManagerStateRestore dbgRestorer; // presumably restores debug flags at scope exit — TODO confirm
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
+
+    HwTimeStamps timestamp{};
+    timestamp.GlobalStartTS = 2; // start tick is greater than the queue tick below
+    event.queueTimeStamp.GPUTimeStamp = 1;
+    TagNode<HwTimeStamps> timestampNode{};
+    timestampNode.tagForCpuAccess = &timestamp;
+    event.timeStampNode = &timestampNode; // stack-allocated node; detached again at the end of the test
+
+    uint64_t start = 0u; // initialised so the comparison below never reads an indeterminate value if the query does not write it
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
+
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    EXPECT_EQ(start, static_cast<uint64_t>(timestamp.GlobalStartTS * resolution)); // start ticks scaled by the resolution, with no additional offset
+
+    event.timeStampNode = nullptr;
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWhenCalculateStartTimestampThenCorrectTimeIsReturned) {
+    DebugManagerStateRestore dbgRestorer; // presumably restores debug flags at scope exit — TODO confirm
+    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);
+
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
+
+    HwTimeStamps timestamp{};
+    timestamp.GlobalStartTS = 1; // start tick is smaller than the queue tick below
+    event.queueTimeStamp.GPUTimeStamp = 2;
+    TagNode<HwTimeStamps> timestampNode{};
+    timestampNode.tagForCpuAccess = &timestamp;
+    event.timeStampNode = &timestampNode; // stack-allocated node; detached again at the end of the test
+
+    uint64_t start = 0u;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
+
+    auto &hwHelper = HwHelper::get(pClDevice->getHardwareInfo().platform.eRenderCoreFamily);
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto refStartTime = static_cast<uint64_t>(timestamp.GlobalStartTS * resolution + (1ULL << hwHelper.getGlobalTimeStampBits()) * resolution); // expected value adds one full 2^bits counter period, scaled by the resolution
+    EXPECT_EQ(start, refStartTime);
+
+    event.timeStampNode = nullptr;
+}
+
 TEST_F(InternalsEventTest, GivenProfilingWHENMapOperationTHENTimesSet) {
     const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
     MockCommandQueue *pCmdQ = new MockCommandQueue(mockContext, pClDevice, props, false);
diff --git a/opencl/test/unit_test/mocks/mock_ostime.h b/opencl/test/unit_test/mocks/mock_ostime.h
index bf30b894c4..2a6197b4d3 100644
--- a/opencl/test/unit_test/mocks/mock_ostime.h
+++ b/opencl/test/unit_test/mocks/mock_ostime.h
@@ -47,4 +47,44 @@ class MockOSTime : public OSTime {
         return std::unique_ptr<OSTime>(new MockOSTime());
     }
 };
+
+class MockDeviceTimeWithConstTimestamp : public DeviceTime { // DeviceTime mock that always reports the same CPU/GPU pair
+  public:
+    static constexpr uint64_t CPU_TIME_IN_NS = 1u; // distinct from GPU_TIMESTAMP so tests can tell the two sources apart
+    static constexpr uint64_t GPU_TIMESTAMP = 2u;
+
+    bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override {
+        pGpuCpuTime->GPUTimeStamp = GPU_TIMESTAMP;
+        pGpuCpuTime->CPUTimeinNS = CPU_TIME_IN_NS;
+        return true; // always succeeds; osTime is unused
+    }
+
+    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
+        return 1.0;
+    }
+
+    uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
+        return static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); // 1e9 divided by the resolution; presumably ticks per second if the resolution is ns per tick — TODO confirm
+    }
+};
+
+class MockOSTimeWithConstTimestamp : public OSTime { // OSTime mock backed by MockDeviceTimeWithConstTimestamp
+  public:
+    MockOSTimeWithConstTimestamp() {
+        this->deviceTime = std::make_unique<MockDeviceTimeWithConstTimestamp>();
+    }
+
+    bool getCpuTime(uint64_t *timeStamp) override {
+        *timeStamp = MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS; // same CPU constant the device-time mock reports
+        return true;
+    }
+
+    double getHostTimerResolution() const override {
+        return 0;
+    }
+
+    uint64_t getCpuRawTimestamp() override {
+        return 0;
+    }
+};
 } // namespace NEO
diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp
index ef578203e9..d03266ab36 100644
--- a/opencl/test/unit_test/profiling/profiling_tests.cpp
+++ b/opencl/test/unit_test/profiling/profiling_tests.cpp
@@ -181,7 +181,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWa
     clReleaseEvent(event);
 }
 
-HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampDoesntHaveGPUTime) {
+HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampHasGPUTime) {
     MockKernel kernel(program.get(), kernelInfo, *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
@@ -203,8 +203,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNo
     auto mockEvent = static_cast<MockEvent<Event> *>(event);
     EXPECT_NE(0u, mockEvent->queueTimeStamp.GPUTimeStamp);
     EXPECT_NE(0u, mockEvent->queueTimeStamp.CPUTimeinNS);
+    EXPECT_LT(mockEvent->queueTimeStamp.GPUTimeStamp, mockEvent->submitTimeStamp.GPUTimeStamp);
     EXPECT_LT(mockEvent->queueTimeStamp.CPUTimeinNS, mockEvent->submitTimeStamp.CPUTimeinNS);
-    EXPECT_EQ(0u, mockEvent->submitTimeStamp.GPUTimeStamp);
 
     clReleaseEvent(event);
 }
@@ -455,6 +455,71 @@ HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath)
     eventObj->release();
 }
 
+HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenSetGpuPath) {
+    cl_event event = nullptr;
+    cl_event userEvent = new UserEvent(); // the marker waits on this user event, which is completed further below
+    pCmdQ->enqueueMarkerWithWaitList(1, &userEvent, &event);
+
+    auto eventObj = static_cast<Event *>(event);
+    EXPECT_FALSE(eventObj->isCPUProfilingPath()); // checked while the user event is still incomplete
+
+    auto userEventObj = static_cast<UserEvent *>(userEvent);
+
+    pCmdQ->flush();
+    userEventObj->setStatus(CL_COMPLETE); // completes the wait-list event
+    Event::waitForEvents(1, &event);
+
+    uint64_t queued = 0u, submit = 0u;
+    cl_int retVal;
+
+    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, 0);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, 0);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    EXPECT_LT(0u, queued);
+    EXPECT_LT(queued, submit); // submit must be strictly later than queued
+
+    eventObj->release();
+    userEventObj->release();
+}
+
+HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenPipeControlsArePresentInCS) {
+    typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
+
+    cl_event event = nullptr;
+    cl_event userEvent = new UserEvent(); // the marker waits on this user event, which is completed further below
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueMarkerWithWaitList(1, &userEvent, &event);
+
+    auto eventObj = static_cast<Event *>(event);
+    EXPECT_FALSE(eventObj->isCPUProfilingPath());
+
+    auto userEventObj = static_cast<UserEvent *>(userEvent);
+
+    pCmdQ->flush();
+    userEventObj->setStatus(CL_COMPLETE); // completes the wait-list event
+    Event::waitForEvents(1, &event);
+
+    parseCommands<FamilyType>(*pCmdQ);
+
+    // Check that two distinct PIPE_CONTROLs are present
+    auto itorFirstPC = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
+    ASSERT_NE(cmdList.end(), itorFirstPC);
+    auto pFirstPC = genCmdCast<PIPE_CONTROL *>(*itorFirstPC);
+    ASSERT_NE(nullptr, pFirstPC);
+
+    auto itorSecondPC = find<PIPE_CONTROL *>(++itorFirstPC, cmdList.end()); // resume one past the first hit; starting at the hit itself would return the same command
+    ASSERT_NE(cmdList.end(), itorSecondPC);
+    auto pSecondPC = genCmdCast<PIPE_CONTROL *>(*itorSecondPC);
+    ASSERT_NE(nullptr, pSecondPC);
+
+    EXPECT_TRUE(static_cast<MockEvent<Event> *>(event)->calcProfilingData());
+
+    eventObj->release();
+    userEventObj->release();
+    pCmdQ->isQueueBlocked(); // result unused; NOTE(review): presumably called for its side effect on the queue's blocked state — confirm
+}
+
 template <typename TagType>
 struct MockTagNode : public TagNode<TagType> {
   public:
diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config
index b09789e7b0..ecbbaba05b 100644
--- a/opencl/test/unit_test/test_files/igdrcl.config
+++ b/opencl/test/unit_test/test_files/igdrcl.config
@@ -176,6 +176,7 @@ EnableTimestampPacket = -1
 AllocateSharedAllocationsWithCpuAndGpuStorage = -1
 UseMaxSimdSizeToDeduceMaxWorkgroupSize = 0
 ReturnRawGpuTimestamps = 0
+EnableDeviceBasedTimestamps = 0
 ForcePerDssBackedBufferProgramming = 0
 MaxHwThreadsPercent = 0
 MinHwThreadsUnoccupied = 0
diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl
index 4f807ab33b..7150b905fc 100644
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UsmInitialPlacement, -1, "-1: default, 0: optimi
 DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.")
 DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
 DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
+DECLARE_DEBUG_VARIABLE(bool, EnableDeviceBasedTimestamps, false, "Driver returns timestamps in nanoseconds based on device timer.")
 DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")
 DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)")
 DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.")
diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp
index 2deb4d9af6..c93ba7b207 100644
--- a/shared/source/device/device.cpp
+++ b/shared/source/device/device.cpp
@@ -466,14 +466,18 @@ EngineControl &Device::getEngine(uint32_t index) {
 }
 
 bool Device::getDeviceAndHostTimer(uint64_t *deviceTimestamp, uint64_t *hostTimestamp) const {
-    TimeStampData queueTimeStamp;
-    bool retVal = getOSTime()->getCpuGpuTime(&queueTimeStamp);
+    bool retVal = getOSTime()->getCpuTime(hostTimestamp); // host time is read first; if it fails nothing else is queried
     if (retVal) {
-        uint64_t resolution = (uint64_t)getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
-        *deviceTimestamp = queueTimeStamp.GPUTimeStamp * resolution;
+        TimeStampData timeStamp;
+        retVal = getOSTime()->getCpuGpuTime(&timeStamp); // queried on both paths, so a device-time failure is reported even when the host value is reused
+        if (retVal) {
+            if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
+                auto resolution = getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
+                *deviceTimestamp = static_cast<uint64_t>(timeStamp.GPUTimeStamp * resolution); // GPU ticks scaled by the timer resolution, truncated to an integer
+            } else
+                *deviceTimestamp = *hostTimestamp; // flag off: device timestamp mirrors the host timestamp
+        }
     }
-
-    retVal = getOSTime()->getCpuTime(hostTimestamp);
     return retVal;
 }
 
diff --git a/shared/source/os_interface/os_time.h b/shared/source/os_interface/os_time.h
index 08be00ebf2..08f87d00b3 100644
--- a/shared/source/os_interface/os_time.h
+++ b/shared/source/os_interface/os_time.h
@@ -16,7 +16,7 @@ class OSInterface;
 struct HardwareInfo;
 
 struct TimeStampData {
-    uint64_t GPUTimeStamp; // GPU time in ns
+    uint64_t GPUTimeStamp; // GPU time in counter ticks; scale by the device timer resolution to get ns
     uint64_t CPUTimeinNS;  // CPU time in ns
 };