Fix event profiling for marker commands

Related-To: NEO-5799

Signed-off-by: Konstanty Misiak <konstanty.misiak@intel.com>
This commit is contained in:
Konstanty Misiak
2021-06-22 13:16:27 +00:00
committed by Compute-Runtime-Automation
parent 816e95443f
commit ad19eda689
16 changed files with 389 additions and 35 deletions

View File

@@ -780,12 +780,12 @@ bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *r
return blitEnqueueImageAllowed;
}
bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const {
bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const {
if (!blockedQueue) {
return false;
}
if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) {
if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) {
return true;
}

View File

@@ -342,7 +342,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest);
virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){};
bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const;
bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const;
MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, bool blitEnqueue);
void storeProperties(const cl_queue_properties *properties);

View File

@@ -461,7 +461,7 @@ class CommandQueueHw : public CommandQueue {
bool profilingRequired = (this->isProfilingEnabled() && eventsRequest.outEvent);
bool perfCountersRequired = (this->isPerfCountersEnabled() && eventsRequest.outEvent);
if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) {
if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue, isMarkerWithProfiling)) {
constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
commandStream = new LinearStream();

View File

@@ -812,7 +812,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
TimeStampData submitTimeStamp = {};
if (isProfilingEnabled() && eventBuilder.getEvent()) {
this->getDevice().getOSTime()->getCpuTime(&submitTimeStamp.CPUTimeinNS);
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();

View File

@@ -160,21 +160,19 @@ cl_int Event::getEventProfilingInfo(cl_profiling_info paramName,
return CL_PROFILING_INFO_NOT_AVAILABLE;
}
uint64_t timestamp = 0u;
// if paramValue is NULL, it is ignored
switch (paramName) {
case CL_PROFILING_COMMAND_QUEUED:
src = &queueTimeStamp.CPUTimeinNS;
if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
src = &queueTimeStamp.GPUTimeStamp;
}
timestamp = getTimeInNSFromTimestampData(queueTimeStamp);
src = &timestamp;
srcSize = sizeof(cl_ulong);
break;
case CL_PROFILING_COMMAND_SUBMIT:
src = &submitTimeStamp.CPUTimeinNS;
if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
src = &submitTimeStamp.GPUTimeStamp;
}
timestamp = getTimeInNSFromTimestampData(submitTimeStamp);
src = &timestamp;
srcSize = sizeof(cl_ulong);
break;
@@ -249,6 +247,26 @@ cl_ulong Event::getDelta(cl_ulong startTime,
return Delta;
}
// Picks the value reported for a profiling query out of a captured TimeStampData.
// Selection order:
//   1. CPU profiling path               -> CPUTimeinNS
//   2. ReturnRawGpuTimestamps flag set  -> GPUTimeStamp, unconverted
//   3. EnableDeviceBasedTimestamps flag set and a command queue attached
//                                       -> GPUTimeStamp passed through HwHelper::getGpuTimeStampInNS
//                                          together with the device's profilingTimerResolution
//   4. otherwise                        -> CPUTimeinNS
uint64_t Event::getTimeInNSFromTimestampData(const TimeStampData &timestamp) const {
    if (!isCPUProfilingPath()) {
        if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
            return timestamp.GPUTimeStamp;
        }

        const bool deviceBased = cmdQueue && DebugManager.flags.EnableDeviceBasedTimestamps.get();
        if (deviceBased) {
            auto &queueDevice = cmdQueue->getDevice();
            auto &coreHelper = HwHelper::get(queueDevice.getHardwareInfo().platform.eRenderCoreFamily);
            double timerResolution = queueDevice.getDeviceInfo().profilingTimerResolution;
            return coreHelper.getGpuTimeStampInNS(timestamp.GPUTimeStamp, timerResolution);
        }
    }

    // CPU profiling path, or no GPU-based flag applies.
    return timestamp.CPUTimeinNS;
}
bool Event::calcProfilingData() {
if (!dataCalculated && !profilingCpuPath) {
if (timestampPacketContainer && timestampPacketContainer->peekNodes().size() > 0) {
@@ -294,24 +312,30 @@ bool Event::calcProfilingData() {
}
void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS) {
uint64_t gpuDuration = 0;
uint64_t cpuDuration = 0;
uint64_t gpuCompleteDuration = 0;
uint64_t cpuCompleteDuration = 0;
auto &hwHelper = HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
auto frequency = cmdQueue->getDevice().getDeviceInfo().profilingTimerResolution;
auto gpuTimeStamp = queueTimeStamp.GPUTimeStamp;
int64_t c0 = queueTimeStamp.CPUTimeinNS - hwHelper.getGpuTimeStampInNS(gpuTimeStamp, frequency);
auto &device = this->cmdQueue->getDevice();
auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily);
auto frequency = device.getDeviceInfo().profilingTimerResolution;
auto gpuQueueTimeStamp = hwHelper.getGpuTimeStampInNS(queueTimeStamp.GPUTimeStamp, frequency);
if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency);
if (startTimeStamp < gpuQueueTimeStamp) {
startTimeStamp += static_cast<uint64_t>((1ULL << hwHelper.getGlobalTimeStampBits()) * frequency);
}
} else {
int64_t c0 = queueTimeStamp.CPUTimeinNS - gpuQueueTimeStamp;
startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
if (startTimeStamp < queueTimeStamp.CPUTimeinNS) {
c0 += static_cast<uint64_t>((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency);
startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
}
}
/* calculation based on equation
CpuTime = GpuTime * scalar + const( == c0)

View File

@@ -326,6 +326,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
}
}
uint64_t getTimeInNSFromTimestampData(const TimeStampData &timestamp) const;
bool calcProfilingData();
MOCKABLE_VIRTUAL void calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS);
MOCKABLE_VIRTUAL void synchronizeTaskCount() {

View File

@@ -390,14 +390,14 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
false); //memoryMigrationRequired
UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled() && commandQueue.getContext().getRootDeviceIndices().size() == 1);
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
}
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
makeTimestampPacketsResident(commandStreamReceiver);
}
gtpinNotifyPreFlushTask(&commandQueue);

View File

@@ -7,6 +7,7 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "opencl/source/event/user_event.h"
#include "opencl/test/unit_test/command_queue/command_enqueue_fixture.h"
@@ -252,3 +253,31 @@ HWTEST_F(MarkerTest, givenMarkerCallFollowingNdrangeCallInBatchedModeWhenWaitFor
clReleaseEvent(eventFromMarker);
clReleaseEvent(eventFromNdr);
}
// MarkerTest variant that sets the EnableTimestampPacket debug flag to 0 before the base fixture
// is set up. The DebugManagerStateRestore object is created before the flag is touched and is
// destroyed only after the base fixture has been torn down.
struct MarkerWithProfilingTest : public MarkerTest {
    std::unique_ptr<DebugManagerStateRestore> dbgRestore;

    void SetUp() override {
        dbgRestore = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.EnableTimestampPacket.set(0);

        MarkerTest::SetUp();
    }

    void TearDown() override {
        MarkerTest::TearDown();

        dbgRestore.reset();
    }
};
// Test-only view of CommandQueue: the using-declaration re-declares
// isBlockedCommandStreamRequired in this struct's default (public) access section so a test can
// call it directly. NOTE(review): presumably the member is non-public in CommandQueue - confirm.
struct WhiteBoxCommandQueue : public CommandQueue {
using CommandQueue::isBlockedCommandStreamRequired;
};
// A marker enqueued with profiling on a blocked queue must request a blocked command stream.
HWTEST_F(MarkerWithProfilingTest, givenMarkerWithProfilingAndBlockedEnqueueThenBlockedCommandStreamIsRequired) {
    EventsRequest emptyRequest(0, nullptr, nullptr);
    auto whiteBoxQueue = static_cast<WhiteBoxCommandQueue *>(pCmdQ);

    constexpr bool blockedQueue = true;
    constexpr bool isMarkerWithProfiling = true;

    EXPECT_TRUE(whiteBoxQueue->isBlockedCommandStreamRequired(CL_COMMAND_MARKER, emptyRequest, blockedQueue, isMarkerWithProfiling));
}

View File

@@ -5,6 +5,8 @@
*
*/
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/mocks/mock_ostime.h"
@@ -111,4 +113,106 @@ TEST(MockOSTime, GivenNullWhenSettingOsTimeThenResolutionIsZero) {
delete mDev;
}
// With EnableDeviceBasedTimestamps left at its default, the device timestamp is expected to
// mirror the host (CPU) timestamp supplied by the constant-value mock.
TEST(MockOSTime, givenDeviceTimestampBaseNotEnabledWhenGetDeviceAndHostTimerThenCpuTimestampIsReturned) {
    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    device->setOSTime(new MockOSTimeWithConstTimestamp());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_EQ(deviceTimestamp, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS);
    EXPECT_EQ(deviceTimestamp, hostTimestamp);
}
// With EnableDeviceBasedTimestamps set, the device timestamp is expected to equal the mock's
// GPU_TIMESTAMP (the mock reports a dynamic timer resolution of 1.0) and to differ from the host one.
TEST(MockOSTime, givenDeviceTimestampBaseEnabledWhenGetDeviceAndHostTimerThenGpuTimestampIsReturned) {
    DebugManagerStateRestore dbgRestorer;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    device->setOSTime(new MockOSTimeWithConstTimestamp());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_EQ(deviceTimestamp, MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
    EXPECT_NE(deviceTimestamp, hostTimestamp);
}
// OSTime stub whose getCpuTime always returns false and never writes to its output argument.
// The remaining overrides return 0; the device time is a plain MockDeviceTime.
class FailingMockOSTime : public OSTime {
  public:
    FailingMockOSTime() {
        this->deviceTime = std::make_unique<MockDeviceTime>();
    }

    double getHostTimerResolution() const override { return 0; }

    uint64_t getCpuRawTimestamp() override { return 0; }

    bool getCpuTime(uint64_t *timeStamp) override { return false; }
};
// When the host time query fails, getDeviceAndHostTimer must report failure and leave both
// output timestamps at their initial value.
TEST(MockOSTime, givenFailingOSTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) {
    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    device->setOSTime(new FailingMockOSTime());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    const bool succeeded = device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_FALSE(succeeded);
    EXPECT_EQ(deviceTimestamp, 0u);
    EXPECT_EQ(hostTimestamp, 0u);
}
// DeviceTime stub whose getCpuGpuTime always returns false without touching its output.
// Resolution is fixed at 1.0; the clock is derived from OSTime::getDeviceTimerResolution.
class FailingMockDeviceTime : public DeviceTime {
  public:
    bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override { return false; }

    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { return 1.0; }

    uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
        const auto timerResolution = OSTime::getDeviceTimerResolution(hwInfo);
        return static_cast<uint64_t>(1000000000.0 / timerResolution);
    }
};
// OSTime stub whose host-side getCpuTime reports success (without writing a value) while its
// device time is a FailingMockDeviceTime.
class MockOSTimeWithFailingDeviceTime : public OSTime {
  public:
    MockOSTimeWithFailingDeviceTime() {
        this->deviceTime = std::make_unique<FailingMockDeviceTime>();
    }

    double getHostTimerResolution() const override { return 0; }

    uint64_t getCpuRawTimestamp() override { return 0; }

    bool getCpuTime(uint64_t *timeStamp) override { return true; }
};
// When the device time query fails, getDeviceAndHostTimer must report failure and leave the
// device timestamp at its initial value.
TEST(MockOSTime, givenFailingDeviceTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) {
    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    device->setOSTime(new MockOSTimeWithFailingDeviceTime());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    const bool succeeded = device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_FALSE(succeeded);
    EXPECT_EQ(deviceTimestamp, 0u);
}
} // namespace ULT

View File

@@ -26,6 +26,7 @@
#include "opencl/test/unit_test/mocks/mock_kernel.h"
#include "opencl/test/unit_test/mocks/mock_mdi.h"
#include "opencl/test/unit_test/mocks/mock_memory_manager.h"
#include "opencl/test/unit_test/mocks/mock_ostime.h"
#include "opencl/test/unit_test/mocks/mock_platform.h"
#include "opencl/test/unit_test/mocks/mock_program.h"
#include "opencl/test/unit_test/os_interface/mock_performance_counters.h"
@@ -700,6 +701,90 @@ TEST_F(InternalsEventTest, GivenProfilingWhenUserEventCreatedThenProfilingNotSet
EXPECT_FALSE(event.get()->isProfilingEnabled());
}
// With EnableDeviceBasedTimestamps left at its default, the SUBMIT profiling value of a marker
// event is expected to be the CPU time supplied by the constant-value mock.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenGetEventProfilingInfoThenCpuTimestampIsReturned) {
    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp());

    const cl_queue_properties queueProperties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue profilingQueue(mockContext, pClDevice, queueProperties, false);

    MockEvent<Event> markerEvent(&profilingQueue, CL_COMMAND_MARKER, 0, 0);
    markerEvent.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(profilingQueue)));
    markerEvent.submitCommand(false);

    uint64_t submitTimestamp = 0ULL;
    markerEvent.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTimestamp, nullptr);

    EXPECT_EQ(submitTimestamp, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS);
}
// With EnableDeviceBasedTimestamps set, the SUBMIT profiling value of a marker event is expected
// to be the mock's GPU timestamp scaled by the device's profilingTimerResolution.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenGetEventProfilingInfoThenGpuTimestampIsReturned) {
    DebugManagerStateRestore dbgRestorer;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp());

    const cl_queue_properties queueProperties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue profilingQueue(mockContext, pClDevice, queueProperties, false);

    MockEvent<Event> markerEvent(&profilingQueue, CL_COMMAND_MARKER, 0, 0);
    markerEvent.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(profilingQueue)));
    markerEvent.submitCommand(false);

    uint64_t submitTimestamp = 0ULL;
    markerEvent.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTimestamp, nullptr);

    auto timerResolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
    EXPECT_EQ(submitTimestamp, static_cast<uint64_t>(MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP * timerResolution));
}
// With EnableDeviceBasedTimestamps set and GlobalStartTS (2) ahead of the queue GPU timestamp (1),
// the START profiling value is expected to be GlobalStartTS scaled by the device's
// profilingTimerResolution, with no wrap-around correction.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenCalculateStartTimestampThenCorrectTimeIsReturned) {
    DebugManagerStateRestore dbgRestorer;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);

    HwTimeStamps timestamp{};
    timestamp.GlobalStartTS = 2;
    event.queueTimeStamp.GPUTimeStamp = 1;

    // Stack-allocated node handed to the event for the duration of the query only.
    TagNode<HwTimeStamps> timestampNode{};
    timestampNode.tagForCpuAccess = &timestamp;
    event.timeStampNode = &timestampNode;

    // Initialized so the comparison below never reads an indeterminate value if the query
    // returns without writing the output (matches the sibling wrap-around test).
    uint64_t start = 0u;
    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);

    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
    EXPECT_EQ(start, static_cast<uint64_t>(timestamp.GlobalStartTS * resolution));

    // Detach the stack node before the event goes out of scope.
    event.timeStampNode = nullptr;
}
// With EnableDeviceBasedTimestamps set and GlobalStartTS (1) behind the queue GPU timestamp (2),
// the START profiling value is expected to include one full counter period
// (1 << getGlobalTimeStampBits(), scaled by the resolution) on top of the scaled GlobalStartTS.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWhenCalculateStartTimestampThenCorrectTimeIsReturned) {
    DebugManagerStateRestore dbgRestorer;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    const cl_queue_properties queueProperties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue profilingQueue(mockContext, pClDevice, queueProperties, false);
    MockEvent<Event> completedEvent(&profilingQueue, CL_COMPLETE, 0, 0);

    HwTimeStamps hwTimestamps{};
    hwTimestamps.GlobalStartTS = 1;
    completedEvent.queueTimeStamp.GPUTimeStamp = 2;

    // Stack-allocated node handed to the event for the duration of the query only.
    TagNode<HwTimeStamps> stackNode{};
    stackNode.tagForCpuAccess = &hwTimestamps;
    completedEvent.timeStampNode = &stackNode;

    uint64_t startTimestamp = 0u;
    completedEvent.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTimestamp, nullptr);

    auto &coreHelper = HwHelper::get(pClDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
    auto expectedStart = static_cast<uint64_t>(hwTimestamps.GlobalStartTS * resolution + (1ULL << coreHelper.getGlobalTimeStampBits()) * resolution);
    EXPECT_EQ(startTimestamp, expectedStart);

    // Detach the stack node before the event goes out of scope.
    completedEvent.timeStampNode = nullptr;
}
TEST_F(InternalsEventTest, GivenProfilingWHENMapOperationTHENTimesSet) {
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue *pCmdQ = new MockCommandQueue(mockContext, pClDevice, props, false);

View File

@@ -47,4 +47,44 @@ class MockOSTime : public OSTime {
return std::unique_ptr<OSTime>(new MockOSTime());
}
};
// DeviceTime mock that always succeeds and reports fixed values: CPU_TIME_IN_NS for the CPU
// field and GPU_TIMESTAMP for the GPU field. Resolution is fixed at 1.0; the clock is derived
// from OSTime::getDeviceTimerResolution.
class MockDeviceTimeWithConstTimestamp : public DeviceTime {
  public:
    static constexpr uint64_t CPU_TIME_IN_NS = 1u;
    static constexpr uint64_t GPU_TIMESTAMP = 2u;

    bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override {
        pGpuCpuTime->CPUTimeinNS = CPU_TIME_IN_NS;
        pGpuCpuTime->GPUTimeStamp = GPU_TIMESTAMP;
        return true;
    }

    double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { return 1.0; }

    uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
        const auto timerResolution = OSTime::getDeviceTimerResolution(hwInfo);
        return static_cast<uint64_t>(1000000000.0 / timerResolution);
    }
};
// OSTime mock backed by MockDeviceTimeWithConstTimestamp; getCpuTime always succeeds and writes
// the same CPU_TIME_IN_NS constant. The remaining overrides return 0.
class MockOSTimeWithConstTimestamp : public OSTime {
  public:
    MockOSTimeWithConstTimestamp() {
        this->deviceTime = std::make_unique<MockDeviceTimeWithConstTimestamp>();
    }

    double getHostTimerResolution() const override { return 0; }

    uint64_t getCpuRawTimestamp() override { return 0; }

    bool getCpuTime(uint64_t *timeStamp) override {
        *timeStamp = MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS;
        return true;
    }
};
} // namespace NEO

View File

@@ -181,7 +181,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWa
clReleaseEvent(event);
}
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampDoesntHaveGPUTime) {
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampHasGPUTime) {
MockKernel kernel(program.get(), kernelInfo, *pClDevice);
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
@@ -203,8 +203,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNo
auto mockEvent = static_cast<MockEvent<Event> *>(event);
EXPECT_NE(0u, mockEvent->queueTimeStamp.GPUTimeStamp);
EXPECT_NE(0u, mockEvent->queueTimeStamp.CPUTimeinNS);
EXPECT_LT(mockEvent->queueTimeStamp.GPUTimeStamp, mockEvent->submitTimeStamp.GPUTimeStamp);
EXPECT_LT(mockEvent->queueTimeStamp.CPUTimeinNS, mockEvent->submitTimeStamp.CPUTimeinNS);
EXPECT_EQ(0u, mockEvent->submitTimeStamp.GPUTimeStamp);
clReleaseEvent(event);
}
@@ -455,6 +455,71 @@ HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath)
eventObj->release();
}
// A marker enqueued behind an unsignalled user event must use the GPU profiling path, and once
// unblocked its QUEUED timestamp must be non-zero and strictly earlier than its SUBMIT timestamp.
HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenSetGpuPath) {
    cl_event event = nullptr;
    cl_event userEvent = new UserEvent();

    // Fail fast instead of dereferencing a null output event when the enqueue itself fails.
    cl_int retVal = pCmdQ->enqueueMarkerWithWaitList(1, &userEvent, &event);
    ASSERT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, event);

    auto eventObj = static_cast<Event *>(event);
    EXPECT_FALSE(eventObj->isCPUProfilingPath());

    // Unblock the marker by completing the user event, then wait for the marker itself.
    auto userEventObj = static_cast<UserEvent *>(userEvent);
    pCmdQ->flush();
    userEventObj->setStatus(CL_COMPLETE);
    Event::waitForEvents(1, &event);

    uint64_t queued = 0u, submit = 0u;
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queued, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = eventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submit, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

    EXPECT_LT(0u, queued);
    EXPECT_LT(queued, submit);

    eventObj->release();
    userEventObj->release();
}
// A marker enqueued behind an unsignalled user event must, once unblocked, leave two distinct
// PIPE_CONTROL commands in the parsed command stream and yield computable profiling data.
HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenPipeControlsArePresentInCS) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    cl_event event = nullptr;
    cl_event userEvent = new UserEvent();

    // Fail fast instead of dereferencing a null output event when the enqueue itself fails.
    cl_int retVal = static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueMarkerWithWaitList(1, &userEvent, &event);
    ASSERT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, event);

    auto eventObj = static_cast<Event *>(event);
    EXPECT_FALSE(eventObj->isCPUProfilingPath());

    // Unblock the marker by completing the user event, then wait for the marker itself.
    auto userEventObj = static_cast<UserEvent *>(userEvent);
    pCmdQ->flush();
    userEventObj->setStatus(CL_COMPLETE);
    Event::waitForEvents(1, &event);

    parseCommands<FamilyType>(*pCmdQ);

    // Check PIPE_CONTROLs
    auto itorFirstPC = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorFirstPC);
    auto pFirstPC = genCmdCast<PIPE_CONTROL *>(*itorFirstPC);
    ASSERT_NE(nullptr, pFirstPC);

    // Resume the search one element past the first match. Searching from itorFirstPC itself
    // would presumably hand back the very same PIPE_CONTROL and make the second check vacuous.
    auto itorSearchStart = itorFirstPC;
    ++itorSearchStart;
    auto itorSecondPC = find<PIPE_CONTROL *>(itorSearchStart, cmdList.end());
    ASSERT_NE(cmdList.end(), itorSecondPC);
    auto pSecondPC = genCmdCast<PIPE_CONTROL *>(*itorSecondPC);
    ASSERT_NE(nullptr, pSecondPC);

    EXPECT_TRUE(static_cast<MockEvent<Event> *>(event)->calcProfilingData());

    eventObj->release();
    userEventObj->release();

    // Return value intentionally unused.
    // NOTE(review): presumably called for its side effects on the queue - confirm.
    pCmdQ->isQueueBlocked();
}
template <typename TagType>
struct MockTagNode : public TagNode<TagType> {
public:

View File

@@ -176,6 +176,7 @@ EnableTimestampPacket = -1
AllocateSharedAllocationsWithCpuAndGpuStorage = -1
UseMaxSimdSizeToDeduceMaxWorkgroupSize = 0
ReturnRawGpuTimestamps = 0
EnableDeviceBasedTimestamps = 0
ForcePerDssBackedBufferProgramming = 0
MaxHwThreadsPercent = 0
MinHwThreadsUnoccupied = 0

View File

@@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UsmInitialPlacement, -1, "-1: default, 0: optimi
DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.")
DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
DECLARE_DEBUG_VARIABLE(bool, EnableDeviceBasedTimestamps, false, "Driver returns timestamps in nanoseconds based on device timer.")
DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")
DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)")
DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.")

View File

@@ -466,14 +466,18 @@ EngineControl &Device::getEngine(uint32_t index) {
}
bool Device::getDeviceAndHostTimer(uint64_t *deviceTimestamp, uint64_t *hostTimestamp) const {
TimeStampData queueTimeStamp;
bool retVal = getOSTime()->getCpuGpuTime(&queueTimeStamp);
bool retVal = getOSTime()->getCpuTime(hostTimestamp);
if (retVal) {
uint64_t resolution = (uint64_t)getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
*deviceTimestamp = queueTimeStamp.GPUTimeStamp * resolution;
TimeStampData timeStamp;
retVal = getOSTime()->getCpuGpuTime(&timeStamp);
if (retVal) {
if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
auto resolution = getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
*deviceTimestamp = static_cast<uint64_t>(timeStamp.GPUTimeStamp * resolution);
} else
*deviceTimestamp = *hostTimestamp;
}
}
retVal = getOSTime()->getCpuTime(hostTimestamp);
return retVal;
}

View File

@@ -16,7 +16,7 @@ class OSInterface;
struct HardwareInfo;
struct TimeStampData {
uint64_t GPUTimeStamp; // GPU time in ns
uint64_t GPUTimeStamp; // GPU time in counter ticks
uint64_t CPUTimeinNS; // CPU time in ns
};