mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 22:12:59 +08:00
Fix event profiling for marker commands
Related-To: NEO-5799 Signed-off-by: Konstanty Misiak <konstanty.misiak@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
816e95443f
commit
ad19eda689
@@ -780,12 +780,12 @@ bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *r
|
||||
return blitEnqueueImageAllowed;
|
||||
}
|
||||
|
||||
bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const {
|
||||
bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const {
|
||||
if (!blockedQueue) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType)) {
|
||||
if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -342,7 +342,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
cl_int enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest);
|
||||
|
||||
virtual void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType){};
|
||||
bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue) const;
|
||||
bool isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const;
|
||||
|
||||
MOCKABLE_VIRTUAL void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, bool blitEnqueue);
|
||||
void storeProperties(const cl_queue_properties *properties);
|
||||
|
||||
@@ -461,7 +461,7 @@ class CommandQueueHw : public CommandQueue {
|
||||
bool profilingRequired = (this->isProfilingEnabled() && eventsRequest.outEvent);
|
||||
bool perfCountersRequired = (this->isPerfCountersEnabled() && eventsRequest.outEvent);
|
||||
|
||||
if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue)) {
|
||||
if (isBlockedCommandStreamRequired(commandType, eventsRequest, blockedQueue, isMarkerWithProfiling)) {
|
||||
constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
|
||||
constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;
|
||||
commandStream = new LinearStream();
|
||||
|
||||
@@ -812,7 +812,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
|
||||
|
||||
TimeStampData submitTimeStamp = {};
|
||||
if (isProfilingEnabled() && eventBuilder.getEvent()) {
|
||||
this->getDevice().getOSTime()->getCpuTime(&submitTimeStamp.CPUTimeinNS);
|
||||
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
|
||||
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
|
||||
|
||||
auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
|
||||
|
||||
@@ -160,21 +160,19 @@ cl_int Event::getEventProfilingInfo(cl_profiling_info paramName,
|
||||
return CL_PROFILING_INFO_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
uint64_t timestamp = 0u;
|
||||
|
||||
// if paramValue is NULL, it is ignored
|
||||
switch (paramName) {
|
||||
case CL_PROFILING_COMMAND_QUEUED:
|
||||
src = &queueTimeStamp.CPUTimeinNS;
|
||||
if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
|
||||
src = &queueTimeStamp.GPUTimeStamp;
|
||||
}
|
||||
timestamp = getTimeInNSFromTimestampData(queueTimeStamp);
|
||||
src = ×tamp;
|
||||
srcSize = sizeof(cl_ulong);
|
||||
break;
|
||||
|
||||
case CL_PROFILING_COMMAND_SUBMIT:
|
||||
src = &submitTimeStamp.CPUTimeinNS;
|
||||
if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
|
||||
src = &submitTimeStamp.GPUTimeStamp;
|
||||
}
|
||||
timestamp = getTimeInNSFromTimestampData(submitTimeStamp);
|
||||
src = ×tamp;
|
||||
srcSize = sizeof(cl_ulong);
|
||||
break;
|
||||
|
||||
@@ -249,6 +247,26 @@ cl_ulong Event::getDelta(cl_ulong startTime,
|
||||
return Delta;
|
||||
}
|
||||
|
||||
uint64_t Event::getTimeInNSFromTimestampData(const TimeStampData ×tamp) const {
|
||||
if (isCPUProfilingPath()) {
|
||||
return timestamp.CPUTimeinNS;
|
||||
}
|
||||
|
||||
if (DebugManager.flags.ReturnRawGpuTimestamps.get()) {
|
||||
return timestamp.GPUTimeStamp;
|
||||
}
|
||||
|
||||
if (cmdQueue && DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
|
||||
auto &device = cmdQueue->getDevice();
|
||||
auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily);
|
||||
double resolution = device.getDeviceInfo().profilingTimerResolution;
|
||||
|
||||
return hwHelper.getGpuTimeStampInNS(timestamp.GPUTimeStamp, resolution);
|
||||
}
|
||||
|
||||
return timestamp.CPUTimeinNS;
|
||||
}
|
||||
|
||||
bool Event::calcProfilingData() {
|
||||
if (!dataCalculated && !profilingCpuPath) {
|
||||
if (timestampPacketContainer && timestampPacketContainer->peekNodes().size() > 0) {
|
||||
@@ -294,24 +312,30 @@ bool Event::calcProfilingData() {
|
||||
}
|
||||
|
||||
void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS) {
|
||||
|
||||
uint64_t gpuDuration = 0;
|
||||
uint64_t cpuDuration = 0;
|
||||
|
||||
uint64_t gpuCompleteDuration = 0;
|
||||
uint64_t cpuCompleteDuration = 0;
|
||||
|
||||
auto &hwHelper = HwHelper::get(this->cmdQueue->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
|
||||
auto frequency = cmdQueue->getDevice().getDeviceInfo().profilingTimerResolution;
|
||||
auto gpuTimeStamp = queueTimeStamp.GPUTimeStamp;
|
||||
|
||||
int64_t c0 = queueTimeStamp.CPUTimeinNS - hwHelper.getGpuTimeStampInNS(gpuTimeStamp, frequency);
|
||||
auto &device = this->cmdQueue->getDevice();
|
||||
auto &hwHelper = HwHelper::get(device.getHardwareInfo().platform.eRenderCoreFamily);
|
||||
auto frequency = device.getDeviceInfo().profilingTimerResolution;
|
||||
auto gpuQueueTimeStamp = hwHelper.getGpuTimeStampInNS(queueTimeStamp.GPUTimeStamp, frequency);
|
||||
|
||||
if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
|
||||
startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency);
|
||||
if (startTimeStamp < gpuQueueTimeStamp) {
|
||||
startTimeStamp += static_cast<uint64_t>((1ULL << hwHelper.getGlobalTimeStampBits()) * frequency);
|
||||
}
|
||||
} else {
|
||||
int64_t c0 = queueTimeStamp.CPUTimeinNS - gpuQueueTimeStamp;
|
||||
startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
|
||||
if (startTimeStamp < queueTimeStamp.CPUTimeinNS) {
|
||||
c0 += static_cast<uint64_t>((1ULL << (hwHelper.getGlobalTimeStampBits())) * frequency);
|
||||
startTimeStamp = static_cast<uint64_t>(globalStartTS * frequency) + c0;
|
||||
}
|
||||
}
|
||||
|
||||
/* calculation based on equation
|
||||
CpuTime = GpuTime * scalar + const( == c0)
|
||||
|
||||
@@ -326,6 +326,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t getTimeInNSFromTimestampData(const TimeStampData ×tamp) const;
|
||||
bool calcProfilingData();
|
||||
MOCKABLE_VIRTUAL void calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t contextEndTS, uint64_t *contextCompleteTS, uint64_t globalStartTS);
|
||||
MOCKABLE_VIRTUAL void synchronizeTaskCount() {
|
||||
|
||||
@@ -390,14 +390,14 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
|
||||
commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
|
||||
false); //memoryMigrationRequired
|
||||
|
||||
UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled() && commandQueue.getContext().getRootDeviceIndices().size() == 1);
|
||||
|
||||
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
|
||||
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
|
||||
}
|
||||
|
||||
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
|
||||
makeTimestampPacketsResident(commandStreamReceiver);
|
||||
}
|
||||
|
||||
gtpinNotifyPreFlushTask(&commandQueue);
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
|
||||
#include "opencl/source/event/user_event.h"
|
||||
#include "opencl/test/unit_test/command_queue/command_enqueue_fixture.h"
|
||||
@@ -252,3 +253,31 @@ HWTEST_F(MarkerTest, givenMarkerCallFollowingNdrangeCallInBatchedModeWhenWaitFor
|
||||
clReleaseEvent(eventFromMarker);
|
||||
clReleaseEvent(eventFromNdr);
|
||||
}
|
||||
|
||||
struct MarkerWithProfilingTest : public MarkerTest {
|
||||
void SetUp() override {
|
||||
dbgRestore = std::make_unique<DebugManagerStateRestore>();
|
||||
DebugManager.flags.EnableTimestampPacket.set(0);
|
||||
MarkerTest::SetUp();
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
MarkerTest::TearDown();
|
||||
dbgRestore.reset(nullptr);
|
||||
}
|
||||
|
||||
std::unique_ptr<DebugManagerStateRestore> dbgRestore;
|
||||
};
|
||||
|
||||
struct WhiteBoxCommandQueue : public CommandQueue {
|
||||
using CommandQueue::isBlockedCommandStreamRequired;
|
||||
};
|
||||
|
||||
// A marker enqueued on a blocked queue with profiling requested must report that a
// blocked command stream is required.
HWTEST_F(MarkerWithProfilingTest, givenMarkerWithProfilingAndBlockedEnqueueThenBlockedCommandStreamIsRequired) {
    EventsRequest emptyRequest(0, nullptr, nullptr);
    auto *whiteBoxQueue = static_cast<WhiteBoxCommandQueue *>(pCmdQ);

    const bool blockedQueue = true;
    const bool isMarkerWithProfiling = true;
    const bool streamRequired = whiteBoxQueue->isBlockedCommandStreamRequired(CL_COMMAND_MARKER, emptyRequest, blockedQueue, isMarkerWithProfiling);

    EXPECT_TRUE(streamRequired);
}
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
|
||||
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_ostime.h"
|
||||
|
||||
@@ -111,4 +113,106 @@ TEST(MockOSTime, GivenNullWhenSettingOsTimeThenResolutionIsZero) {
|
||||
|
||||
delete mDev;
|
||||
}
|
||||
|
||||
// With EnableDeviceBasedTimestamps untouched, the device timestamp is expected to equal
// the constant CPU time supplied by the mock and to match the host timestamp.
TEST(MockOSTime, givenDeviceTimestampBaseNotEnabledWhenGetDeviceAndHostTimerThenCpuTimestampIsReturned) {
    std::unique_ptr<MockDevice> device{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)};
    device->setOSTime(new MockOSTimeWithConstTimestamp());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_EQ(deviceTimestamp, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS);
    EXPECT_EQ(deviceTimestamp, hostTimestamp);
}
|
||||
|
||||
// With EnableDeviceBasedTimestamps set, the device timestamp is expected to equal the mock's
// constant GPU timestamp and to differ from the host timestamp.
TEST(MockOSTime, givenDeviceTimestampBaseEnabledWhenGetDeviceAndHostTimerThenGpuTimestampIsReturned) {
    DebugManagerStateRestore flagsGuard;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    std::unique_ptr<MockDevice> device{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)};
    device->setOSTime(new MockOSTimeWithConstTimestamp());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_EQ(deviceTimestamp, MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP);
    EXPECT_NE(deviceTimestamp, hostTimestamp);
}
|
||||
|
||||
class FailingMockOSTime : public OSTime {
|
||||
public:
|
||||
FailingMockOSTime() {
|
||||
this->deviceTime = std::make_unique<MockDeviceTime>();
|
||||
}
|
||||
|
||||
bool getCpuTime(uint64_t *timeStamp) override {
|
||||
return false;
|
||||
}
|
||||
|
||||
double getHostTimerResolution() const override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t getCpuRawTimestamp() override {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// When the OS time source fails, getDeviceAndHostTimer must return false and leave both
// output timestamps at their initial zero values.
TEST(MockOSTime, givenFailingOSTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) {
    std::unique_ptr<MockDevice> device{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)};
    device->setOSTime(new FailingMockOSTime());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    const bool succeeded = device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_FALSE(succeeded);
    EXPECT_EQ(deviceTimestamp, 0u);
    EXPECT_EQ(hostTimestamp, 0u);
}
|
||||
|
||||
class FailingMockDeviceTime : public DeviceTime {
|
||||
public:
|
||||
bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override {
|
||||
return false;
|
||||
}
|
||||
|
||||
double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
|
||||
return static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo));
|
||||
}
|
||||
};
|
||||
|
||||
class MockOSTimeWithFailingDeviceTime : public OSTime {
|
||||
public:
|
||||
MockOSTimeWithFailingDeviceTime() {
|
||||
this->deviceTime = std::make_unique<FailingMockDeviceTime>();
|
||||
}
|
||||
|
||||
bool getCpuTime(uint64_t *timeStamp) override {
|
||||
return true;
|
||||
}
|
||||
|
||||
double getHostTimerResolution() const override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t getCpuRawTimestamp() override {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// When the device time source fails, getDeviceAndHostTimer must return false and leave the
// device timestamp at its initial zero value.
TEST(MockOSTime, givenFailingDeviceTimeWhenGetDeviceAndHostTimerThenFalseIsReturned) {
    std::unique_ptr<MockDevice> device{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)};
    device->setOSTime(new MockOSTimeWithFailingDeviceTime());

    uint64_t deviceTimestamp = 0u;
    uint64_t hostTimestamp = 0u;
    const bool succeeded = device->getDeviceAndHostTimer(&deviceTimestamp, &hostTimestamp);

    EXPECT_FALSE(succeeded);
    EXPECT_EQ(deviceTimestamp, 0u);
}
|
||||
|
||||
} // namespace ULT
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "opencl/test/unit_test/mocks/mock_kernel.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_mdi.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_memory_manager.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_ostime.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_platform.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_program.h"
|
||||
#include "opencl/test/unit_test/os_interface/mock_performance_counters.h"
|
||||
@@ -700,6 +701,90 @@ TEST_F(InternalsEventTest, GivenProfilingWhenUserEventCreatedThenProfilingNotSet
|
||||
EXPECT_FALSE(event.get()->isProfilingEnabled());
|
||||
}
|
||||
|
||||
// With EnableDeviceBasedTimestamps untouched, the submit time reported for a marker event is
// expected to equal the constant CPU time supplied by the mock OS time.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenGetEventProfilingInfoThenCpuTimestampIsReturned) {
    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp());

    const cl_queue_properties queueProperties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue profilingQueue(mockContext, pClDevice, queueProperties, false);

    MockEvent<Event> markerEvent(&profilingQueue, CL_COMMAND_MARKER, 0, 0);
    markerEvent.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(profilingQueue)));
    markerEvent.submitCommand(false);

    uint64_t reportedSubmitTime = 0ULL;
    markerEvent.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &reportedSubmitTime, nullptr);

    EXPECT_EQ(reportedSubmitTime, MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS);
}
|
||||
|
||||
// With EnableDeviceBasedTimestamps set, the submit time reported for a marker event is expected
// to equal the mock's constant GPU timestamp scaled by the device profiling timer resolution.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenGetEventProfilingInfoThenGpuTimestampIsReturned) {
    DebugManagerStateRestore flagsGuard;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    pClDevice->setOSTime(new MockOSTimeWithConstTimestamp());

    const cl_queue_properties queueProperties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue profilingQueue(mockContext, pClDevice, queueProperties, false);

    MockEvent<Event> markerEvent(&profilingQueue, CL_COMMAND_MARKER, 0, 0);
    markerEvent.setCommand(std::unique_ptr<Command>(new CommandWithoutKernel(profilingQueue)));
    markerEvent.submitCommand(false);

    uint64_t reportedSubmitTime = 0ULL;
    markerEvent.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &reportedSubmitTime, nullptr);

    auto timerResolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
    EXPECT_EQ(reportedSubmitTime, static_cast<uint64_t>(MockDeviceTimeWithConstTimestamp::GPU_TIMESTAMP * timerResolution));
}
|
||||
|
||||
// With EnableDeviceBasedTimestamps set and a global start timestamp that is not behind the
// queue GPU timestamp, the reported start time is expected to be GlobalStartTS scaled by the
// device profiling timer resolution.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledWhenCalculateStartTimestampThenCorrectTimeIsReturned) {
    DebugManagerStateRestore dbgRestorer;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);

    HwTimeStamps timestamp{};
    timestamp.GlobalStartTS = 2;
    event.queueTimeStamp.GPUTimeStamp = 1;
    TagNode<HwTimeStamps> timestampNode{};
    timestampNode.tagForCpuAccess = &timestamp;
    event.timeStampNode = &timestampNode;

    // Initialized so a failed query cannot leave the comparison reading an indeterminate value.
    uint64_t start = 0u;
    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);

    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
    EXPECT_EQ(start, static_cast<uint64_t>(timestamp.GlobalStartTS * resolution));

    // The node lives on this stack frame; detach it before the event goes out of scope.
    event.timeStampNode = nullptr;
}
|
||||
|
||||
// With EnableDeviceBasedTimestamps set and a global start timestamp that is behind the queue
// GPU timestamp, the reported start time is expected to be advanced by one full period of the
// global timestamp counter (2^getGlobalTimeStampBits ticks), scaled by the timer resolution.
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWhenCalculateStartTimestampThenCorrectTimeIsReturned) {
    DebugManagerStateRestore dbgRestorer;
    DebugManager.flags.EnableDeviceBasedTimestamps.set(true);

    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);

    HwTimeStamps timestamp{};
    timestamp.GlobalStartTS = 1;
    event.queueTimeStamp.GPUTimeStamp = 2;
    TagNode<HwTimeStamps> timestampNode{};
    timestampNode.tagForCpuAccess = &timestamp;
    event.timeStampNode = &timestampNode;

    uint64_t start = 0u;
    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);

    auto &hwHelper = HwHelper::get(pClDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
    auto refStartTime = static_cast<uint64_t>(timestamp.GlobalStartTS * resolution + (1ULL << hwHelper.getGlobalTimeStampBits()) * resolution);
    EXPECT_EQ(start, refStartTime);

    // The node lives on this stack frame; detach it before the event goes out of scope.
    event.timeStampNode = nullptr;
}
|
||||
|
||||
TEST_F(InternalsEventTest, GivenProfilingWHENMapOperationTHENTimesSet) {
|
||||
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
|
||||
MockCommandQueue *pCmdQ = new MockCommandQueue(mockContext, pClDevice, props, false);
|
||||
|
||||
@@ -47,4 +47,44 @@ class MockOSTime : public OSTime {
|
||||
return std::unique_ptr<OSTime>(new MockOSTime());
|
||||
}
|
||||
};
|
||||
|
||||
class MockDeviceTimeWithConstTimestamp : public DeviceTime {
|
||||
public:
|
||||
static constexpr uint64_t CPU_TIME_IN_NS = 1u;
|
||||
static constexpr uint64_t GPU_TIMESTAMP = 2u;
|
||||
|
||||
bool getCpuGpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) override {
|
||||
pGpuCpuTime->GPUTimeStamp = GPU_TIMESTAMP;
|
||||
pGpuCpuTime->CPUTimeinNS = CPU_TIME_IN_NS;
|
||||
return true;
|
||||
}
|
||||
|
||||
double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override {
|
||||
return static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo));
|
||||
}
|
||||
};
|
||||
|
||||
class MockOSTimeWithConstTimestamp : public OSTime {
|
||||
public:
|
||||
MockOSTimeWithConstTimestamp() {
|
||||
this->deviceTime = std::make_unique<MockDeviceTimeWithConstTimestamp>();
|
||||
}
|
||||
|
||||
bool getCpuTime(uint64_t *timeStamp) override {
|
||||
*timeStamp = MockDeviceTimeWithConstTimestamp::CPU_TIME_IN_NS;
|
||||
return true;
|
||||
}
|
||||
|
||||
double getHostTimerResolution() const override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t getCpuRawTimestamp() override {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
@@ -181,7 +181,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWa
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampDoesntHaveGPUTime) {
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNonBlockedEnqueueIsExecutedThenSubmittedTimestampHasGPUTime) {
|
||||
MockKernel kernel(program.get(), kernelInfo, *pClDevice);
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
@@ -203,8 +203,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNo
|
||||
auto mockEvent = static_cast<MockEvent<Event> *>(event);
|
||||
EXPECT_NE(0u, mockEvent->queueTimeStamp.GPUTimeStamp);
|
||||
EXPECT_NE(0u, mockEvent->queueTimeStamp.CPUTimeinNS);
|
||||
EXPECT_LT(mockEvent->queueTimeStamp.GPUTimeStamp, mockEvent->submitTimeStamp.GPUTimeStamp);
|
||||
EXPECT_LT(mockEvent->queueTimeStamp.CPUTimeinNS, mockEvent->submitTimeStamp.CPUTimeinNS);
|
||||
EXPECT_EQ(0u, mockEvent->submitTimeStamp.GPUTimeStamp);
|
||||
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
@@ -455,6 +455,71 @@ HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenNonBlockedEnqueueThenSetGpuPath)
|
||||
eventObj->release();
|
||||
}
|
||||
|
||||
// A marker enqueued behind an unsignalled user event must not use the CPU profiling path, and
// once unblocked its queued time must be non-zero and earlier than its submit time.
HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenSetGpuPath) {
    cl_event blockingEvent = new UserEvent();
    cl_event markerEvent = nullptr;
    pCmdQ->enqueueMarkerWithWaitList(1, &blockingEvent, &markerEvent);

    auto *markerEventObj = static_cast<Event *>(markerEvent);
    EXPECT_FALSE(markerEventObj->isCPUProfilingPath());

    auto *blockingEventObj = static_cast<UserEvent *>(blockingEvent);

    // Completing the user event unblocks the marker; then wait for the marker itself.
    pCmdQ->flush();
    blockingEventObj->setStatus(CL_COMPLETE);
    Event::waitForEvents(1, &markerEvent);

    uint64_t queuedTime = 0u;
    uint64_t submitTime = 0u;

    cl_int status = markerEventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(uint64_t), &queuedTime, nullptr);
    EXPECT_EQ(CL_SUCCESS, status);
    status = markerEventObj->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(uint64_t), &submitTime, nullptr);
    EXPECT_EQ(CL_SUCCESS, status);

    EXPECT_LT(0u, queuedTime);
    EXPECT_LT(queuedTime, submitTime);

    markerEventObj->release();
    blockingEventObj->release();
}
|
||||
|
||||
// A marker enqueued behind an unsignalled user event must not use the CPU profiling path, and
// once unblocked the parsed command stream must contain PIPE_CONTROL commands and the event's
// profiling data must be calculable.
HWTEST_F(ProfilingTests, givenMarkerEnqueueWhenBlockedEnqueueThenPipeControlsArePresentInCS) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    cl_event blockingEvent = new UserEvent();
    cl_event markerEvent = nullptr;
    auto *hwQueue = static_cast<CommandQueueHw<FamilyType> *>(pCmdQ);
    hwQueue->enqueueMarkerWithWaitList(1, &blockingEvent, &markerEvent);

    auto *markerEventObj = static_cast<Event *>(markerEvent);
    EXPECT_FALSE(markerEventObj->isCPUProfilingPath());

    auto *blockingEventObj = static_cast<UserEvent *>(blockingEvent);

    // Completing the user event unblocks the marker; then wait for the marker itself.
    pCmdQ->flush();
    blockingEventObj->setStatus(CL_COMPLETE);
    Event::waitForEvents(1, &markerEvent);

    parseCommands<FamilyType>(*pCmdQ);

    // Check PIPE_CONTROLs
    auto firstPipeControlIt = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), firstPipeControlIt);
    auto *firstPipeControl = genCmdCast<PIPE_CONTROL *>(*firstPipeControlIt);
    ASSERT_NE(nullptr, firstPipeControl);

    // NOTE(review): this search starts at the first PIPE_CONTROL iterator itself; if find()
    // includes its start position it returns the same command again, so a second, distinct
    // PIPE_CONTROL would not actually be verified - confirm against the helper.
    auto secondPipeControlIt = find<PIPE_CONTROL *>(firstPipeControlIt, cmdList.end());
    ASSERT_NE(cmdList.end(), secondPipeControlIt);
    auto *secondPipeControl = genCmdCast<PIPE_CONTROL *>(*secondPipeControlIt);
    ASSERT_NE(nullptr, secondPipeControl);

    EXPECT_TRUE(static_cast<MockEvent<Event> *>(markerEvent)->calcProfilingData());

    markerEventObj->release();
    blockingEventObj->release();
    // Return value intentionally unused, as in the original test.
    pCmdQ->isQueueBlocked();
}
|
||||
|
||||
template <typename TagType>
|
||||
struct MockTagNode : public TagNode<TagType> {
|
||||
public:
|
||||
|
||||
@@ -176,6 +176,7 @@ EnableTimestampPacket = -1
|
||||
AllocateSharedAllocationsWithCpuAndGpuStorage = -1
|
||||
UseMaxSimdSizeToDeduceMaxWorkgroupSize = 0
|
||||
ReturnRawGpuTimestamps = 0
|
||||
EnableDeviceBasedTimestamps = 0
|
||||
ForcePerDssBackedBufferProgramming = 0
|
||||
MaxHwThreadsPercent = 0
|
||||
MinHwThreadsUnoccupied = 0
|
||||
|
||||
@@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UsmInitialPlacement, -1, "-1: default, 0: optimi
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.")
|
||||
DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
|
||||
DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableDeviceBasedTimestamps, false, "Driver returns timestamps in nanoseconds based on device timer.")
|
||||
DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")
|
||||
DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)")
|
||||
DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.")
|
||||
|
||||
@@ -466,14 +466,18 @@ EngineControl &Device::getEngine(uint32_t index) {
|
||||
}
|
||||
|
||||
bool Device::getDeviceAndHostTimer(uint64_t *deviceTimestamp, uint64_t *hostTimestamp) const {
|
||||
TimeStampData queueTimeStamp;
|
||||
bool retVal = getOSTime()->getCpuGpuTime(&queueTimeStamp);
|
||||
bool retVal = getOSTime()->getCpuTime(hostTimestamp);
|
||||
if (retVal) {
|
||||
uint64_t resolution = (uint64_t)getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
|
||||
*deviceTimestamp = queueTimeStamp.GPUTimeStamp * resolution;
|
||||
TimeStampData timeStamp;
|
||||
retVal = getOSTime()->getCpuGpuTime(&timeStamp);
|
||||
if (retVal) {
|
||||
if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) {
|
||||
auto resolution = getOSTime()->getDynamicDeviceTimerResolution(getHardwareInfo());
|
||||
*deviceTimestamp = static_cast<uint64_t>(timeStamp.GPUTimeStamp * resolution);
|
||||
} else
|
||||
*deviceTimestamp = *hostTimestamp;
|
||||
}
|
||||
}
|
||||
|
||||
retVal = getOSTime()->getCpuTime(hostTimestamp);
|
||||
return retVal;
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ class OSInterface;
|
||||
struct HardwareInfo;
|
||||
|
||||
// Paired GPU/CPU timestamp sample.
struct TimeStampData {
    uint64_t GPUTimeStamp; // GPU time in counter ticks
    uint64_t CPUTimeinNS;  // CPU time in ns
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user