feature: support metrics runtime enabling

Related-To: NEO-14287

Signed-off-by: Joshua Santosh Ranjan <joshua.santosh.ranjan@intel.com>
This commit is contained in:
Joshua Santosh Ranjan 2025-03-10 14:21:34 +00:00 committed by Compute-Runtime-Automation
parent 4292f7e5e6
commit 06b34da853
12 changed files with 236 additions and 18 deletions

View File

@ -10,6 +10,14 @@
namespace L0 {
// L0-namespace implementation of the experimental runtime metrics-enable call.
// Forwards hDevice to L0::metricsEnable and returns its result unchanged.
ze_result_t ZE_APICALL zetIntelDeviceEnableMetricsExp(zet_device_handle_t hDevice) {
return L0::metricsEnable(hDevice);
}
// L0-namespace implementation of the experimental runtime metrics-disable call.
// hDevice is not inspected: the function unconditionally returns
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t ZE_APICALL zetIntelDeviceDisableMetricsExp(zet_device_handle_t hDevice) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
ze_result_t ZE_APICALL zetIntelCommandListAppendMarkerExp(zet_command_list_handle_t hCommandList,
zet_metric_group_handle_t hMetricGroup,
uint32_t value) {
@ -85,6 +93,14 @@ ze_result_t zetIntelMetricCalculateOperationDestroyExp(zet_intel_metric_calculat
extern "C" {
// C-linkage export of the runtime metrics-enable extension; forwards to the
// L0-namespace function of the same name and returns its result.
ze_result_t ZE_APICALL zetIntelDeviceEnableMetricsExp(zet_device_handle_t hDevice) {
return L0::zetIntelDeviceEnableMetricsExp(hDevice);
}
// C-linkage export of the runtime metrics-disable extension; forwards to the
// L0-namespace function of the same name and returns its result.
ze_result_t ZE_APICALL zetIntelDeviceDisableMetricsExp(zet_device_handle_t hDevice) {
return L0::zetIntelDeviceDisableMetricsExp(hDevice);
}
ze_result_t ZE_APICALL
zetIntelCommandListAppendMarkerExp(
zet_command_list_handle_t hCommandList,

View File

@ -59,6 +59,8 @@ void *ExtensionFunctionAddressHelper::getExtensionFunctionAddress(const std::str
RETURN_FUNC_PTR_IF_EXIST(zexIntelReleaseNetworkInterrupt);
RETURN_FUNC_PTR_IF_EXIST(zetIntelCommandListAppendMarkerExp);
RETURN_FUNC_PTR_IF_EXIST(zetIntelDeviceEnableMetricsExp);
RETURN_FUNC_PTR_IF_EXIST(zetIntelDeviceDisableMetricsExp);
RETURN_FUNC_PTR_IF_EXIST(zetIntelMetricTracerCreateExp);
RETURN_FUNC_PTR_IF_EXIST(zetIntelMetricTracerDestroyExp);
RETURN_FUNC_PTR_IF_EXIST(zetIntelMetricTracerEnableExp);

View File

@ -385,6 +385,34 @@ zetIntelMetricCalculateMultipleValuesExp(
zet_intel_metric_result_exp_t *pMetricResults); ///< [in,out][optional][range(0, *pTotalMetricResultsCount)] buffer of calculated
///< metrics results.
#ifndef ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_NAME
/// @brief Extension name for enabling and disabling metrics at runtime
#define ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_NAME "ZET_intel_metrics_runtime_enable_disable"
#endif // ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_NAME
///////////////////////////////////////////////////////////////////////////////
/// @brief Metrics Runtime Enable Disable extension Version(s)
typedef enum _zet_intel_metrics_runtime_enable_disable_exp_version_t {
ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0
ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_VERSION_CURRENT = ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_VERSION_1_0, ///< latest known version
ZET_INTEL_METRICS_RUNTIME_ENABLE_DISABLE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff ///< sentinel, not an extension version; presumably forces a 32-bit enum representation (TODO confirm)
} zet_intel_metrics_runtime_enable_disable_exp_version_t;
///////////////////////////////////////////////////////////////////////////////
/// @brief This API Enables Metric collection for a device if not already enabled.
/// If device is a root-device, then its sub-devices are also enabled.
/// This API can be used as runtime alternative to setting ZET_ENABLE_METRICS=1.
ze_result_t ZE_APICALL zetIntelDeviceEnableMetricsExp(zet_device_handle_t hDevice);
///////////////////////////////////////////////////////////////////////////////
/// @brief This API Disables Metric collection for a device, if it was previously enabled.
/// If device is a root-device, then its sub-devices are also disabled.
/// The application has to ensure that all metric operations are complete and
/// all metric resources are released before this API is called.
/// If there are metric operations in progress, then ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
/// is returned.
ze_result_t ZE_APICALL zetIntelDeviceDisableMetricsExp(zet_device_handle_t hDevice);
#if defined(__cplusplus)
} // extern "C"
#endif

View File

@ -58,11 +58,17 @@ MetricDeviceContext::MetricDeviceContext(Device &inputDevice) : device(inputDevi
// Enables the registered metric sources and reports whether at least one of
// them is available.
//
// The whole call runs under enableMetricsMutex. MetricSource::enable() is
// invoked only while isEnableChecked is still false (i.e. on the first call);
// every call, first or not, re-queries isAvailable() on each source.
//
// @return true if any source reports isAvailable(), false otherwise
//         (including when no sources are registered).
bool MetricDeviceContext::enable() {
    std::lock_guard<std::mutex> lock(enableMetricsMutex);

    bool anySourceAvailable = false;
    for (auto const &sourceEntry : metricSources) {
        auto const &source = sourceEntry.second;

        // Skip the enable step once a previous call has already performed it.
        if (!isEnableChecked) {
            source->enable();
        }

        if (source->isAvailable()) {
            anySourceAvailable = true;
        }
    }

    isEnableChecked = true;
    return anySourceAvailable;
}
@ -178,6 +184,18 @@ Device &MetricDeviceContext::getDevice() const {
return device;
}
// Enables metrics on the device behind hDevice and on each of its sub-devices.
//
// @param hDevice  handle of the device to enable; resolved through
//                 L0::Device::fromHandle and used as a DeviceImp.
// @param isFailed in/out flag. It is set to true when any metric context
//                 returns false from enable(); it is never cleared here, so a
//                 failure recorded by the caller beforehand is preserved.
//
// Every context is attempted even if an earlier one failed.
void MetricDeviceContext::enableMetricApiForDevice(zet_device_handle_t hDevice, bool &isFailed) {
    auto targetDevice = static_cast<DeviceImp *>(L0::Device::fromHandle(hDevice));

    // The device itself goes first.
    if (!targetDevice->metricContext->enable()) {
        isFailed = true;
    }

    // Then each of its sub-devices, when there are any.
    for (uint32_t subDeviceIdx = 0; subDeviceIdx < targetDevice->numSubDevices; ++subDeviceIdx) {
        auto &subDeviceContext = targetDevice->subDevices[subDeviceIdx]->getMetricDeviceContext();
        if (!subDeviceContext.enable()) {
            isFailed = true;
        }
    }
}
ze_result_t MetricDeviceContext::enableMetricApi() {
bool failed = false;
@ -194,14 +212,7 @@ ze_result_t MetricDeviceContext::enableMetricApi() {
driverHandle->getDevice(&rootDeviceCount, rootDevices.data());
for (auto rootDeviceHandle : rootDevices) {
auto rootDevice = static_cast<DeviceImp *>(L0::Device::fromHandle(rootDeviceHandle));
// Initialize root device.
failed |= !rootDevice->metricContext->enable();
// Initialize sub devices.
for (uint32_t i = 0; i < rootDevice->numSubDevices; ++i) {
failed |= !rootDevice->subDevices[i]->getMetricDeviceContext().enable();
}
enableMetricApiForDevice(rootDeviceHandle, failed);
}
if (failed) {
break;
@ -870,4 +881,11 @@ ze_result_t metricCalculateOperationDestroy(
return MetricCalcOp::fromHandle(hCalculateOperation)->destroy();
}
// Enables metrics for a single device (and its sub-devices) at runtime.
//
// @param hDevice handle of the device to enable.
// @return ZE_RESULT_SUCCESS when MetricDeviceContext::enableMetricApiForDevice
//         reports no failure, ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE otherwise.
ze_result_t metricsEnable(zet_device_handle_t hDevice) {
    bool enableFailed = false;
    MetricDeviceContext::enableMetricApiForDevice(hDevice, enableFailed);

    if (enableFailed) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }
    return ZE_RESULT_SUCCESS;
}
} // namespace L0

View File

@ -145,6 +145,7 @@ class MetricDeviceContext {
static std::unique_ptr<MetricDeviceContext> create(Device &device);
static ze_result_t enableMetricApi();
static void enableMetricApiForDevice(zet_device_handle_t hDevice, bool &isFailed);
ze_result_t getConcurrentMetricGroups(uint32_t metricGroupCount, zet_metric_group_handle_t *phMetricGroups,
uint32_t *pConcurrentGroupCount, uint32_t *pCountPerConcurrentGroup);
@ -173,6 +174,8 @@ class MetricDeviceContext {
struct Device &device;
bool multiDeviceCapable = false;
uint32_t subDeviceIndex = 0;
bool isEnableChecked = false;
std::mutex enableMetricsMutex;
};
struct Metric : _zet_metric_handle_t {
@ -492,4 +495,6 @@ ze_result_t metricCalculateOperationCreate(zet_context_handle_t hContext, zet_de
ze_result_t metricCalculateOperationDestroy(zet_intel_metric_calculate_operation_exp_handle_t hCalculateOperation);
ze_result_t metricsEnable(zet_device_handle_t hDevice);
} // namespace L0

View File

@ -798,6 +798,76 @@ bool metricGetTimestampTest() {
return status;
}
///////////////
/// runtimeEnableTest
///////////////
// Exercises runtime enabling of metrics through zetIntelDeviceEnableMetricsExp.
//
// When the configured device id is -1 the check is run for every sub-device of
// every device and then for the device itself (sub-device id -1); otherwise it
// is run once for the configured device / sub-device pair.
//
// Returns true only if every individual run returned true.
//
// NOTE(review): VALIDATECALL and EXPECT are macros defined outside this chunk;
// how they react to a failure (log, abort, return) is not visible here.
bool runtimeEnableTest() {
// This test verifies run time initialization of a specific device
bool status = true;
zmu::TestMachineConfiguration machineConfig = {};
zmu::getTestMachineConfiguration(machineConfig);
auto testSettings = zmu::TestSettings::get();
// Single run for one (deviceId, subDeviceId) pair. Captures nothing;
// metricGroupName is used only in the log line below.
auto runtimeEnableTestRun = [](uint32_t deviceId, int32_t subDeviceId, std::string &metricGroupName) {
LOG(zmu::LogLevel::INFO) << "Running Runtime Init Test : Device [" << deviceId << ", " << subDeviceId << " ] : Metric Group :" << metricGroupName.c_str() << "\n";
if (!zmu::isDeviceAvailable(deviceId, subDeviceId)) {
return false;
}
std::unique_ptr<SingleDeviceSingleQueueExecutionCtxt> executionCtxt =
std::make_unique<SingleDeviceSingleQueueExecutionCtxt>(deviceId, subDeviceId);
// Count the metric groups reported before the runtime enable call.
uint32_t metricGroupCountBeforeEnable = 0;
VALIDATECALL(zetMetricGroupGet(executionCtxt->getDeviceHandle(0), &metricGroupCountBeforeEnable, nullptr));
// The run is abandoned (reported as failed) when ZET_ENABLE_METRICS is
// reported as set by zmu::isEnvVariableSet.
if (zmu::isEnvVariableSet("ZET_ENABLE_METRICS")) {
LOG(zmu::LogLevel::INFO) << "Unset ZET_ENABLE_METRICS and run this test ! \n";
return false;
}
// Before enabling, no metric groups are expected.
EXPECT(metricGroupCountBeforeEnable == 0u);
LOG(zmu::LogLevel::INFO) << "MetricGroup Count Before Runtime Enabling: " << metricGroupCountBeforeEnable << "\n";
// Resolve the extension entry point by name through the driver handle.
typedef ze_result_t (*pfzetIntelDeviceEnableMetricsExp)(zet_device_handle_t hDevice);
pfzetIntelDeviceEnableMetricsExp zetIntelDeviceEnableMetricsExp = nullptr;
ze_result_t result = zeDriverGetExtensionFunctionAddress(executionCtxt->getDriverHandle(0), "zetIntelDeviceEnableMetricsExp", reinterpret_cast<void **>(&zetIntelDeviceEnableMetricsExp));
VALIDATECALL(result);
// NOTE: this 'status' is a ze_result_t local to the lambda; it is unrelated
// to the bool 'status' of the enclosing function.
auto status = zetIntelDeviceEnableMetricsExp(executionCtxt->getDeviceHandle(0));
VALIDATECALL(status);
// After enabling, at least one metric group is expected.
uint32_t metricGroupCountAfterEnable = 0;
status = zetMetricGroupGet(executionCtxt->getDeviceHandle(0), &metricGroupCountAfterEnable, nullptr);
EXPECT(status == ZE_RESULT_SUCCESS);
LOG(zmu::LogLevel::INFO) << "MetricGroup Count After Runtime Enabling: " << metricGroupCountAfterEnable << "\n";
EXPECT(metricGroupCountAfterEnable > 0u);
return true;
};
// A device id of -1 in the test settings selects the "all devices" path.
if (testSettings->deviceId.get() == -1) {
for (uint32_t deviceId = 0; deviceId < machineConfig.deviceCount; deviceId++) {
// Run for all subdevices
for (uint32_t subDeviceId = 0; subDeviceId < machineConfig.devices[deviceId].subDeviceCount; subDeviceId++) {
status &= runtimeEnableTestRun(deviceId, subDeviceId, testSettings->metricGroupName.get());
}
// Run for root device
status &= runtimeEnableTestRun(deviceId, -1, testSettings->metricGroupName.get());
}
} else {
// Run for specific device
status &= runtimeEnableTestRun(testSettings->deviceId.get(), testSettings->subDeviceId.get(), testSettings->metricGroupName.get());
}
return status;
}
ZELLO_METRICS_ADD_TEST(runtimeEnableTest)
ZELLO_METRICS_ADD_TEST(queryTest)
ZELLO_METRICS_ADD_TEST(streamTest)
ZELLO_METRICS_ADD_TEST(streamMultiMetricDomainTest)

View File

@ -499,6 +499,13 @@ void TestSettings::readMetricNames(char *optArg) {
}
}
// Reports whether the environment variable `name` is switched on.
//
// @param name name of the environment variable to look up.
// @return true only when the variable exists and its value is exactly "1";
//         false when it is unset or holds any other value (including "0").
bool isEnvVariableSet(const char *name) {
    const char *value = getenv(name);
    return (value != nullptr) && (strcmp(value, "1") == 0);
}
////////////////
// Test Settings
////////////////

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2024 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -101,5 +101,6 @@ void showMetricsExportData(uint8_t *pExportData, size_t exportDataSize);
bool osStreamMpCollectionWorkloadDifferentProcess();
bool osStreamMpCollectionWorkloadSameProcess();
int32_t osRunAllTests(int32_t runStatus);
bool isEnvVariableSet(const char *name);
} // namespace ZelloMetricsUtility

View File

@ -57,11 +57,11 @@ class MockMetricIpSamplingSource : public IpSamplingMetricSourceImp {
}
};
class MockMetricDeviceContext : public MetricDeviceContext {
class MockMetricDeviceContextIpSampling : public MetricDeviceContext {
public:
MockMetricDeviceContext(Device &device) : MetricDeviceContext(device) {}
MockMetricDeviceContextIpSampling(Device &device) : MetricDeviceContext(device) {}
void setMetricTraceSource(MockMetricIpSamplingSource *metricSource) {
void setMetricIpSamplingSource(MockMetricIpSamplingSource *metricSource) {
metricSources[MetricSource::metricSourceTypeIpSampling] = std::unique_ptr<MockMetricIpSamplingSource>(metricSource);
}
};

View File

@ -15,8 +15,9 @@ namespace ult {
class MockMetricSource : public L0::MetricSource {
public:
uint32_t enableCallCount = 0;
bool isAvailableReturn = false;
void enable() override {}
void enable() override { enableCallCount++; }
bool isAvailable() override { return isAvailableReturn; }
ze_result_t appendMetricMemoryBarrier(L0::CommandList &commandList) override { return ZE_RESULT_ERROR_UNKNOWN; }
ze_result_t metricGroupGet(uint32_t *pCount, zet_metric_group_handle_t *phMetricGroups) override { return ZE_RESULT_ERROR_UNKNOWN; }
@ -169,5 +170,18 @@ class MockMetricCalcOp : public MetricCalcOpImp {
zet_intel_metric_result_exp_t *pMetricResults) override { return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; };
};
class MockMetricDeviceContext : public MetricDeviceContext {
public:
MockMetricDeviceContext(Device &device) : MetricDeviceContext(device) {}
void clearAllSources() {
metricSources.clear();
}
void setMockMetricSource(MockMetricSource *metricSource) {
metricSources[MetricSource::metricSourceTypeOa] = std::unique_ptr<MockMetricSource>(metricSource);
}
};
} // namespace ult
} // namespace L0

View File

@ -5,11 +5,13 @@
*
*/
#include "shared/test/common/test_macros/test.h"
#include "shared/test/common/test_macros/test_base.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/include/level_zero/zet_intel_gpu_metric.h"
#include "level_zero/tools/test/unit_tests/sources/metrics/mock_metric_source.h"
#include <level_zero/zet_api.h>
#include "level_zero/zet_api.h"
#include "gtest/gtest.h"
@ -230,5 +232,60 @@ TEST_F(CalcOperationFixture, WhenCreatingCalcOpObjectToAndFromHandleBaseClassWor
EXPECT_NE(nullptr, mockCalcOp);
}
using MetricRuntimeFixture = Test<DeviceFixture>;
// Runtime enable reports success when the device's only metric source says it
// is available.
TEST_F(MetricRuntimeFixture, WhenRunTimeEnableIsDoneThenReturnSuccess) {
    auto deviceImp = static_cast<DeviceImp *>(device);

    // One mock source that reports itself as available.
    auto metricSource = new MockMetricSource();
    metricSource->isAvailableReturn = true;

    // A context holding only that source, handed over to the device.
    auto mockDeviceContext = new MockMetricDeviceContext(*device);
    mockDeviceContext->clearAllSources();
    mockDeviceContext->setMockMetricSource(metricSource);
    deviceImp->metricContext.reset(mockDeviceContext);

    EXPECT_EQ(ZE_RESULT_SUCCESS, zetIntelDeviceEnableMetricsExp(device->toHandle()));

    // Release the mock context before the fixture tears the device down.
    deviceImp->metricContext.reset();
}
// Calling the runtime enable API twice must invoke MetricSource::enable() only
// on the first call; both calls still report success.
TEST_F(MetricRuntimeFixture, WhenRunTimeEnableIsDoneMultipleTimesThenEnableIsDoneOnlyOnce) {
    auto deviceImp = static_cast<DeviceImp *>(device);

    // One mock source that reports itself as available and counts enable() calls.
    auto metricSource = new MockMetricSource();
    metricSource->isAvailableReturn = true;

    // A context holding only that source, handed over to the device.
    auto mockDeviceContext = new MockMetricDeviceContext(*device);
    mockDeviceContext->clearAllSources();
    mockDeviceContext->setMockMetricSource(metricSource);
    deviceImp->metricContext.reset(mockDeviceContext);

    // First call enables the source.
    EXPECT_EQ(ZE_RESULT_SUCCESS, zetIntelDeviceEnableMetricsExp(device->toHandle()));
    EXPECT_EQ(metricSource->enableCallCount, 1u);

    // Second call must leave the enable counter untouched.
    EXPECT_EQ(ZE_RESULT_SUCCESS, zetIntelDeviceEnableMetricsExp(device->toHandle()));
    EXPECT_EQ(metricSource->enableCallCount, 1u);

    // Release the mock context (and with it metricSource) before teardown.
    deviceImp->metricContext.reset();
}
// Runtime enable reports ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE when the only
// metric source is unavailable; the source is still asked to enable once.
TEST_F(MetricRuntimeFixture, WhenRunTimeEnableIsDoneAndNoSourcesAreAvailableThenReturnError) {
    auto deviceImp = static_cast<DeviceImp *>(device);

    // One mock source that reports itself as unavailable.
    auto metricSource = new MockMetricSource();
    metricSource->isAvailableReturn = false;

    // A context holding only that source, handed over to the device.
    auto mockDeviceContext = new MockMetricDeviceContext(*device);
    mockDeviceContext->clearAllSources();
    mockDeviceContext->setMockMetricSource(metricSource);
    deviceImp->metricContext.reset(mockDeviceContext);

    EXPECT_EQ(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE, zetIntelDeviceEnableMetricsExp(device->toHandle()));
    EXPECT_EQ(metricSource->enableCallCount, 1u);

    // Release the mock context (and with it metricSource) before teardown.
    deviceImp->metricContext.reset();
}
// The runtime disable API is expected to return
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for a valid device handle.
// (Test renamed: the previous name was copied from the "enable only once"
// test and did not describe what is checked here.)
TEST_F(MetricRuntimeFixture, WhenRunTimeDisableIsDoneThenUnsupportedFeatureIsReturned) {
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zetIntelDeviceDisableMetricsExp(device->toHandle()));
}
} // namespace ult
} // namespace L0

View File

@ -1547,7 +1547,7 @@ class MultiSourceOaMetricProgrammableFixture : public DeviceFixture,
protected:
void SetUp() override;
void TearDown() override;
std::unique_ptr<MockMetricDeviceContext> deviceContext = nullptr;
std::unique_ptr<MockMetricDeviceContextIpSampling> deviceContext = nullptr;
OaMetricSourceImp *oaMetricSource = nullptr;
MetricEnumeration *metricEnumeration = nullptr;
MockIAdapterGroup1x13 mockAdapterGroup{};
@ -1564,7 +1564,7 @@ void MultiSourceOaMetricProgrammableFixture::SetUp() {
mockAdapterGroup.mockParams.Version.MajorNumber = 1;
mockAdapterGroup.mockParams.Version.MinorNumber = 13;
deviceContext = std::make_unique<MockMetricDeviceContext>(*device);
deviceContext = std::make_unique<MockMetricDeviceContextIpSampling>(*device);
oaMetricSource = static_cast<OaMetricSourceImp *>(&deviceContext->getMetricSource<OaMetricSourceImp>());
metricEnumeration = static_cast<MetricEnumeration *>(&oaMetricSource->getMetricEnumeration());
metricEnumeration->setAdapterGroup(&mockAdapterGroup);
@ -1573,7 +1573,7 @@ void MultiSourceOaMetricProgrammableFixture::SetUp() {
metricEnumeration->setInitializationState(ZE_RESULT_SUCCESS);
metricSource = new MockMetricIpSamplingSource(*deviceContext);
deviceContext->setMetricTraceSource(metricSource);
deviceContext->setMetricIpSamplingSource(metricSource);
}
TEST_F(MultiSourceOaMetricProgrammableFixture, givenCreateMetricGroupsFromMetricsIsCalledAndOneMetricSourcesReturnsUnsupportedThenSuccessIsReturned) {