feature(sysman): supports product helper for temperature module

Related-To: NEO-8720

Signed-off-by: Kulkarni, Ashwin Kumar <ashwin.kumar.kulkarni@intel.com>
This commit is contained in:
Kulkarni, Ashwin Kumar
2023-11-23 14:12:29 +00:00
committed by Compute-Runtime-Automation
parent 985186242e
commit d7aea3e745
13 changed files with 538 additions and 185 deletions

View File

@@ -9,17 +9,12 @@
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "level_zero/sysman/source/shared/linux/product_helper/sysman_product_helper.h"
#include "level_zero/sysman/source/shared/linux/zes_os_sysman_imp.h"
namespace L0 {
namespace Sysman {
// Entry counts passed to the getMaxTemperature helper in getGlobalMaxTemperatureNoSubDevice():
// each telemetry value is scanned as that many consecutive 8-bit fields, lowest byte first.
constexpr uint32_t numSocTemperatureEntries = 7; // entries would be PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC
constexpr uint32_t numCoreTemperatureEntries = 4; // entries would be CORE0, CORE1, CORE2, CORE3
constexpr uint32_t numComputeTemperatureEntries = 3; // entries would be IA, GT and LLC
// Bounds used by the isValidTemperature check: a reading greater than invalidMaxTemperature or
// less than invalidMinTemperature is logged and ignored when searching for the maximum.
// NOTE(review): units are presumably degrees Celsius - confirm.
constexpr uint32_t invalidMaxTemperature = 125;
constexpr uint32_t invalidMinTemperature = 10;
ze_result_t LinuxTemperatureImp::getProperties(zes_temp_properties_t *pProperties) {
pProperties->type = type;
pProperties->onSubdevice = 0;
@@ -31,157 +26,16 @@ ze_result_t LinuxTemperatureImp::getProperties(zes_temp_properties_t *pPropertie
return ZE_RESULT_SUCCESS;
}
// Reads the packed temperature telemetry through pPmt and stores the highest valid
// reading in *pTemperature.
//
// SOC_TEMPERATURES is always read; on IGFX_DG1 the COMPUTE_TEMPERATURES and
// CORE_TEMPERATURES values are read as well. Every telemetry value is treated as a
// sequence of 8-bit entries (lowest byte first); entries outside
// [invalidMinTemperature, invalidMaxTemperature] are ignored. If no entry is valid the
// reported maximum is 0.
//
// @param pTemperature  receives the maximum valid reading, converted to double.
// @return ZE_RESULT_SUCCESS when every required read succeeded, otherwise the error
//         returned by the first failing pPmt->readValue() call (in which case
//         *pTemperature is left untouched).
ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTemperature) {
    auto isValidTemperature = [](auto temperature) {
        if ((temperature > invalidMaxTemperature) || (temperature < invalidMinTemperature)) {
            // The message formats the reading with %f, which expects a double; the reading
            // itself is an unsigned integer, so convert it explicitly before it is passed on.
            NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): temperature:%f is not in valid limits \n", __FUNCTION__, static_cast<double>(temperature));
            return false;
        }
        return true;
    };

    // Returns the largest valid 8-bit entry among the first numTemperatureEntries bytes
    // of the packed value, or 0 when none of them is valid.
    auto getMaxTemperature = [&](auto temperature, auto numTemperatureEntries) {
        uint32_t maxTemperature = 0;
        for (uint32_t count = 0; count < numTemperatureEntries; count++) {
            uint32_t localTemperatureVal = (temperature >> (8 * count)) & 0xff;
            if (isValidTemperature(localTemperatureVal)) {
                maxTemperature = std::max(maxTemperature, localTemperatureVal);
            }
        }
        return maxTemperature;
    };

    ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    uint32_t maxComputeTemperature = 0;
    uint32_t maxCoreTemperature = 0;
    std::string key;

    if (productFamily == IGFX_DG1) {
        uint32_t computeTemperature = 0;
        key = "COMPUTE_TEMPERATURES";
        result = pPmt->readValue(key, computeTemperature);
        if (result != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
            return result;
        }
        // Check max temperature among IA, GT and LLC sensors across COMPUTE_TEMPERATURES
        maxComputeTemperature = getMaxTemperature(computeTemperature, numComputeTemperatureEntries);

        uint32_t coreTemperature = 0;
        key = "CORE_TEMPERATURES";
        result = pPmt->readValue(key, coreTemperature);
        if (result != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for CORE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
            return result;
        }
        // Check max temperature among CORE0, CORE1, CORE2, CORE3 sensors across CORE_TEMPERATURES
        maxCoreTemperature = getMaxTemperature(coreTemperature, numCoreTemperatureEntries);
    }

    // SOC_TEMPERATURES is present in all product families
    uint64_t socTemperature = 0;
    key = "SOC_TEMPERATURES";
    result = pPmt->readValue(key, socTemperature);
    if (result != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
        return result;
    }
    // Check max temperature among possible sensors like PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC across SOC_TEMPERATURES
    uint32_t maxSocTemperature = getMaxTemperature(socTemperature, numSocTemperatureEntries);

    *pTemperature = static_cast<double>(std::max({maxComputeTemperature, maxCoreTemperature, maxSocTemperature}));
    return result;
}
// Stores the global maximum temperature in *pTemperature.
//
// When this handle is not for a subdevice and the product is not IGFX_PVC, the work is
// delegated to getGlobalMaxTemperatureNoSubDevice(). Otherwise the "TileMaxTemperature"
// telemetry key is read through pPmt and its value is reported.
//
// @param pTemperature  receives the temperature on success.
// @return the result of the delegated call, or the status of pPmt->readValue().
ze_result_t LinuxTemperatureImp::getGlobalMaxTemperature(double *pTemperature) {
    // For XE_HP_SDV and PVC single tile devices, telemetry info is retrieved from
    // tile's telem node rather from root device telem node.
    const bool useTileTelemetry = isSubdevice || (productFamily == IGFX_PVC);
    if (!useTileTelemetry) {
        return getGlobalMaxTemperatureNoSubDevice(pTemperature);
    }

    uint32_t tileMaxTemperature = 0;
    std::string telemetryKey = "TileMaxTemperature";
    ze_result_t status = pPmt->readValue(telemetryKey, tileMaxTemperature);
    if (status == ZE_RESULT_SUCCESS) {
        *pTemperature = static_cast<double>(tileMaxTemperature);
        return status;
    }

    // Read failed: log it and hand the error back without touching *pTemperature.
    NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for TileMaxTemperature is returning error:0x%x \n", __FUNCTION__, status);
    return status;
}
// Stores the GPU temperature in *pTemperature using telemetry read through pPmt.
// SOC_TEMPERATURES is always read and its low byte is taken as the GPU temperature;
// on IGFX_DG1 that value is then replaced by bits 8..15 of COMPUTE_TEMPERATURES.
// Returns the readValue() error if either read fails, otherwise ZE_RESULT_SUCCESS.
ze_result_t LinuxTemperatureImp::getGpuMaxTemperatureNoSubDevice(double *pTemperature) {
double gpuMaxTemperature = 0;
uint64_t socTemperature = 0;
// Gpu temperature is obtained from GT_TEMP in SOC_TEMPERATURE's bit 0 to 7.
std::string key = "SOC_TEMPERATURES";
auto result = pPmt->readValue(key, socTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
gpuMaxTemperature = static_cast<double>(socTemperature & 0xff);
if (productFamily == IGFX_DG1) {
// In DG1 platform, Gpu Max Temperature is obtained from COMPUTE_TEMPERATURE only
uint32_t computeTemperature = 0;
// Note: this `key` and the `result` below shadow the outer variables of the same names.
std::string key("COMPUTE_TEMPERATURES");
ze_result_t result = pPmt->readValue(key, computeTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
// GT temperature could be read via 8th to 15th bit in the value read in temperature
computeTemperature = (computeTemperature >> 8) & 0xff;
gpuMaxTemperature = static_cast<double>(computeTemperature);
}
*pTemperature = gpuMaxTemperature;
return ZE_RESULT_SUCCESS;
// NOTE(review): the statement below can never execute because it follows an unconditional
// return. It delegates to a different helper (getGlobalMaxTemperature), so it presumably
// belongs to another revision of this file - confirm against the actual source.
return pSysmanProductHelper->getGlobalMaxTemperature(pPmt, pTemperature);
}
// Stores the GPU maximum temperature in *pTemperature.
// When this handle is not for a subdevice and the product is not IGFX_PVC, the call is
// delegated to getGpuMaxTemperatureNoSubDevice(); otherwise the "GTMaxTemperature"
// telemetry key is read through pPmt. Returns the delegated result or the readValue() status.
ze_result_t LinuxTemperatureImp::getGpuMaxTemperature(double *pTemperature) {
if ((!isSubdevice) && (!(productFamily == IGFX_PVC))) {
return getGpuMaxTemperatureNoSubDevice(pTemperature);
}
uint32_t gpuMaxTemperature = 0;
std::string key("GTMaxTemperature");
ze_result_t result = pPmt->readValue(key, gpuMaxTemperature);
if (result != ZE_RESULT_SUCCESS) {
// On failure the error is logged and returned; *pTemperature is not written.
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for GTMaxTemperature is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
*pTemperature = static_cast<double>(gpuMaxTemperature);
return result;
// NOTE(review): unreachable - this follows an unconditional return. It looks like the
// product-helper based implementation from another revision of the file; confirm which
// of the two bodies is the intended one.
return pSysmanProductHelper->getGpuMaxTemperature(pPmt, pTemperature);
}
// Stores the highest HBM module temperature in *pTemperature.
// Only IGFX_PVC is handled (four HBM modules); any other product returns
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE. Each module's "HBM<n>MaxDeviceTemperature" key is
// read through pPmt; the first failing read aborts the call and its error is returned.
ze_result_t LinuxTemperatureImp::getMemoryMaxTemperature(double *pTemperature) {
ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
uint32_t numHbmModules = 0u;
if (productFamily == IGFX_PVC) {
numHbmModules = 4u;
} else {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s() returning UNSUPPORTED_FEATURE \n", __FUNCTION__);
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// Collect one reading per HBM module, then report the largest.
std::vector<uint32_t> maxDeviceTemperatureList;
for (auto hbmModuleIndex = 0u; hbmModuleIndex < numHbmModules; hbmModuleIndex++) {
uint32_t maxDeviceTemperature = 0;
// To read HBM 0's max device temperature key would be HBM0MaxDeviceTemperature
std::string key = "HBM" + std::to_string(hbmModuleIndex) + "MaxDeviceTemperature";
result = pPmt->readValue(key, maxDeviceTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for %s is returning error:0x%x \n", __FUNCTION__, key.c_str(), result);
return result;
}
maxDeviceTemperatureList.push_back(maxDeviceTemperature);
}
// The list holds numHbmModules (4) entries here, so dereferencing max_element is safe.
*pTemperature = static_cast<double>(*std::max_element(maxDeviceTemperatureList.begin(), maxDeviceTemperatureList.end()));
return result;
// NOTE(review): unreachable - this follows an unconditional return. It looks like the
// product-helper based implementation from another revision of the file; confirm which
// of the two bodies is the intended one.
return pSysmanProductHelper->getMemoryMaxTemperature(pPmt, pTemperature);
}
ze_result_t LinuxTemperatureImp::getSensorTemperature(double *pTemperature) {
@@ -189,21 +43,12 @@ ze_result_t LinuxTemperatureImp::getSensorTemperature(double *pTemperature) {
switch (type) {
case ZES_TEMP_SENSORS_GLOBAL:
result = getGlobalMaxTemperature(pTemperature);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
break;
case ZES_TEMP_SENSORS_GPU:
result = getGpuMaxTemperature(pTemperature);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
break;
case ZES_TEMP_SENSORS_MEMORY:
result = getMemoryMaxTemperature(pTemperature);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
break;
default:
*pTemperature = 0;
@@ -215,13 +60,19 @@ ze_result_t LinuxTemperatureImp::getSensorTemperature(double *pTemperature) {
}
// Reports whether the temperature module is supported for this sensor type.
// NOTE(review): as listed, this body opens four braces but closes only three, and ends
// with two consecutive return statements. It appears to interleave two revisions of the
// function (a productFamily based check and a pSysmanProductHelper based switch) -
// confirm against the actual source before relying on the comments below.
bool LinuxTemperatureImp::isTempModuleSupported() {
// Product-family based check: memory sensors are rejected when this is not a subdevice
// handle and the product is not IGFX_PVC.
if ((!isSubdevice) && (!(productFamily == IGFX_PVC))) {
if (type == ZES_TEMP_SENSORS_MEMORY) {
return false;
}
// Helper based check: support requires a non-null pPmt; memory sensors additionally
// require isMemoryMaxTemperatureSupported(), and unknown sensor types are unsupported.
bool result = (pPmt != nullptr);
switch (type) {
case ZES_TEMP_SENSORS_GLOBAL:
case ZES_TEMP_SENSORS_GPU:
break;
case ZES_TEMP_SENSORS_MEMORY:
result &= pSysmanProductHelper->isMemoryMaxTemperatureSupported();
break;
default:
result = false;
break;
}
return (pPmt != nullptr);
// Not reachable as listed: it follows the unconditional return above.
return result;
}
void LinuxTemperatureImp::setSensorType(zes_temp_sensors_t sensorType) {
@@ -232,7 +83,7 @@ LinuxTemperatureImp::LinuxTemperatureImp(OsSysman *pOsSysman, ze_bool_t onSubdev
uint32_t subdeviceId) : subdeviceId(subdeviceId), isSubdevice(onSubdevice) {
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
pPmt = pLinuxSysmanImp->getPlatformMonitoringTechAccess(subdeviceId);
productFamily = pLinuxSysmanImp->getProductFamily();
pSysmanProductHelper = pLinuxSysmanImp->getSysmanProductHelper();
}
std::unique_ptr<OsTemperature> OsTemperature::create(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId, zes_temp_sensors_t sensorType) {

View File

@@ -20,6 +20,7 @@ namespace Sysman {
class SysfsAccess;
class PlatformMonitoringTech;
class SysmanProductHelper;
class LinuxTemperatureImp : public OsTemperature, NEO::NonCopyableOrMovableClass {
public:
ze_result_t getProperties(zes_temp_properties_t *pProperties) override;
@@ -36,15 +37,11 @@ class LinuxTemperatureImp : public OsTemperature, NEO::NonCopyableOrMovableClass
private:
ze_result_t getGlobalMaxTemperature(double *pTemperature);
ze_result_t getGlobalMinTemperature(double *pTemperature);
ze_result_t getGpuMaxTemperature(double *pTemperature);
ze_result_t getGpuMinTemperature(double *pTemperature);
ze_result_t getMemoryMaxTemperature(double *pTemperature);
ze_result_t getGlobalMaxTemperatureNoSubDevice(double *pTemperature);
ze_result_t getGpuMaxTemperatureNoSubDevice(double *pTemperature);
uint32_t subdeviceId = 0;
ze_bool_t isSubdevice = 0;
PRODUCT_FAMILY productFamily = IGFX_UNKNOWN;
SysmanProductHelper *pSysmanProductHelper = nullptr;
};
} // namespace Sysman