Debug: Add debug logs for temperature module

Related-To: LOCI-3881

Signed-off-by: Devarinti, Puneeth Kumar Reddy <puneeth.kumar.reddy.devarinti@intel.com>
This commit is contained in:
Devarinti, Puneeth Kumar Reddy 2023-02-02 05:07:34 +00:00 committed by Compute-Runtime-Automation
parent 996d73b768
commit 535debff2d
1 changed files with 13 additions and 1 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -7,6 +7,8 @@
#include "level_zero/tools/source/sysman/temperature/linux/os_temperature_imp.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "level_zero/tools/source/sysman/linux/pmt/pmt.h"
#include "sysman/linux/os_sysman_imp.h"
@ -33,6 +35,7 @@ ze_result_t LinuxTemperatureImp::getProperties(zes_temp_properties_t *pPropertie
ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTemperature) {
auto isValidTemperature = [](auto temperature) {
if ((temperature > invalidMaxTemperature) || (temperature < invalidMinTemperature)) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): temperature:%f is not in valid limits \n", __FUNCTION__, temperature);
return false;
}
return true;
@ -60,6 +63,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTem
key = "COMPUTE_TEMPERATURES";
result = pPmt->readValue(key, computeTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
// Check max temperature among IA, GT and LLC sensors across COMPUTE_TEMPERATURES
@ -69,6 +73,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTem
key = "CORE_TEMPERATURES";
result = pPmt->readValue(key, coreTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for CORE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
// Check max temperature among CORE0, CORE1, CORE2, CORE3 sensors across CORE_TEMPERATURES
@ -80,6 +85,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTem
key = "SOC_TEMPERATURES";
result = pPmt->readValue(key, socTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
// Check max temperature among possible sensors like PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC across SOC_TEMPERATURES
@ -100,6 +106,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperature(double *pTemperature) {
std::string key("TileMaxTemperature");
ze_result_t result = pPmt->readValue(key, globalMaxTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for TileMaxTemperature is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
*pTemperature = static_cast<double>(globalMaxTemperature);
@ -113,6 +120,7 @@ ze_result_t LinuxTemperatureImp::getGpuMaxTemperatureNoSubDevice(double *pTemper
std::string key = "SOC_TEMPERATURES";
auto result = pPmt->readValue(key, socTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
gpuMaxTemperature = static_cast<double>(socTemperature & 0xff);
@ -123,6 +131,7 @@ ze_result_t LinuxTemperatureImp::getGpuMaxTemperatureNoSubDevice(double *pTemper
std::string key("COMPUTE_TEMPERATURES");
ze_result_t result = pPmt->readValue(key, computeTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
@ -142,6 +151,7 @@ ze_result_t LinuxTemperatureImp::getGpuMaxTemperature(double *pTemperature) {
std::string key("GTMaxTemperature");
ze_result_t result = pPmt->readValue(key, gpuMaxTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for GTMaxTemperature is returning error:0x%x \n", __FUNCTION__, result);
return result;
}
*pTemperature = static_cast<double>(gpuMaxTemperature);
@ -156,6 +166,7 @@ ze_result_t LinuxTemperatureImp::getMemoryMaxTemperature(double *pTemperature) {
} else if (productFamily == IGFX_PVC) {
numHbmModules = 4u;
} else {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s() returning UNSUPPORTED_FEATURE \n", __FUNCTION__);
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
@ -166,6 +177,7 @@ ze_result_t LinuxTemperatureImp::getMemoryMaxTemperature(double *pTemperature) {
std::string key = "HBM" + std::to_string(hbmModuleIndex) + "MaxDeviceTemperature";
result = pPmt->readValue(key, maxDeviceTemperature);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for %s is returning error:0x%x \n", __FUNCTION__, key.c_str(), result);
return result;
}
maxDeviceTemperatureList.push_back(maxDeviceTemperature);