Debug: Add debug logs for temperature module

Related-To: LOCI-3881

Signed-off-by: Devarinti, Puneeth Kumar Reddy <puneeth.kumar.reddy.devarinti@intel.com>
This commit is contained in:
Devarinti, Puneeth Kumar Reddy 2023-02-02 05:07:34 +00:00 committed by Compute-Runtime-Automation
parent 996d73b768
commit 535debff2d
1 changed file with 13 additions and 1 deletion

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2022 Intel Corporation * Copyright (C) 2022-2023 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@ -7,6 +7,8 @@
#include "level_zero/tools/source/sysman/temperature/linux/os_temperature_imp.h" #include "level_zero/tools/source/sysman/temperature/linux/os_temperature_imp.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "level_zero/tools/source/sysman/linux/pmt/pmt.h" #include "level_zero/tools/source/sysman/linux/pmt/pmt.h"
#include "sysman/linux/os_sysman_imp.h" #include "sysman/linux/os_sysman_imp.h"
@ -33,6 +35,7 @@ ze_result_t LinuxTemperatureImp::getProperties(zes_temp_properties_t *pPropertie
ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTemperature) { ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTemperature) {
auto isValidTemperature = [](auto temperature) { auto isValidTemperature = [](auto temperature) {
if ((temperature > invalidMaxTemperature) || (temperature < invalidMinTemperature)) { if ((temperature > invalidMaxTemperature) || (temperature < invalidMinTemperature)) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): temperature:%f is not in valid limits \n", __FUNCTION__, temperature);
return false; return false;
} }
return true; return true;
@ -60,6 +63,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTem
key = "COMPUTE_TEMPERATURES"; key = "COMPUTE_TEMPERATURES";
result = pPmt->readValue(key, computeTemperature); result = pPmt->readValue(key, computeTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
// Check max temperature among IA, GT and LLC sensors across COMPUTE_TEMPERATURES // Check max temperature among IA, GT and LLC sensors across COMPUTE_TEMPERATURES
@ -69,6 +73,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTem
key = "CORE_TEMPERATURES"; key = "CORE_TEMPERATURES";
result = pPmt->readValue(key, coreTemperature); result = pPmt->readValue(key, coreTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for CORE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
// Check max temperature among CORE0, CORE1, CORE2, CORE3 sensors across CORE_TEMPERATURES // Check max temperature among CORE0, CORE1, CORE2, CORE3 sensors across CORE_TEMPERATURES
@ -80,6 +85,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTem
key = "SOC_TEMPERATURES"; key = "SOC_TEMPERATURES";
result = pPmt->readValue(key, socTemperature); result = pPmt->readValue(key, socTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
// Check max temperature among possible sensors like PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC across SOC_TEMPERATURES // Check max temperature among possible sensors like PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC across SOC_TEMPERATURES
@ -100,6 +106,7 @@ ze_result_t LinuxTemperatureImp::getGlobalMaxTemperature(double *pTemperature) {
std::string key("TileMaxTemperature"); std::string key("TileMaxTemperature");
ze_result_t result = pPmt->readValue(key, globalMaxTemperature); ze_result_t result = pPmt->readValue(key, globalMaxTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for TileMaxTemperature is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
*pTemperature = static_cast<double>(globalMaxTemperature); *pTemperature = static_cast<double>(globalMaxTemperature);
@ -113,6 +120,7 @@ ze_result_t LinuxTemperatureImp::getGpuMaxTemperatureNoSubDevice(double *pTemper
std::string key = "SOC_TEMPERATURES"; std::string key = "SOC_TEMPERATURES";
auto result = pPmt->readValue(key, socTemperature); auto result = pPmt->readValue(key, socTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
gpuMaxTemperature = static_cast<double>(socTemperature & 0xff); gpuMaxTemperature = static_cast<double>(socTemperature & 0xff);
@ -123,6 +131,7 @@ ze_result_t LinuxTemperatureImp::getGpuMaxTemperatureNoSubDevice(double *pTemper
std::string key("COMPUTE_TEMPERATURES"); std::string key("COMPUTE_TEMPERATURES");
ze_result_t result = pPmt->readValue(key, computeTemperature); ze_result_t result = pPmt->readValue(key, computeTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
@ -142,6 +151,7 @@ ze_result_t LinuxTemperatureImp::getGpuMaxTemperature(double *pTemperature) {
std::string key("GTMaxTemperature"); std::string key("GTMaxTemperature");
ze_result_t result = pPmt->readValue(key, gpuMaxTemperature); ze_result_t result = pPmt->readValue(key, gpuMaxTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for GTMaxTemperature is returning error:0x%x \n", __FUNCTION__, result);
return result; return result;
} }
*pTemperature = static_cast<double>(gpuMaxTemperature); *pTemperature = static_cast<double>(gpuMaxTemperature);
@ -156,6 +166,7 @@ ze_result_t LinuxTemperatureImp::getMemoryMaxTemperature(double *pTemperature) {
} else if (productFamily == IGFX_PVC) { } else if (productFamily == IGFX_PVC) {
numHbmModules = 4u; numHbmModules = 4u;
} else { } else {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s() returning UNSUPPORTED_FEATURE \n", __FUNCTION__);
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
} }
@ -166,6 +177,7 @@ ze_result_t LinuxTemperatureImp::getMemoryMaxTemperature(double *pTemperature) {
std::string key = "HBM" + std::to_string(hbmModuleIndex) + "MaxDeviceTemperature"; std::string key = "HBM" + std::to_string(hbmModuleIndex) + "MaxDeviceTemperature";
result = pPmt->readValue(key, maxDeviceTemperature); result = pPmt->readValue(key, maxDeviceTemperature);
if (result != ZE_RESULT_SUCCESS) { if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for %s is returning error:0x%x \n", __FUNCTION__, key.c_str(), result);
return result; return result;
} }
maxDeviceTemperatureList.push_back(maxDeviceTemperature); maxDeviceTemperatureList.push_back(maxDeviceTemperature);