diff --git a/level_zero/api/sysman/zes_handles_struct.h b/level_zero/api/sysman/zes_handles_struct.h index dc2cda7d3f..74ebb1535f 100644 --- a/level_zero/api/sysman/zes_handles_struct.h +++ b/level_zero/api/sysman/zes_handles_struct.h @@ -47,3 +47,7 @@ struct _zes_ras_handle_t { struct _zes_standby_handle_t { virtual ~_zes_standby_handle_t() = default; }; + +struct _zes_temp_handle_t { + virtual ~_zes_temp_handle_t() = default; +}; \ No newline at end of file diff --git a/level_zero/api/sysman/zes_sysman_api_entrypoints.h b/level_zero/api/sysman/zes_sysman_api_entrypoints.h index be93bbcfe1..15407917ab 100644 --- a/level_zero/api/sysman/zes_sysman_api_entrypoints.h +++ b/level_zero/api/sysman/zes_sysman_api_entrypoints.h @@ -704,31 +704,51 @@ ze_result_t zesDeviceEnumTemperatureSensors( zes_device_handle_t hDevice, uint32_t *pCount, zes_temp_handle_t *phTemperature) { - return L0::SysmanDevice::temperatureGet(hDevice, pCount, phTemperature); + if (L0::sysmanInitFromCore) { + return L0::SysmanDevice::temperatureGet(hDevice, pCount, phTemperature); + } else { + return L0::Sysman::SysmanDevice::temperatureGet(hDevice, pCount, phTemperature); + } } ze_result_t zesTemperatureGetProperties( zes_temp_handle_t hTemperature, zes_temp_properties_t *pProperties) { - return L0::Temperature::fromHandle(hTemperature)->temperatureGetProperties(pProperties); + if (L0::sysmanInitFromCore) { + return L0::Temperature::fromHandle(hTemperature)->temperatureGetProperties(pProperties); + } else { + return L0::Sysman::Temperature::fromHandle(hTemperature)->temperatureGetProperties(pProperties); + } } ze_result_t zesTemperatureGetConfig( zes_temp_handle_t hTemperature, zes_temp_config_t *pConfig) { - return L0::Temperature::fromHandle(hTemperature)->temperatureGetConfig(pConfig); + if (L0::sysmanInitFromCore) { + return L0::Temperature::fromHandle(hTemperature)->temperatureGetConfig(pConfig); + } else { + return L0::Sysman::Temperature::fromHandle(hTemperature)->temperatureGetConfig(pConfig); + } } ze_result_t zesTemperatureSetConfig( zes_temp_handle_t hTemperature, const zes_temp_config_t *pConfig) { - return L0::Temperature::fromHandle(hTemperature)->temperatureSetConfig(pConfig); + if (L0::sysmanInitFromCore) { + return L0::Temperature::fromHandle(hTemperature)->temperatureSetConfig(pConfig); + } else { + return L0::Sysman::Temperature::fromHandle(hTemperature)->temperatureSetConfig(pConfig); + } } ze_result_t zesTemperatureGetState( zes_temp_handle_t hTemperature, double *pTemperature) { - return L0::Temperature::fromHandle(hTemperature)->temperatureGetState(pTemperature); + if (L0::sysmanInitFromCore) { + return L0::Temperature::fromHandle(hTemperature)->temperatureGetState(pTemperature); + } else { + return L0::Sysman::Temperature::fromHandle(hTemperature)->temperatureGetState(pTemperature); + } } ze_result_t zesDeviceEnumPsus( diff --git a/level_zero/sysman/source/sysman_device.cpp b/level_zero/sysman/source/sysman_device.cpp index fde0be7d01..a0220559f2 100644 --- a/level_zero/sysman/source/sysman_device.cpp +++ b/level_zero/sysman/source/sysman_device.cpp @@ -119,5 +119,10 @@ ze_result_t SysmanDevice::deviceSetEccState(zes_device_handle_t hDevice, const z return pSysmanDevice->deviceSetEccState(newState, pState); } +ze_result_t SysmanDevice::temperatureGet(zes_device_handle_t hDevice, uint32_t *pCount, zes_temp_handle_t *phTemperature) { + auto pSysmanDevice = L0::Sysman::SysmanDevice::fromHandle(hDevice); + return pSysmanDevice->temperatureGet(pCount, phTemperature); +} + } // namespace Sysman } // namespace L0 diff --git a/level_zero/sysman/source/sysman_device.h b/level_zero/sysman/source/sysman_device.h index 9622cd0d91..97dff74a58 100644 --- a/level_zero/sysman/source/sysman_device.h +++ b/level_zero/sysman/source/sysman_device.h @@ -21,6 +21,7 @@ #include "level_zero/sysman/source/ras/ras.h" #include "level_zero/sysman/source/scheduler/scheduler.h" #include "level_zero/sysman/source/standby/standby.h" +#include "level_zero/sysman/source/temperature/temperature.h" #include #include @@ -90,6 +91,9 @@ struct SysmanDevice : _ze_device_handle_t { static ze_result_t deviceSetEccState(zes_device_handle_t hDevice, const zes_device_ecc_desc_t *newState, zes_device_ecc_properties_t *pState); virtual ze_result_t deviceSetEccState(const zes_device_ecc_desc_t *newState, zes_device_ecc_properties_t *pState) = 0; + + static ze_result_t temperatureGet(zes_device_handle_t hDevice, uint32_t *pCount, zes_temp_handle_t *phTemperature); + virtual ze_result_t temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature) = 0; }; } // namespace Sysman diff --git a/level_zero/sysman/source/sysman_device_imp.cpp b/level_zero/sysman/source/sysman_device_imp.cpp index 74d05964c7..5a94430076 100644 --- a/level_zero/sysman/source/sysman_device_imp.cpp +++ b/level_zero/sysman/source/sysman_device_imp.cpp @@ -35,6 +35,7 @@ SysmanDeviceImp::SysmanDeviceImp(NEO::ExecutionEnvironment *executionEnvironment pGlobalOperations = new GlobalOperationsImp(pOsSysman); pStandbyHandleContext = new StandbyHandleContext(pOsSysman); pEcc = new EccImp(pOsSysman); + pTempHandleContext = new TemperatureHandleContext(pOsSysman); } SysmanDeviceImp::~SysmanDeviceImp() { @@ -50,6 +51,7 @@ SysmanDeviceImp::~SysmanDeviceImp() { freeResource(pFabricPortHandleContext); freeResource(pStandbyHandleContext); freeResource(pEcc); + freeResource(pTempHandleContext); freeResource(pOsSysman); executionEnvironment->decRefInternal(); } @@ -117,15 +119,19 @@ ze_result_t SysmanDeviceImp::firmwareGet(uint32_t *pCount, zes_firmware_handle_t ze_result_t SysmanDeviceImp::diagnosticsGet(uint32_t *pCount, zes_diag_handle_t *phDiagnostics) { return pDiagnosticsHandleContext->diagnosticsGet(pCount, phDiagnostics); } + ze_result_t SysmanDeviceImp::deviceEccAvailable(ze_bool_t *pAvailable) { return pEcc->deviceEccAvailable(pAvailable); } + ze_result_t SysmanDeviceImp::deviceEccConfigurable(ze_bool_t *pConfigurable) { return pEcc->deviceEccConfigurable(pConfigurable); } + ze_result_t SysmanDeviceImp::deviceGetEccState(zes_device_ecc_properties_t *pState) { return pEcc->getEccState(pState); } + ze_result_t SysmanDeviceImp::deviceSetEccState(const zes_device_ecc_desc_t *newState, zes_device_ecc_properties_t *pState) { return pEcc->setEccState(newState, pState); } @@ -134,5 +140,9 @@ ze_result_t SysmanDeviceImp::standbyGet(uint32_t *pCount, zes_standby_handle_t * return pStandbyHandleContext->standbyGet(pCount, phStandby); } +ze_result_t SysmanDeviceImp::temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature) { + return pTempHandleContext->temperatureGet(pCount, phTemperature); +} + } // namespace Sysman } // namespace L0 diff --git a/level_zero/sysman/source/sysman_device_imp.h b/level_zero/sysman/source/sysman_device_imp.h index c6c943f6e5..d12a4ce0ed 100644 --- a/level_zero/sysman/source/sysman_device_imp.h +++ b/level_zero/sysman/source/sysman_device_imp.h @@ -49,6 +49,7 @@ struct SysmanDeviceImp : SysmanDevice, NEO::NonCopyableOrMovableClass { FrequencyHandleContext *pFrequencyHandleContext = nullptr; StandbyHandleContext *pStandbyHandleContext = nullptr; Ecc *pEcc = nullptr; + TemperatureHandleContext *pTempHandleContext = nullptr; ze_result_t powerGet(uint32_t *pCount, zes_pwr_handle_t *phPower) override; ze_result_t powerGetCardDomain(zes_pwr_handle_t *phPower) override; @@ -69,6 +70,7 @@ struct SysmanDeviceImp : SysmanDevice, NEO::NonCopyableOrMovableClass { ze_result_t deviceEccConfigurable(ze_bool_t *pConfigurable) override; ze_result_t deviceGetEccState(zes_device_ecc_properties_t *pState) override; ze_result_t deviceSetEccState(const zes_device_ecc_desc_t *newState, zes_device_ecc_properties_t *pState) override; + ze_result_t temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature) override; private: NEO::ExecutionEnvironment *executionEnvironment = nullptr; diff --git a/level_zero/sysman/source/temperature/CMakeLists.txt b/level_zero/sysman/source/temperature/CMakeLists.txt new file mode 100644 index 0000000000..75f83c35e9 --- /dev/null +++ b/level_zero/sysman/source/temperature/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright (C) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: MIT +# + +target_sources(${L0_STATIC_LIB_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/os_temperature.h + ${CMAKE_CURRENT_SOURCE_DIR}/temperature.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/temperature.h + ${CMAKE_CURRENT_SOURCE_DIR}/temperature_imp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/temperature_imp.h +) + +add_subdirectories() diff --git a/level_zero/sysman/source/temperature/linux/CMakeLists.txt b/level_zero/sysman/source/temperature/linux/CMakeLists.txt new file mode 100644 index 0000000000..9eb8667664 --- /dev/null +++ b/level_zero/sysman/source/temperature/linux/CMakeLists.txt @@ -0,0 +1,14 @@ +# +# Copyright (C) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: MIT +# + +if(UNIX) + target_sources(${L0_STATIC_LIB_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/os_temperature_imp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_temperature_imp.h + ) +endif() diff --git a/level_zero/sysman/source/temperature/linux/os_temperature_imp.cpp b/level_zero/sysman/source/temperature/linux/os_temperature_imp.cpp new file mode 100644 index 0000000000..93f3fe4f11 --- /dev/null +++ b/level_zero/sysman/source/temperature/linux/os_temperature_imp.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2022-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/sysman/source/temperature/linux/os_temperature_imp.h" + +#include "shared/source/debug_settings/debug_settings_manager.h" + +#include "level_zero/sysman/source/linux/os_sysman_imp.h" + +namespace L0 { +namespace Sysman { + +constexpr uint32_t numSocTemperatureEntries = 7; // entries would be PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC +constexpr uint32_t numCoreTemperatureEntries = 4; // entries would be CORE0, CORE1, CORE2, CORE3 +constexpr uint32_t numComputeTemperatureEntries = 3; // entries would be IA, GT and LLC +constexpr uint32_t invalidMaxTemperature = 125; +constexpr uint32_t invalidMinTemperature = 10; + +ze_result_t LinuxTemperatureImp::getProperties(zes_temp_properties_t *pProperties) { + pProperties->type = type; + pProperties->onSubdevice = 0; + pProperties->subdeviceId = 0; + if (isSubdevice) { + pProperties->onSubdevice = isSubdevice; + pProperties->subdeviceId = subdeviceId; + } + return ZE_RESULT_SUCCESS; +} + +ze_result_t LinuxTemperatureImp::getGlobalMaxTemperatureNoSubDevice(double *pTemperature) { + auto isValidTemperature = [](auto temperature) { + if ((temperature > invalidMaxTemperature) || (temperature < invalidMinTemperature)) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): temperature:%f is not in valid limits \n", __FUNCTION__, temperature); + return false; + } + return true; + }; + + auto getMaxTemperature = [&](auto temperature, auto numTemperatureEntries) { + uint32_t maxTemperature = 0; + for (uint32_t count = 0; count < numTemperatureEntries; count++) { + uint32_t localTemperatureVal = (temperature >> (8 * count)) & 0xff; + if (isValidTemperature(localTemperatureVal)) { + if (localTemperatureVal > maxTemperature) { + maxTemperature = localTemperatureVal; + } + } + } + return maxTemperature; + }; + + ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + uint32_t maxComputeTemperature = 0; + uint32_t maxCoreTemperature = 0; + std::string key; + if (productFamily == IGFX_DG1) { + uint32_t computeTemperature = 0; + key = "COMPUTE_TEMPERATURES"; + result = pPmt->readValue(key, computeTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + // Check max temperature among IA, GT and LLC sensors across COMPUTE_TEMPERATURES + maxComputeTemperature = getMaxTemperature(computeTemperature, numComputeTemperatureEntries); + + uint32_t coreTemperature = 0; + key = "CORE_TEMPERATURES"; + result = pPmt->readValue(key, coreTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for CORE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + // Check max temperature among CORE0, CORE1, CORE2, CORE3 sensors across CORE_TEMPERATURES + maxCoreTemperature = getMaxTemperature(coreTemperature, numCoreTemperatureEntries); + } + + // SOC_TEMPERATURES is present in all product families + uint64_t socTemperature = 0; + key = "SOC_TEMPERATURES"; + result = pPmt->readValue(key, socTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + // Check max temperature among possible sensors like PCH or GT_TEMP, DRAM, SA, PSF, DE, PCIE, TYPEC across SOC_TEMPERATURES + uint32_t maxSocTemperature = getMaxTemperature(socTemperature, numSocTemperatureEntries); + + *pTemperature = static_cast(std::max({maxComputeTemperature, maxCoreTemperature, maxSocTemperature})); + + return result; +} + +ze_result_t LinuxTemperatureImp::getGlobalMaxTemperature(double *pTemperature) { + // For XE_HP_SDV and PVC single tile devices, telemetry info is retrieved from + // tile's telem node rather from root device telem node. + if ((!isSubdevice) && (!((productFamily == IGFX_PVC) || (productFamily == IGFX_XE_HP_SDV)))) { + return getGlobalMaxTemperatureNoSubDevice(pTemperature); + } + uint32_t globalMaxTemperature = 0; + std::string key("TileMaxTemperature"); + ze_result_t result = pPmt->readValue(key, globalMaxTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for TileMaxTemperature is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + *pTemperature = static_cast(globalMaxTemperature); + return result; +} + +ze_result_t LinuxTemperatureImp::getGpuMaxTemperatureNoSubDevice(double *pTemperature) { + double gpuMaxTemperature = 0; + uint64_t socTemperature = 0; + // Gpu temperature is obtained from GT_TEMP in SOC_TEMPERATURE's bit 0 to 7. + std::string key = "SOC_TEMPERATURES"; + auto result = pPmt->readValue(key, socTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for SOC_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + gpuMaxTemperature = static_cast(socTemperature & 0xff); + + if (productFamily == IGFX_DG1) { + // In DG1 platform, Gpu Max Temperature is obtained from COMPUTE_TEMPERATURE only + uint32_t computeTemperature = 0; + std::string key("COMPUTE_TEMPERATURES"); + ze_result_t result = pPmt->readValue(key, computeTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for COMPUTE_TEMPERATURES is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + + // GT temperature could be read via 8th to 15th bit in the value read in temperature + computeTemperature = (computeTemperature >> 8) & 0xff; + gpuMaxTemperature = static_cast(computeTemperature); + } + *pTemperature = gpuMaxTemperature; + return ZE_RESULT_SUCCESS; +} + +ze_result_t LinuxTemperatureImp::getGpuMaxTemperature(double *pTemperature) { + if ((!isSubdevice) && (!((productFamily == IGFX_PVC) || (productFamily == IGFX_XE_HP_SDV)))) { + return getGpuMaxTemperatureNoSubDevice(pTemperature); + } + uint32_t gpuMaxTemperature = 0; + std::string key("GTMaxTemperature"); + ze_result_t result = pPmt->readValue(key, gpuMaxTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for GTMaxTemperature is returning error:0x%x \n", __FUNCTION__, result); + return result; + } + *pTemperature = static_cast(gpuMaxTemperature); + return result; +} + +ze_result_t LinuxTemperatureImp::getMemoryMaxTemperature(double *pTemperature) { + ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + uint32_t numHbmModules = 0u; + if (productFamily == IGFX_XE_HP_SDV) { + numHbmModules = 2u; + } else if (productFamily == IGFX_PVC) { + numHbmModules = 4u; + } else { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s() returning UNSUPPORTED_FEATURE \n", __FUNCTION__); + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + std::vector maxDeviceTemperatureList; + for (auto hbmModuleIndex = 0u; hbmModuleIndex < numHbmModules; hbmModuleIndex++) { + uint32_t maxDeviceTemperature = 0; + // To read HBM 0's max device temperature key would be HBM0MaxDeviceTemperature + std::string key = "HBM" + std::to_string(hbmModuleIndex) + "MaxDeviceTemperature"; + result = pPmt->readValue(key, maxDeviceTemperature); + if (result != ZE_RESULT_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Pmt->readvalue() for %s is returning error:0x%x \n", __FUNCTION__, key.c_str(), result); + return result; + } + maxDeviceTemperatureList.push_back(maxDeviceTemperature); + } + + *pTemperature = static_cast(*std::max_element(maxDeviceTemperatureList.begin(), maxDeviceTemperatureList.end())); + return result; +} + +ze_result_t LinuxTemperatureImp::getSensorTemperature(double *pTemperature) { + ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + switch (type) { + case ZES_TEMP_SENSORS_GLOBAL: + result = getGlobalMaxTemperature(pTemperature); + if (result != ZE_RESULT_SUCCESS) { + return result; + } + break; + case ZES_TEMP_SENSORS_GPU: + result = getGpuMaxTemperature(pTemperature); + if (result != ZE_RESULT_SUCCESS) { + return result; + } + break; + case ZES_TEMP_SENSORS_MEMORY: + result = getMemoryMaxTemperature(pTemperature); + if (result != ZE_RESULT_SUCCESS) { + return result; + } + break; + default: + *pTemperature = 0; + result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + break; + } + + return result; +} + +bool LinuxTemperatureImp::isTempModuleSupported() { + if ((!isSubdevice) && (!((productFamily == IGFX_PVC) || (productFamily == IGFX_XE_HP_SDV)))) { + if (type == ZES_TEMP_SENSORS_MEMORY) { + return false; + } + } + + return (pPmt != nullptr); +} + +void LinuxTemperatureImp::setSensorType(zes_temp_sensors_t sensorType) { + type = sensorType; +} + +LinuxTemperatureImp::LinuxTemperatureImp(OsSysman *pOsSysman, ze_bool_t onSubdevice, + uint32_t subdeviceId) : subdeviceId(subdeviceId), isSubdevice(onSubdevice) { + LinuxSysmanImp *pLinuxSysmanImp = static_cast(pOsSysman); + pPmt = pLinuxSysmanImp->getPlatformMonitoringTechAccess(subdeviceId); + productFamily = pLinuxSysmanImp->getProductFamily(); +} + +std::unique_ptr OsTemperature::create(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId, zes_temp_sensors_t sensorType) { + std::unique_ptr pLinuxTemperatureImp = std::make_unique(pOsSysman, onSubdevice, subdeviceId); + pLinuxTemperatureImp->setSensorType(sensorType); + return pLinuxTemperatureImp; +} + +} // namespace Sysman +} // namespace L0 diff --git a/level_zero/sysman/source/temperature/linux/os_temperature_imp.h b/level_zero/sysman/source/temperature/linux/os_temperature_imp.h new file mode 100644 index 0000000000..69f055dbab --- /dev/null +++ b/level_zero/sysman/source/temperature/linux/os_temperature_imp.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2022-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "shared/source/helpers/non_copyable_or_moveable.h" + +#include "level_zero/sysman/source/linux/fs_access.h" +#include "level_zero/sysman/source/linux/pmt/pmt.h" +#include "level_zero/sysman/source/temperature/os_temperature.h" + +#include "igfxfmid.h" + +#include + +namespace L0 { +namespace Sysman { + +class SysfsAccess; +class PlatformMonitoringTech; +class LinuxTemperatureImp : public OsTemperature, NEO::NonCopyableOrMovableClass { + public: + ze_result_t getProperties(zes_temp_properties_t *pProperties) override; + ze_result_t getSensorTemperature(double *pTemperature) override; + bool isTempModuleSupported() override; + void setSensorType(zes_temp_sensors_t sensorType); + LinuxTemperatureImp(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId); + LinuxTemperatureImp() = default; + ~LinuxTemperatureImp() override = default; + + protected: + PlatformMonitoringTech *pPmt = nullptr; + zes_temp_sensors_t type = ZES_TEMP_SENSORS_GLOBAL; + + private: + ze_result_t getGlobalMaxTemperature(double *pTemperature); + ze_result_t getGlobalMinTemperature(double *pTemperature); + ze_result_t getGpuMaxTemperature(double *pTemperature); + ze_result_t getGpuMinTemperature(double *pTemperature); + ze_result_t getMemoryMaxTemperature(double *pTemperature); + ze_result_t getGlobalMaxTemperatureNoSubDevice(double *pTemperature); + ze_result_t getGpuMaxTemperatureNoSubDevice(double *pTemperature); + uint32_t subdeviceId = 0; + ze_bool_t isSubdevice = 0; + PRODUCT_FAMILY productFamily = IGFX_UNKNOWN; +}; + +} // namespace Sysman +} // namespace L0 diff --git a/level_zero/sysman/source/temperature/os_temperature.h b/level_zero/sysman/source/temperature/os_temperature.h new file mode 100644 index 0000000000..8ed1fc6dc0 --- /dev/null +++ b/level_zero/sysman/source/temperature/os_temperature.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include + +#include + +namespace L0 { +namespace Sysman { +struct OsSysman; +class OsTemperature { + public: + virtual ze_result_t getProperties(zes_temp_properties_t *pProperties) = 0; + virtual ze_result_t getSensorTemperature(double *pTemperature) = 0; + virtual bool isTempModuleSupported() = 0; + static std::unique_ptr create(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId, zes_temp_sensors_t sensorType); + virtual ~OsTemperature() = default; +}; + +} // namespace Sysman +} // namespace L0 \ No newline at end of file diff --git a/level_zero/sysman/source/temperature/temperature.cpp b/level_zero/sysman/source/temperature/temperature.cpp new file mode 100644 index 0000000000..a749291196 --- /dev/null +++ b/level_zero/sysman/source/temperature/temperature.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/basic_math.h" + +#include "level_zero/sysman/source/os_sysman.h" +#include "level_zero/sysman/source/temperature/temperature_imp.h" + +namespace L0 { +namespace Sysman { + +TemperatureHandleContext::~TemperatureHandleContext() {} + +void TemperatureHandleContext::createHandle(bool onSubdevice, uint32_t subDeviceId, zes_temp_sensors_t type) { + std::unique_ptr pTemperature = std::make_unique(pOsSysman, onSubdevice, subDeviceId, type); + if (pTemperature->initSuccess == true) { + handleList.push_back(std::move(pTemperature)); + } +} + +ze_result_t TemperatureHandleContext::init(uint32_t subDeviceCount) { + + if (subDeviceCount > 0) { + for (uint32_t subDeviceId = 0; subDeviceId < subDeviceCount; subDeviceId++) { + createHandle(true, subDeviceId, ZES_TEMP_SENSORS_GLOBAL); + createHandle(true, subDeviceId, ZES_TEMP_SENSORS_GPU); + createHandle(true, subDeviceId, ZES_TEMP_SENSORS_MEMORY); + } + } else { + createHandle(false, 0, ZES_TEMP_SENSORS_GLOBAL); + createHandle(false, 0, ZES_TEMP_SENSORS_GPU); + createHandle(false, 0, ZES_TEMP_SENSORS_MEMORY); + } + + return ZE_RESULT_SUCCESS; +} + +ze_result_t TemperatureHandleContext::temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature) { + std::call_once(initTemperatureOnce, [this]() { + this->init(pOsSysman->getSubDeviceCount()); + }); + uint32_t handleListSize = static_cast(handleList.size()); + uint32_t numToCopy = std::min(*pCount, handleListSize); + if (0 == *pCount || *pCount > handleListSize) { + *pCount = handleListSize; + } + if (nullptr != phTemperature) { + for (uint32_t i = 0; i < numToCopy; i++) { + phTemperature[i] = handleList[i]->toHandle(); + } + } + return ZE_RESULT_SUCCESS; +} + +} // namespace Sysman +} // namespace L0 diff --git a/level_zero/sysman/source/temperature/temperature.h b/level_zero/sysman/source/temperature/temperature.h new file mode 100644 index 0000000000..1db9941937 --- /dev/null +++ b/level_zero/sysman/source/temperature/temperature.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "level_zero/api/sysman/zes_handles_struct.h" +#include + +#include +#include +#include + +namespace L0 { +namespace Sysman { + +struct OsSysman; +class Temperature : _zes_temp_handle_t { + public: + virtual ze_result_t temperatureGetProperties(zes_temp_properties_t *pProperties) = 0; + virtual ze_result_t temperatureGetConfig(zes_temp_config_t *pConfig) = 0; + virtual ze_result_t temperatureSetConfig(const zes_temp_config_t *pConfig) = 0; + virtual ze_result_t temperatureGetState(double *pTemperature) = 0; + + static Temperature *fromHandle(zes_temp_handle_t handle) { + return static_cast(handle); + } + inline zes_temp_handle_t toHandle() { return this; } + bool initSuccess = false; + zes_temp_properties_t tempProperties = {}; +}; + +struct TemperatureHandleContext { + TemperatureHandleContext(OsSysman *pOsSysman) : pOsSysman(pOsSysman){}; + ~TemperatureHandleContext(); + + ze_result_t init(uint32_t subDeviceCount); + + ze_result_t temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature); + + OsSysman *pOsSysman = nullptr; + std::vector> handleList = {}; + + private: + void createHandle(bool onSubdevice, uint32_t subDeviceId, zes_temp_sensors_t type); + std::once_flag initTemperatureOnce; +}; + +} // namespace Sysman +} // namespace L0 diff --git a/level_zero/sysman/source/temperature/temperature_imp.cpp b/level_zero/sysman/source/temperature/temperature_imp.cpp new file mode 100644 index 0000000000..763bc5d288 --- /dev/null +++ b/level_zero/sysman/source/temperature/temperature_imp.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/sysman/source/temperature/temperature_imp.h" + +#include "level_zero/sysman/source/sysman_device_imp.h" + +namespace L0 { +namespace Sysman { + +ze_result_t TemperatureImp::temperatureGetProperties(zes_temp_properties_t *pProperties) { + *pProperties = tempProperties; + return ZE_RESULT_SUCCESS; +} + +ze_result_t TemperatureImp::temperatureGetConfig(zes_temp_config_t *pConfig) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ze_result_t TemperatureImp::temperatureSetConfig(const zes_temp_config_t *pConfig) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ze_result_t TemperatureImp::temperatureGetState(double *pTemperature) { + return pOsTemperature->getSensorTemperature(pTemperature); +} + +void TemperatureImp::init() { + if (pOsTemperature->isTempModuleSupported()) { + pOsTemperature->getProperties(&tempProperties); + this->initSuccess = true; + } +} + +TemperatureImp::TemperatureImp(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId, zes_temp_sensors_t type) { + pOsTemperature = OsTemperature::create(pOsSysman, onSubdevice, subdeviceId, type); + init(); +} + +TemperatureImp::~TemperatureImp() { +} + +} // namespace Sysman +} // namespace L0 \ No newline at end of file diff --git a/level_zero/sysman/source/temperature/temperature_imp.h b/level_zero/sysman/source/temperature/temperature_imp.h new file mode 100644 index 0000000000..211c434a29 --- /dev/null +++ b/level_zero/sysman/source/temperature/temperature_imp.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/helpers/non_copyable_or_moveable.h" + +#include "level_zero/sysman/source/temperature/os_temperature.h" +#include "level_zero/sysman/source/temperature/temperature.h" + +namespace L0 { +namespace Sysman { +class TemperatureImp : public Temperature, NEO::NonCopyableOrMovableClass { + public: + ze_result_t temperatureGetProperties(zes_temp_properties_t *pProperties) override; + ze_result_t temperatureGetConfig(zes_temp_config_t *pConfig) override; + ze_result_t temperatureSetConfig(const zes_temp_config_t *pConfig) override; + ze_result_t temperatureGetState(double *pTemperature) override; + + TemperatureImp() = default; + TemperatureImp(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId, zes_temp_sensors_t type); + ~TemperatureImp() override; + + std::unique_ptr pOsTemperature = nullptr; + void init(); +}; +} // namespace Sysman +} // namespace L0 diff --git a/level_zero/sysman/source/temperature/windows/CMakeLists.txt b/level_zero/sysman/source/temperature/windows/CMakeLists.txt new file mode 100644 index 0000000000..98194ab115 --- /dev/null +++ b/level_zero/sysman/source/temperature/windows/CMakeLists.txt @@ -0,0 +1,14 @@ +# +# Copyright (C) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: MIT +# + +if(WIN32) + target_sources(${L0_STATIC_LIB_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/os_temperature_imp.h + ${CMAKE_CURRENT_SOURCE_DIR}/os_temperature_imp.cpp + ) +endif() diff --git a/level_zero/sysman/source/temperature/windows/os_temperature_imp.cpp b/level_zero/sysman/source/temperature/windows/os_temperature_imp.cpp new file mode 100644 index 0000000000..2eae4de0f5 --- /dev/null +++ b/level_zero/sysman/source/temperature/windows/os_temperature_imp.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/sysman/source/temperature/windows/os_temperature_imp.h" + +namespace L0 { +namespace Sysman { + +ze_result_t WddmTemperatureImp::getProperties(zes_temp_properties_t *pProperties) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ze_result_t WddmTemperatureImp::getSensorTemperature(double *pTemperature) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +bool WddmTemperatureImp::isTempModuleSupported() { + return false; +} + +WddmTemperatureImp::WddmTemperatureImp(OsSysman *pOsSysman) {} + +std::unique_ptr OsTemperature::create(OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId, zes_temp_sensors_t sensorType) { + std::unique_ptr pWddmTemperatureImp = std::make_unique(pOsSysman); + return std::move(pWddmTemperatureImp); +} + +} // namespace Sysman +} // namespace L0 diff --git a/level_zero/sysman/source/temperature/windows/os_temperature_imp.h b/level_zero/sysman/source/temperature/windows/os_temperature_imp.h new file mode 100644 index 0000000000..bc1e33697b --- /dev/null +++ b/level_zero/sysman/source/temperature/windows/os_temperature_imp.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "shared/source/helpers/non_copyable_or_moveable.h" + +#include "level_zero/sysman/source/temperature/os_temperature.h" +#include "level_zero/sysman/source/windows/os_sysman_imp.h" + +namespace L0 { +namespace Sysman { +class KmdSysManager; +class WddmTemperatureImp : public OsTemperature, NEO::NonCopyableOrMovableClass { + public: + ze_result_t getProperties(zes_temp_properties_t *pProperties) override; + ze_result_t getSensorTemperature(double *pTemperature) override; + bool isTempModuleSupported() override; + + WddmTemperatureImp(OsSysman *pOsSysman); + WddmTemperatureImp() = default; + ~WddmTemperatureImp() override = default; +}; + +} // namespace Sysman +} // namespace L0 \ No newline at end of file diff --git a/level_zero/sysman/test/unit_tests/sources/temperature/CMakeLists.txt b/level_zero/sysman/test/unit_tests/sources/temperature/CMakeLists.txt new file mode 100644 index 0000000000..cb5b30473c --- /dev/null +++ b/level_zero/sysman/test/unit_tests/sources/temperature/CMakeLists.txt @@ -0,0 +1,10 @@ +# +# Copyright (C) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: MIT +# + +target_sources(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt +) +add_subdirectories() diff --git a/level_zero/sysman/test/unit_tests/sources/temperature/linux/CMakeLists.txt b/level_zero/sysman/test/unit_tests/sources/temperature/linux/CMakeLists.txt new file mode 100644 index 0000000000..5a6283f427 --- /dev/null +++ b/level_zero/sysman/test/unit_tests/sources/temperature/linux/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# Copyright (C) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: MIT +# + +set(L0_TESTS_SYSMAN_TEMPERATURE_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_temperature.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mock_sysfs_temperature.h +) + +if(UNIX) + target_sources(${TARGET_NAME} + PRIVATE + ${L0_TESTS_SYSMAN_TEMPERATURE_LINUX} + ) +endif() diff --git a/level_zero/sysman/test/unit_tests/sources/temperature/linux/mock_sysfs_temperature.h b/level_zero/sysman/test/unit_tests/sources/temperature/linux/mock_sysfs_temperature.h new file mode 100644 index 0000000000..3f9322cfb2 --- /dev/null +++ b/level_zero/sysman/test/unit_tests/sources/temperature/linux/mock_sysfs_temperature.h @@ -0,0 +1,174 @@ +/* + * Copyright (C) 2022-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "level_zero/sysman/source/linux/fs_access.h" +#include "level_zero/sysman/source/linux/pmt/pmt.h" +#include "level_zero/sysman/source/temperature/linux/os_temperature_imp.h" +#include "level_zero/sysman/source/temperature/temperature_imp.h" + +namespace L0 { +namespace ult { + +constexpr uint8_t memory0MaxTemperature = 0x12; +constexpr uint8_t memory1MaxTemperature = 0x45; +constexpr uint8_t memory2MaxTemperature = 0x32; +constexpr uint8_t memory3MaxTemperature = 0x36; +constexpr uint32_t gtMaxTemperature = 0x1d; +constexpr uint32_t tileMaxTemperature = 0x34; + +constexpr uint8_t computeTempIndex = 8; +constexpr uint8_t coreTempIndex = 12; +constexpr uint8_t socTempIndex = 0; +constexpr uint8_t tempArrForNoSubDevices[19] = {0x12, 0x23, 0x43, 0xde, 0xa3, 0xce, 0x23, 0x11, 0x45, 0x32, 0x67, 0x47, 0xac, 0x21, 0x03, 0x90, 0, 0, 0}; +constexpr uint8_t computeIndexForNoSubDevices = 9; +constexpr uint8_t gtTempIndexForNoSubDevices = 0; +const std::string baseTelemSysFS("/sys/class/intel_pmt"); +std::string gpuUpstreamPortPathInTemperature = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0"; +const std::string realPathTelem1 = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:02.0/0000:8e:00.1/pmt_telemetry.1.auto/intel_pmt/telem1"; +const std::string realPathTelem2 = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:02.0/0000:8e:00.1/pmt_telemetry.1.auto/intel_pmt/telem2"; +const std::string realPathTelem3 = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:02.0/0000:8e:00.1/pmt_telemetry.1.auto/intel_pmt/telem3"; +const std::string realPathTelem4 = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:02.0/0000:8e:00.1/pmt_telemetry.1.auto/intel_pmt/telem4"; +const std::string realPathTelem5 = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:02.0/0000:8e:00.1/pmt_telemetry.1.auto/intel_pmt/telem5"; +const std::string sysfsPahTelem1 = "/sys/class/intel_pmt/telem1"; +const std::string sysfsPahTelem2 = "/sys/class/intel_pmt/telem2"; +const std::string sysfsPahTelem3 = "/sys/class/intel_pmt/telem3"; +const std::string sysfsPahTelem4 = "/sys/class/intel_pmt/telem4"; +const std::string sysfsPahTelem5 = "/sys/class/intel_pmt/telem5"; + +struct MockTemperaturePmt : public L0::Sysman::PlatformMonitoringTech { + MockTemperaturePmt(L0::Sysman::FsAccess *pFsAccess, ze_bool_t onSubdevice, uint32_t subdeviceId) : L0::Sysman::PlatformMonitoringTech(pFsAccess, onSubdevice, subdeviceId) {} + using L0::Sysman::PlatformMonitoringTech::closeFunction; + using L0::Sysman::PlatformMonitoringTech::keyOffsetMap; + using L0::Sysman::PlatformMonitoringTech::openFunction; + using L0::Sysman::PlatformMonitoringTech::preadFunction; + using L0::Sysman::PlatformMonitoringTech::telemetryDeviceEntry; + + ze_result_t mockReadValueResult = ZE_RESULT_SUCCESS; + ze_result_t mockReadCoreTempResult = ZE_RESULT_SUCCESS; + ze_result_t mockReadComputeTempResult = ZE_RESULT_SUCCESS; + + ~MockTemperaturePmt() override { + rootDeviceTelemNodeIndex = 0; + } + + void mockedInit(L0::Sysman::FsAccess *pFsAccess) { + if (ZE_RESULT_SUCCESS != PlatformMonitoringTech::enumerateRootTelemIndex(pFsAccess, gpuUpstreamPortPathInTemperature)) { + return; + } + telemetryDeviceEntry = "/sys/class/intel_pmt/telem2/telem"; + } + + ze_result_t readValue(const std::string key, uint32_t &val) override { + + if (mockReadValueResult != ZE_RESULT_SUCCESS) { + return mockReadValueResult; + } + + ze_result_t result = ZE_RESULT_SUCCESS; + if (key.compare("HBM0MaxDeviceTemperature") == 0) { + val = memory0MaxTemperature; + } else if (key.compare("HBM1MaxDeviceTemperature") == 0) { + val = memory1MaxTemperature; + } else if (key.compare("HBM2MaxDeviceTemperature") == 0) { + val = memory2MaxTemperature; + } else if (key.compare("HBM3MaxDeviceTemperature") == 0) { + val = memory3MaxTemperature; + } else if (key.compare("GTMaxTemperature") == 0) { + val = gtMaxTemperature; + } else if (key.compare("TileMaxTemperature") == 0) { + val = tileMaxTemperature; + } else if (key.compare("COMPUTE_TEMPERATURES") == 0) { + if (mockReadComputeTempResult != ZE_RESULT_SUCCESS) { + return mockReadComputeTempResult; + } + val = 0; + for (uint8_t i = 0; i < sizeof(uint32_t); i++) { + val |= (uint32_t)tempArrForNoSubDevices[(computeTempIndex) + i] << (i * 8); + } + } else if (key.compare("CORE_TEMPERATURES") == 0) { + if (mockReadCoreTempResult != ZE_RESULT_SUCCESS) { + return mockReadCoreTempResult; + } + val = 0; + for (uint8_t i = 0; i < sizeof(uint32_t); i++) { + val |= (uint32_t)tempArrForNoSubDevices[(coreTempIndex) + i] << (i * 8); + } + } else { + result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + return result; + } + + ze_result_t readValue(const std::string key, uint64_t &val) override { + + if (mockReadValueResult != ZE_RESULT_SUCCESS) { + return mockReadValueResult; + } + + if (key.compare("SOC_TEMPERATURES") == 0) { + val = 0; + for (uint8_t i = 0; i < sizeof(uint64_t); i++) { + val |= (uint64_t)tempArrForNoSubDevices[(socTempIndex) + i] << (i * 8); + } + return ZE_RESULT_SUCCESS; + } else { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + } +}; + +struct MockTemperatureFsAccess : public L0::Sysman::FsAccess { + ze_result_t mockErrorListDirectory = ZE_RESULT_SUCCESS; + ze_result_t mockErrorGetRealPath = ZE_RESULT_SUCCESS; + ze_result_t listDirectory(const std::string directory, std::vector &listOfTelemNodes) override { + if (mockErrorListDirectory != ZE_RESULT_SUCCESS) { + return mockErrorListDirectory; + } + if (directory.compare(baseTelemSysFS) == 0) { + listOfTelemNodes.push_back("telem1"); + listOfTelemNodes.push_back("telem2"); + listOfTelemNodes.push_back("telem3"); + listOfTelemNodes.push_back("telem4"); + listOfTelemNodes.push_back("telem5"); + return ZE_RESULT_SUCCESS; + } + return ZE_RESULT_ERROR_NOT_AVAILABLE; + } + + ze_result_t getRealPath(const std::string path, std::string &buf) override { + if (mockErrorGetRealPath != ZE_RESULT_SUCCESS) { + return mockErrorGetRealPath; + } + if (path.compare(sysfsPahTelem1) == 0) { + buf = realPathTelem1; + } else if (path.compare(sysfsPahTelem2) == 0) { + buf = realPathTelem2; + } else if (path.compare(sysfsPahTelem3) == 0) { + buf = realPathTelem3; + } else if (path.compare(sysfsPahTelem4) == 0) { + buf = realPathTelem4; + } else if (path.compare(sysfsPahTelem5) == 0) { + buf = realPathTelem5; + } else { + return ZE_RESULT_ERROR_NOT_AVAILABLE; + } + return ZE_RESULT_SUCCESS; + } + + MockTemperatureFsAccess() = default; +}; + +class PublicLinuxTemperatureImp : public L0::Sysman::LinuxTemperatureImp { + public: + PublicLinuxTemperatureImp(L0::Sysman::OsSysman *pOsSysman, ze_bool_t onSubdevice, uint32_t subdeviceId) : LinuxTemperatureImp(pOsSysman, onSubdevice, subdeviceId) {} + using L0::Sysman::LinuxTemperatureImp::pPmt; + using L0::Sysman::LinuxTemperatureImp::type; +}; +} // namespace ult +} // namespace L0 \ No newline at end of file diff --git a/level_zero/sysman/test/unit_tests/sources/temperature/linux/test_zes_temperature.cpp b/level_zero/sysman/test/unit_tests/sources/temperature/linux/test_zes_temperature.cpp new file mode 100644 index 0000000000..ea15214f96 --- /dev/null +++ b/level_zero/sysman/test/unit_tests/sources/temperature/linux/test_zes_temperature.cpp @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2020-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/sysman/source/linux/pmt/pmt_xml_offsets.h" +#include "level_zero/sysman/test/unit_tests/sources/linux/mock_sysman_fixture.h" +#include "level_zero/sysman/test/unit_tests/sources/temperature/linux/mock_sysfs_temperature.h" + +namespace L0 { +namespace ult { + +constexpr uint32_t handleComponentCountForTwoTileDevices = 6u; +constexpr uint32_t handleComponentCountForSingleTileDevice = 3u; +constexpr uint32_t handleComponentCountForNoSubDevices = 2u; +constexpr uint32_t invalidMaxTemperature = 125; +constexpr uint32_t invalidMinTemperature = 10; +const std::string sampleGuid1 = "0xb15a0edc"; +const std::string sampleGuid2 = "0x490e01"; + +class SysmanMultiDeviceTemperatureFixture : public SysmanMultiDeviceFixture { + protected: + std::unique_ptr pPublicLinuxTemperatureImp; + std::unique_ptr pFsAccess; + L0::Sysman::FsAccess *pFsAccessOriginal = nullptr; + std::map mapOriginal; + L0::Sysman::SysmanDevice *device = nullptr; + void SetUp() override { + SysmanMultiDeviceFixture::SetUp(); + device = pSysmanDevice; + pSysmanDeviceImp->pTempHandleContext->handleList.clear(); + pFsAccess = std::make_unique(); + pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; + pLinuxSysmanImp->pFsAccess = pFsAccess.get(); + + mapOriginal = pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject; + pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject.clear(); + + auto subDeviceCount = pLinuxSysmanImp->getSubDeviceCount(); + uint32_t subdeviceId = 0; + + do { + ze_bool_t onSubdevice = (subDeviceCount == 0) ? false : true; + auto pPmt = new MockTemperaturePmt(pFsAccess.get(), onSubdevice, subdeviceId); + pPmt->mockedInit(pFsAccess.get()); + auto keyOffsetMapEntry = L0::Sysman::guidToKeyOffsetMap.find(sampleGuid1); + pPmt->keyOffsetMap = keyOffsetMapEntry->second; + pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject.emplace(subdeviceId, pPmt); + } while (++subdeviceId < subDeviceCount); + getTempHandles(0); + } + void TearDown() override { + for (const auto &pmtMapElement : pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject) { + delete pmtMapElement.second; + } + pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject = mapOriginal; + SysmanMultiDeviceFixture::TearDown(); + } + + std::vector getTempHandles(uint32_t count) { + std::vector handles(count, nullptr); + EXPECT_EQ(zesDeviceEnumTemperatureSensors(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); + return handles; + } +}; + +TEST_F(SysmanMultiDeviceTemperatureFixture, GivenComponentCountZeroWhenCallingZetSysmanTemperatureGetThenZeroCountIsReturnedAndVerifySysmanTemperatureGetCallSucceeds) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumTemperatureSensors(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, handleComponentCountForTwoTileDevices); + + uint32_t testcount = count + 1; + result = zesDeviceEnumTemperatureSensors(device->toHandle(), &testcount, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(testcount, handleComponentCountForTwoTileDevices); + + count = 0; + std::vector handles(count, nullptr); + EXPECT_EQ(zesDeviceEnumTemperatureSensors(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); + EXPECT_EQ(count, handleComponentCountForTwoTileDevices); +} + +HWTEST2_F(SysmanMultiDeviceTemperatureFixture, GivenValidTempHandleWhenGettingTemperatureThenValidTemperatureReadingsRetrieved, IsPVC) { + auto handles = getTempHandles(handleComponentCountForTwoTileDevices); + for (auto handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + if (properties.type == ZES_TEMP_SENSORS_GLOBAL) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(tileMaxTemperature)); + } + if (properties.type == ZES_TEMP_SENSORS_GPU) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(gtMaxTemperature)); + } + if (properties.type == ZES_TEMP_SENSORS_MEMORY) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(std::max({memory0MaxTemperature, memory1MaxTemperature, memory2MaxTemperature, memory3MaxTemperature}))); + } + } +} + +TEST_F(SysmanMultiDeviceTemperatureFixture, GivenValidTempHandleWhenGettingTemperatureConfigThenUnsupportedIsReturned) { + auto handles = getTempHandles(handleComponentCountForTwoTileDevices); + for (auto handle : handles) { + zes_temp_config_t config = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesTemperatureGetConfig(handle, &config)); + } +} + +TEST_F(SysmanMultiDeviceTemperatureFixture, GivenValidTempHandleWhenSettingTemperatureConfigThenUnsupportedIsReturned) { + auto handles = getTempHandles(handleComponentCountForTwoTileDevices); + for (auto handle : handles) { + zes_temp_config_t config = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesTemperatureSetConfig(handle, &config)); + } +} + +TEST_F(SysmanMultiDeviceTemperatureFixture, GivenCreatePmtObjectsWhenRootTileIndexEnumeratesSuccessfulThenValidatePmtObjectsReceivedAndBranches) { + std::map mapOfSubDeviceIdToPmtObject; + L0::Sysman::PlatformMonitoringTech::create(pLinuxSysmanImp, gpuUpstreamPortPathInTemperature, mapOfSubDeviceIdToPmtObject); + uint32_t subdeviceId = 0; + for (auto &subDeviceIdToPmtEntry : mapOfSubDeviceIdToPmtObject) { + EXPECT_NE(subDeviceIdToPmtEntry.second, nullptr); + EXPECT_EQ(subDeviceIdToPmtEntry.first, subdeviceId); + subdeviceId++; + delete subDeviceIdToPmtEntry.second; // delete memory to avoid mem leak here, as we finished our test validation just above. + } +} + +TEST_F(SysmanMultiDeviceTemperatureFixture, GivenValidTempHandleAndPmtReadValueFailsWhenGettingTemperatureThenFailureReturned) { + auto handles = getTempHandles(handleComponentCountForTwoTileDevices); + + auto subDeviceCount = pLinuxSysmanImp->getSubDeviceCount(); + uint32_t subdeviceId = 0; + + for (subdeviceId = 0; subdeviceId < subDeviceCount; subdeviceId++) { + auto pPmt = static_cast(pLinuxSysmanImp->getPlatformMonitoringTechAccess(subdeviceId)); + pPmt->mockReadValueResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + for (auto &handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + ASSERT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesTemperatureGetState(handle, &temperature)); + } +} + +class SysmanDeviceTemperatureFixture : public SysmanDeviceFixture { + protected: + std::unique_ptr pPublicLinuxTemperatureImp; + std::unique_ptr pFsAccess; + L0::Sysman::FsAccess *pFsAccessOriginal = nullptr; + std::map pmtMapOriginal; + L0::Sysman::SysmanDevice *device = nullptr; + void SetUp() override { + SysmanDeviceFixture::SetUp(); + device = pSysmanDevice; + pFsAccess = std::make_unique(); + pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; + pLinuxSysmanImp->pFsAccess = pFsAccess.get(); + + auto subDeviceCount = pLinuxSysmanImp->getSubDeviceCount(); + uint32_t subdeviceId = 0; + + pmtMapOriginal = pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject; + pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject.clear(); + do { + ze_bool_t onSubdevice = (subDeviceCount == 0) ? false : true; + auto pPmt = new MockTemperaturePmt(pFsAccess.get(), onSubdevice, subdeviceId); + pPmt->mockedInit(pFsAccess.get()); + auto keyOffsetMapEntry = L0::Sysman::guidToKeyOffsetMap.find(sampleGuid2); + pPmt->keyOffsetMap = keyOffsetMapEntry->second; + pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject.emplace(subdeviceId, pPmt); + } while (++subdeviceId < subDeviceCount); + getTempHandles(0); + } + void TearDown() override { + pLinuxSysmanImp->releasePmtObject(); + pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject = pmtMapOriginal; + pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + SysmanDeviceFixture::TearDown(); + } + + std::vector getTempHandles(uint32_t count) { + std::vector handles(count, nullptr); + EXPECT_EQ(zesDeviceEnumTemperatureSensors(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); + return handles; + } +}; + +HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleWhenGettingGPUAndGlobalTemperatureThenValidTemperatureReadingsRetrieved, IsDG1) { + auto handles = getTempHandles(handleComponentCountForNoSubDevices); + for (auto &handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + if (properties.type == ZES_TEMP_SENSORS_GLOBAL) { + uint8_t maxTemp = 0; + for (uint64_t i = 0; i < sizeof(tempArrForNoSubDevices) / sizeof(uint8_t); i++) { + if ((tempArrForNoSubDevices[i] > invalidMaxTemperature) || + (tempArrForNoSubDevices[i] < invalidMinTemperature) || (maxTemp > tempArrForNoSubDevices[i])) { + continue; + } + maxTemp = tempArrForNoSubDevices[i]; + } + EXPECT_EQ(temperature, static_cast(maxTemp)); + } + if (properties.type == ZES_TEMP_SENSORS_GPU) { + EXPECT_EQ(temperature, static_cast(tempArrForNoSubDevices[computeIndexForNoSubDevices])); + } + } +} + +HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleAndReadCoreTemperatureFailsWhenGettingGpuAndGlobalTempThenValidGpuTempAndFailureForGlobalTempAreReturned, IsDG1) { + auto handles = getTempHandles(handleComponentCountForNoSubDevices); + uint32_t subdeviceId = 0; + + auto pPmt = static_cast(pLinuxSysmanImp->getPlatformMonitoringTechAccess(subdeviceId)); + pPmt->mockReadCoreTempResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + + for (auto &handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + if (properties.type == ZES_TEMP_SENSORS_GLOBAL) { + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesTemperatureGetState(handle, &temperature)); + } + if (properties.type == ZES_TEMP_SENSORS_GPU) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(tempArrForNoSubDevices[computeIndexForNoSubDevices])); + } + } +} + +HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleAndReadComputeTemperatureFailsWhenGettingGPUAndGlobalTemperatureThenFailureReturned, IsDG1) { + auto handles = getTempHandles(handleComponentCountForNoSubDevices); + uint32_t subdeviceId = 0; + + auto pPmt = static_cast(pLinuxSysmanImp->getPlatformMonitoringTechAccess(subdeviceId)); + pPmt->mockReadComputeTempResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + + for (auto &handle : handles) { + double temperature; + ASSERT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesTemperatureGetState(handle, &temperature)); + } +} + +HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleWhenGettingGPUAndGlobalTemperatureThenValidTemperatureReadingsRetrieved, IsDG2) { + auto handles = getTempHandles(handleComponentCountForNoSubDevices); + for (auto &handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + if (properties.type == ZES_TEMP_SENSORS_GLOBAL) { + uint8_t maxTemp = 0; + // For DG2, Global Max temperature will be Maximum of SOC_TEMPERATURES + for (uint64_t i = 0; i < sizeof(uint64_t); i++) { + if ((tempArrForNoSubDevices[i] > invalidMaxTemperature) || + (tempArrForNoSubDevices[i] < invalidMinTemperature) || (maxTemp > tempArrForNoSubDevices[i])) { + continue; + } + maxTemp = tempArrForNoSubDevices[i]; + } + EXPECT_EQ(temperature, static_cast(maxTemp)); + } + if (properties.type == ZES_TEMP_SENSORS_GPU) { + EXPECT_EQ(temperature, static_cast(tempArrForNoSubDevices[gtTempIndexForNoSubDevices])); + } + } +} + +TEST_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleAndPmtReadValueFailsWhenGettingTemperatureThenFailureReturned) { + auto handles = getTempHandles(handleComponentCountForNoSubDevices); + uint32_t subdeviceId = 0; + + auto pPmt = static_cast(pLinuxSysmanImp->getPlatformMonitoringTechAccess(subdeviceId)); + pPmt->mockReadValueResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + + for (auto &handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + ASSERT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesTemperatureGetState(handle, &temperature)); + } +} + +TEST_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleWhenGettingUnsupportedSensorsTemperatureThenUnsupportedReturned) { + auto subDeviceCount = pLinuxSysmanImp->getSubDeviceCount(); + ze_bool_t onSubdevice = (subDeviceCount == 0) ? false : true; + uint32_t subdeviceId = 0; + + auto pPublicLinuxTemperatureImp = std::make_unique(pOsSysman, onSubdevice, subdeviceId); + pPublicLinuxTemperatureImp->setSensorType(ZES_TEMP_SENSORS_MEMORY_MIN); + double temperature; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, pPublicLinuxTemperatureImp->getSensorTemperature(&temperature)); +} + +TEST_F(SysmanDeviceTemperatureFixture, GivenValidateEnumerateRootTelemIndexWhengetRealPathFailsThenFailureReturned) { + pFsAccess->mockErrorListDirectory = ZE_RESULT_ERROR_NOT_AVAILABLE; + EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, + L0::Sysman::PlatformMonitoringTech::enumerateRootTelemIndex(pFsAccess.get(), gpuUpstreamPortPathInTemperature)); + + std::map mapOfSubDeviceIdToPmtObject; + L0::Sysman::PlatformMonitoringTech::create(pLinuxSysmanImp, gpuUpstreamPortPathInTemperature, mapOfSubDeviceIdToPmtObject); + EXPECT_TRUE(mapOfSubDeviceIdToPmtObject.empty()); +} + +TEST_F(SysmanDeviceTemperatureFixture, GivenValidatePmtReadValueWhenkeyOffsetMapIsNotThereThenFailureReturned) { + auto pPmt = std::make_unique(pFsAccess.get(), 0, 0); + pPmt->mockedInit(pFsAccess.get()); + // Get keyOffsetMap + auto keyOffsetMapEntry = L0::Sysman::guidToKeyOffsetMap.find(sampleGuid2); + pPmt->keyOffsetMap = keyOffsetMapEntry->second; + uint32_t val = 0; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, pPmt->readValue("SOMETHING", val)); +} + +TEST_F(SysmanDeviceTemperatureFixture, GivenCreatePmtObjectsWhenRootTileIndexEnumeratesSuccessfulThenValidatePmtObjectsReceivedAndBranches) { + std::map mapOfSubDeviceIdToPmtObject; + L0::Sysman::PlatformMonitoringTech::create(pLinuxSysmanImp, gpuUpstreamPortPathInTemperature, mapOfSubDeviceIdToPmtObject); + for (auto &subDeviceIdToPmtEntry : mapOfSubDeviceIdToPmtObject) { + EXPECT_NE(subDeviceIdToPmtEntry.second, nullptr); + EXPECT_EQ(subDeviceIdToPmtEntry.first, 0u); + delete subDeviceIdToPmtEntry.second; + } +} + +HWTEST2_F(SysmanDeviceTemperatureFixture, GivenComponentCountZeroWhenCallingZetSysmanTemperatureGetThenZeroCountIsReturnedAndVerifySysmanTemperatureGetCallSucceeds, IsPVC) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumTemperatureSensors(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, handleComponentCountForSingleTileDevice); + + uint32_t testcount = count + 1; + result = zesDeviceEnumTemperatureSensors(device->toHandle(), &testcount, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(testcount, handleComponentCountForSingleTileDevice); + + count = 0; + std::vector handles(count, nullptr); + EXPECT_EQ(zesDeviceEnumTemperatureSensors(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); + EXPECT_EQ(count, handleComponentCountForSingleTileDevice); +} + +HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleWhenGettingTemperatureThenValidTemperatureReadingsRetrieved, IsPVC) { + auto handles = getTempHandles(handleComponentCountForSingleTileDevice); + for (auto handle : handles) { + zes_temp_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetProperties(handle, &properties)); + double temperature; + if (properties.type == ZES_TEMP_SENSORS_GLOBAL) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(tileMaxTemperature)); + } + if (properties.type == ZES_TEMP_SENSORS_GPU) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(gtMaxTemperature)); + } + if (properties.type == ZES_TEMP_SENSORS_MEMORY) { + ASSERT_EQ(ZE_RESULT_SUCCESS, zesTemperatureGetState(handle, &temperature)); + EXPECT_EQ(temperature, static_cast(std::max({memory0MaxTemperature, memory1MaxTemperature, memory2MaxTemperature, memory3MaxTemperature}))); + } + } +} + +} // namespace ult +} // namespace L0 diff --git a/level_zero/tools/source/sysman/temperature/temperature.h b/level_zero/tools/source/sysman/temperature/temperature.h index f8111b50f5..14ab66b309 100644 --- a/level_zero/tools/source/sysman/temperature/temperature.h +++ b/level_zero/tools/source/sysman/temperature/temperature.h @@ -6,15 +6,12 @@ */ #pragma once +#include "level_zero/api/sysman/zes_handles_struct.h" #include #include #include -struct _zes_temp_handle_t { - virtual ~_zes_temp_handle_t() = default; -}; - namespace L0 { struct OsSysman;