From 16725e2438a8daf4faf7d7c676cb2d50b2cba662 Mon Sep 17 00:00:00 2001 From: Bellekallu Rajkiran Date: Thu, 9 Nov 2023 13:20:25 +0000 Subject: [PATCH] refactor: Merge Ras prelim files with non-prelim files Related-To: NEO-9469 Signed-off-by: Bellekallu Rajkiran --- .../source/api/ras/linux/CMakeLists.txt | 20 +- .../api/ras/linux/sysman_os_ras_imp.cpp | 65 +- .../source/api/ras/linux/sysman_os_ras_imp.h | 73 ++ .../api/ras/linux/sysman_os_ras_imp_gt.cpp | 2 +- .../api/ras/linux/sysman_os_ras_imp_hbm.cpp | 2 +- .../ras/linux/sysman_os_ras_imp_prelim.cpp | 107 --- .../api/ras/linux/sysman_os_ras_imp_prelim.h | 114 --- .../sources/ras/linux/CMakeLists.txt | 18 +- .../sources/ras/linux/mock_fs_ras.h | 30 - ...mock_fs_ras_prelim.h => mock_sysman_ras.h} | 2 +- .../sources/ras/linux/test_zes_ras.cpp | 631 +++++++++++++++-- .../sources/ras/linux/test_zes_ras_prelim.cpp | 653 ------------------ 12 files changed, 695 insertions(+), 1022 deletions(-) delete mode 100644 level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.cpp delete mode 100644 level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h delete mode 100644 level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras.h rename level_zero/sysman/test/unit_tests/sources/ras/linux/{mock_fs_ras_prelim.h => mock_sysman_ras.h} (99%) delete mode 100644 level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_prelim.cpp diff --git a/level_zero/sysman/source/api/ras/linux/CMakeLists.txt b/level_zero/sysman/source/api/ras/linux/CMakeLists.txt index 520d68e83d..d9d012b337 100644 --- a/level_zero/sysman/source/api/ras/linux/CMakeLists.txt +++ b/level_zero/sysman/source/api/ras/linux/CMakeLists.txt @@ -8,21 +8,9 @@ if(UNIX) target_sources(${L0_STATIC_LIB_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp.h + ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp_gt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp_hbm.cpp ) - - if(NEO_ENABLE_i915_PRELIM_DETECTION) - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp_prelim.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp_prelim.h - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp_gt.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp_hbm.cpp - ) - else() - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_ras_imp.h - ) - endif() endif() diff --git a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.cpp b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.cpp index 98fe730c32..da14d67e77 100644 --- a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.cpp +++ b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.cpp @@ -8,35 +8,46 @@ #include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h" #include "shared/source/debug_settings/debug_settings_manager.h" +#include "shared/source/helpers/string.h" +#include "shared/source/os_interface/linux/system_info.h" #include "level_zero/sysman/source/shared/linux/zes_os_sysman_imp.h" -#include +#include "drm/intel_hwconfig_types.h" namespace L0 { namespace Sysman { -LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { - pLinuxSysmanImp = static_cast(pOsSysman); - pFsAccess = &pLinuxSysmanImp->getFsAccess(); +static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) { + uint32_t memType = pLinuxSysmanImp->getMemoryType(); + if (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2) { + return true; + } + return false; } -void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {} +void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) { -ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { - return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + constexpr auto maxErrorTypes = 2; + LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId); + if (errorType.size() < maxErrorTypes) { + auto pLinuxSysmanImp = static_cast(pOsSysman); + if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { + LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId); + } + } } ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) { config->totalThreshold = totalThreshold; - memcpy(config->detailedThresholds.category, categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t)); + memcpy_s(config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t), categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t)); return ZE_RESULT_SUCCESS; } ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) { if (pFsAccess->isRootUser() == true) { totalThreshold = config->totalThreshold; - memcpy(categoryThreshold, config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); + memcpy_s(categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t), config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); return ZE_RESULT_SUCCESS; } NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); @@ -51,6 +62,42 @@ ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) { return ZE_RESULT_SUCCESS; } +ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { + if (clear == true) { + if (pFsAccess->isRootUser() == false) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); + return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; + } + } + + ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + for (auto &rasSource : rasSources) { + zes_ras_state_t localState = {}; + ze_result_t localResult = rasSource->osRasGetState(localState, clear); + if (localResult != ZE_RESULT_SUCCESS) { + continue; + } + for (uint32_t i = 0; i < maxRasErrorCategoryCount; i++) { + state.category[i] += localState.category[i]; + } + result = ZE_RESULT_SUCCESS; + } + return result; +} + +void LinuxRasImp::initSources() { + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId)); + if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); + } +} + +LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { + pLinuxSysmanImp = static_cast(pOsSysman); + pFsAccess = &pLinuxSysmanImp->getFsAccess(); + initSources(); +} + OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) { LinuxRasImp *pLinuxRasImp = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId); return static_cast(pLinuxRasImp); diff --git a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h index 668a59052c..6b657b9ff9 100644 --- a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h +++ b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h @@ -9,12 +9,28 @@ #include "shared/source/helpers/non_copyable_or_moveable.h" #include "level_zero/sysman/source/api/ras/sysman_os_ras.h" +#include "level_zero/sysman/source/device/sysman_device_imp.h" +#include "level_zero/sysman/source/shared/linux/pmu/sysman_pmu_imp.h" #include "level_zero/sysman/source/shared/linux/sysman_fs_access.h" #include "level_zero/sysman/source/sysman_const.h" +#include +#include +#include +#include + namespace L0 { namespace Sysman { + class LinuxSysmanImp; +class FirmwareUtil; + +class LinuxRasSources : NEO::NonCopyableOrMovableClass { + public: + virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0; + virtual ~LinuxRasSources() = default; +}; + class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { public: ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override; @@ -29,13 +45,70 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { zes_ras_error_type_t osRasErrorType = {}; FsAccess *pFsAccess = nullptr; LinuxSysmanImp *pLinuxSysmanImp = nullptr; + std::vector> rasSources = {}; private: + void initSources(); bool isSubdevice = false; uint32_t subdeviceId = 0; uint64_t totalThreshold = 0; uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0}; }; +class LinuxRasSourceGt : public LinuxRasSources { + public: + ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId); + LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); + LinuxRasSourceGt() = default; + ~LinuxRasSourceGt() override; + + protected: + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + zes_ras_error_type_t osRasErrorType = {}; + PmuInterface *pPmuInterface = nullptr; + FsAccess *pFsAccess = nullptr; + SysfsAccess *pSysfsAccess = nullptr; + + private: + void initRasErrors(ze_bool_t clear); + ze_result_t getPmuConfig( + const std::string &eventDirectory, + const std::vector &listOfEvents, + const std::string &errorFileToGetConfig, + std::string &pmuConfig); + ze_result_t getBootUpErrorCountFromSysfs( + std::string nameOfError, + const std::string &errorCounterDir, + uint64_t &errorVal); + void closeFds(); + int64_t groupFd = -1; + std::vector memberFds = {}; + uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0}; + std::map errorCategoryToEventCount; + uint64_t totalEventCount = 0; + bool isSubdevice = false; + uint32_t subdeviceId = 0; +}; + +class LinuxRasSourceHbm : public LinuxRasSources { + public: + ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId); + LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId); + LinuxRasSourceHbm() = default; + ~LinuxRasSourceHbm() override{}; + + protected: + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + zes_ras_error_type_t osRasErrorType = {}; + FirmwareUtil *pFwInterface = nullptr; + SysmanDeviceImp *pDevice = nullptr; + + private: + uint64_t errorBaseline = 0; + uint32_t subdeviceId = 0; +}; + } // namespace Sysman } // namespace L0 diff --git a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_gt.cpp b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_gt.cpp index d7899bfb46..e643e2cbe7 100644 --- a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_gt.cpp +++ b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_gt.cpp @@ -7,7 +7,7 @@ #include "shared/source/debug_settings/debug_settings_manager.h" -#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h" +#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h" #include "level_zero/sysman/source/shared/linux/zes_os_sysman_imp.h" #include diff --git a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_hbm.cpp b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_hbm.cpp index 8faa55a11a..d3f4be019f 100644 --- a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_hbm.cpp +++ b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_hbm.cpp @@ -8,7 +8,7 @@ #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/helpers/gfx_core_helper.h" -#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h" +#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h" #include "level_zero/sysman/source/shared/firmware_util/sysman_firmware_util.h" #include "level_zero/sysman/source/shared/linux/zes_os_sysman_imp.h" diff --git a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.cpp b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.cpp deleted file mode 100644 index d756bc2910..0000000000 --- a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h" - -#include "shared/source/debug_settings/debug_settings_manager.h" -#include "shared/source/helpers/string.h" -#include "shared/source/os_interface/linux/system_info.h" - -#include "level_zero/sysman/source/shared/linux/zes_os_sysman_imp.h" - -#include "drm/intel_hwconfig_types.h" - -namespace L0 { -namespace Sysman { - -static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) { - uint32_t memType = pLinuxSysmanImp->getMemoryType(); - if (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2) { - return true; - } - return false; -} - -void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) { - - constexpr auto maxErrorTypes = 2; - LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId); - if (errorType.size() < maxErrorTypes) { - auto pLinuxSysmanImp = static_cast(pOsSysman); - if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { - LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId); - } - } -} - -ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) { - config->totalThreshold = totalThreshold; - memcpy_s(config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t), categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t)); - return ZE_RESULT_SUCCESS; -} - -ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) { - if (pFsAccess->isRootUser() == true) { - totalThreshold = config->totalThreshold; - memcpy_s(categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t), config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); - return ZE_RESULT_SUCCESS; - } - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; -} - -ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) { - properties.pNext = nullptr; - properties.type = osRasErrorType; - properties.onSubdevice = isSubdevice; - properties.subdeviceId = subdeviceId; - return ZE_RESULT_SUCCESS; -} - -ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { - if (clear == true) { - if (pFsAccess->isRootUser() == false) { - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; - } - } - - ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; - for (auto &rasSource : rasSources) { - zes_ras_state_t localState = {}; - ze_result_t localResult = rasSource->osRasGetState(localState, clear); - if (localResult != ZE_RESULT_SUCCESS) { - continue; - } - for (uint32_t i = 0; i < maxRasErrorCategoryCount; i++) { - state.category[i] += localState.category[i]; - } - result = ZE_RESULT_SUCCESS; - } - return result; -} - -void LinuxRasImp::initSources() { - rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId)); - if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { - rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); - } -} - -LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { - pLinuxSysmanImp = static_cast(pOsSysman); - pFsAccess = &pLinuxSysmanImp->getFsAccess(); - initSources(); -} - -OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) { - LinuxRasImp *pLinuxRasImp = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId); - return static_cast(pLinuxRasImp); -} - -} // namespace Sysman -} // namespace L0 diff --git a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h b/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h deleted file mode 100644 index 6b657b9ff9..0000000000 --- a/level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (C) 2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#pragma once -#include "shared/source/helpers/non_copyable_or_moveable.h" - -#include "level_zero/sysman/source/api/ras/sysman_os_ras.h" -#include "level_zero/sysman/source/device/sysman_device_imp.h" -#include "level_zero/sysman/source/shared/linux/pmu/sysman_pmu_imp.h" -#include "level_zero/sysman/source/shared/linux/sysman_fs_access.h" -#include "level_zero/sysman/source/sysman_const.h" - -#include -#include -#include -#include - -namespace L0 { -namespace Sysman { - -class LinuxSysmanImp; -class FirmwareUtil; - -class LinuxRasSources : NEO::NonCopyableOrMovableClass { - public: - virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0; - virtual ~LinuxRasSources() = default; -}; - -class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { - public: - ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override; - ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; - ze_result_t osRasGetConfig(zes_ras_config_t *config) override; - ze_result_t osRasSetConfig(const zes_ras_config_t *config) override; - LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); - LinuxRasImp() = default; - ~LinuxRasImp() override = default; - - protected: - zes_ras_error_type_t osRasErrorType = {}; - FsAccess *pFsAccess = nullptr; - LinuxSysmanImp *pLinuxSysmanImp = nullptr; - std::vector> rasSources = {}; - - private: - void initSources(); - bool isSubdevice = false; - uint32_t subdeviceId = 0; - uint64_t totalThreshold = 0; - uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0}; -}; - -class LinuxRasSourceGt : public LinuxRasSources { - public: - ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; - static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId); - LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); - LinuxRasSourceGt() = default; - ~LinuxRasSourceGt() override; - - protected: - LinuxSysmanImp *pLinuxSysmanImp = nullptr; - zes_ras_error_type_t osRasErrorType = {}; - PmuInterface *pPmuInterface = nullptr; - FsAccess *pFsAccess = nullptr; - SysfsAccess *pSysfsAccess = nullptr; - - private: - void initRasErrors(ze_bool_t clear); - ze_result_t getPmuConfig( - const std::string &eventDirectory, - const std::vector &listOfEvents, - const std::string &errorFileToGetConfig, - std::string &pmuConfig); - ze_result_t getBootUpErrorCountFromSysfs( - std::string nameOfError, - const std::string &errorCounterDir, - uint64_t &errorVal); - void closeFds(); - int64_t groupFd = -1; - std::vector memberFds = {}; - uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0}; - std::map errorCategoryToEventCount; - uint64_t totalEventCount = 0; - bool isSubdevice = false; - uint32_t subdeviceId = 0; -}; - -class LinuxRasSourceHbm : public LinuxRasSources { - public: - ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; - static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId); - LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId); - LinuxRasSourceHbm() = default; - ~LinuxRasSourceHbm() override{}; - - protected: - LinuxSysmanImp *pLinuxSysmanImp = nullptr; - zes_ras_error_type_t osRasErrorType = {}; - FirmwareUtil *pFwInterface = nullptr; - SysmanDeviceImp *pDevice = nullptr; - - private: - uint64_t errorBaseline = 0; - uint32_t subdeviceId = 0; -}; - -} // namespace Sysman -} // namespace L0 diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/CMakeLists.txt b/level_zero/sysman/test/unit_tests/sources/ras/linux/CMakeLists.txt index 9b5056600d..bb78057a9d 100644 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/CMakeLists.txt +++ b/level_zero/sysman/test/unit_tests/sources/ras/linux/CMakeLists.txt @@ -4,26 +4,16 @@ # SPDX-License-Identifier: MIT # -set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX +set(L0_TESTS_SYSMAN_RAS_LINUX ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mock_sysman_ras.h ) -if(NEO_ENABLE_i915_PRELIM_DETECTION) - list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_prelim.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_prelim.h - ) -else() - list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h - ) -endif() - if(UNIX) target_sources(${TARGET_NAME} PRIVATE - ${L0_TESTS_TOOLS_SYSMAN_RAS_LINUX} + ${L0_TESTS_SYSMAN_RAS_LINUX} ) endif() diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras.h b/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras.h deleted file mode 100644 index 5172ea56ee..0000000000 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#pragma once - -#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h" -#include "level_zero/sysman/source/api/ras/sysman_ras.h" -#include "level_zero/sysman/source/api/ras/sysman_ras_imp.h" -#include "level_zero/sysman/source/shared/linux/sysman_fs_access.h" - -namespace L0 { -namespace Sysman { -namespace ult { - -class MockRasFsAccess : public L0::Sysman::FsAccess { - public: - bool mockRootUser = true; - bool isRootUser() override { - return mockRootUser; - } - MockRasFsAccess() = default; -}; - -} // namespace ult -} // namespace Sysman -} // namespace L0 diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras_prelim.h b/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h similarity index 99% rename from level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras_prelim.h rename to level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h index 8e45925330..eab0ef1b44 100644 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras_prelim.h +++ b/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h @@ -10,7 +10,7 @@ #include "shared/source/os_interface/linux/ioctl_helper.h" #include "shared/source/os_interface/linux/system_info.h" -#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp_prelim.h" +#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h" #include "level_zero/sysman/source/api/ras/sysman_ras.h" #include "level_zero/sysman/source/api/ras/sysman_ras_imp.h" #include "level_zero/sysman/source/shared/linux/pmu/sysman_pmu_imp.h" diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp index 084ac4c790..5e9589c4ea 100644 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp +++ b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp @@ -5,36 +5,70 @@ * */ +#include "shared/test/common/libult/linux/drm_mock.h" + #include "level_zero/sysman/source/sysman_const.h" #include "level_zero/sysman/test/unit_tests/sources/linux/mock_sysman_fixture.h" +#include "level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h" -#include "mock_fs_ras.h" +class OsRas; namespace L0 { namespace Sysman { namespace ult { -constexpr uint32_t mockHandleCount = 0; +constexpr uint32_t mockHandleCount = 2u; +constexpr uint32_t mockHandleCountForSubDevice = 4u; struct SysmanRasFixture : public SysmanDeviceFixture { protected: std::unique_ptr pFsAccess; + std::unique_ptr pSysfsAccess; + std::unique_ptr pPmuInterface; + std::unique_ptr pRasFwUtilInterface; + MockRasNeoDrm *pDrm = nullptr; L0::Sysman::FsAccess *pFsAccessOriginal = nullptr; + L0::Sysman::SysfsAccess *pSysfsAccessOriginal = nullptr; + L0::Sysman::PmuInterface *pOriginalPmuInterface = nullptr; + L0::Sysman::FirmwareUtil *pFwUtilOriginal = nullptr; L0::Sysman::SysmanDevice *device = nullptr; + void SetUp() override { SysmanDeviceFixture::SetUp(); - pFsAccess = std::make_unique(); + pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; + pFsAccess = std::make_unique(); pLinuxSysmanImp->pFsAccess = pFsAccess.get(); - pFsAccess->mockRootUser = true; - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); + + pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; + pSysfsAccess = std::make_unique(); + pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); + + pRasFwUtilInterface = std::make_unique(); + + pDrm = new MockRasNeoDrm(const_cast(pSysmanDeviceImp->getRootDeviceEnvironment())); + pDrm->setupIoctlHelper(pSysmanDeviceImp->getRootDeviceEnvironment().getHardwareInfo()->platform.eProductFamily); + + pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; + pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); + + pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; + pPmuInterface = std::make_unique(pLinuxSysmanImp); + pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); + + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + auto &osInterface = pSysmanDeviceImp->getRootDeviceEnvironment().osInterface; + osInterface->setDriverModel(std::unique_ptr(pDrm)); + + pSysmanDeviceImp->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.isIntegratedDevice = false; device = pSysmanDevice; } void TearDown() override { pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; + pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; + pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; SysmanDeviceFixture::TearDown(); } - std::vector getRasHandles(uint32_t count) { std::vector handles(count, nullptr); EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); @@ -42,16 +76,7 @@ struct SysmanRasFixture : public SysmanDeviceFixture { } }; -TEST_F(SysmanRasFixture, GivenValidRasContextWhenRetrievingRasHandlesThenSuccessIsReturned) { - uint32_t count = 0; - L0::Sysman::RasHandleContext *pRasHandleContext = new L0::Sysman::RasHandleContext(pSysmanDeviceImp->pOsSysman); - ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - delete pRasHandleContext; -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRasErrorSetsThenCorrectCountIsReported) { +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) { uint32_t count = 0; ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); EXPECT_EQ(ZE_RESULT_SUCCESS, result); @@ -61,37 +86,15 @@ TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRasErrorSetsThenCorrectCountI result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(testcount, mockHandleCount); - - count = 0; - std::vector handles(count, nullptr); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); - EXPECT_EQ(count, mockHandleCount); - - bool isSubDevice = false; - uint32_t subDeviceId = 0u; - L0::Sysman::RasImp *pTestRasImp = new L0::Sysman::RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS); - EXPECT_EQ(count, mockHandleCount + 1); - - testcount = count; - - handles.resize(testcount); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, handles.data()), ZE_RESULT_SUCCESS); - EXPECT_EQ(testcount, mockHandleCount + 1); - EXPECT_NE(nullptr, handles.data()); - - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + EXPECT_NE(handle, nullptr); + } } TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) { - bool isSubDevice = false; - uint32_t subDeviceId = 0u; - L0::Sysman::RasImp *pTestRasImp = new L0::Sysman::RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - - auto handles = getRasHandles(mockHandleCount + 1); + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; for (auto handle : handles) { zes_ras_properties_t properties = {}; @@ -99,59 +102,329 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessI EXPECT_EQ(properties.pNext, nullptr); EXPECT_EQ(properties.onSubdevice, false); EXPECT_EQ(properties.subdeviceId, 0u); - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + if (correctable == true) { + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + correctable = false; + } else { + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); + } } - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; } -TEST_F(SysmanRasFixture, GivenValidRasHandleWhileCallingZesRasGetStateThenFailureIsReturned) { - bool isSubDevice = false; - uint32_t subDeviceId = 0u; - L0::Sysman::RasImp *pTestRasImp = new L0::Sysman::RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); +TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) { + std::set errorType = {}; - auto handles = getRasHandles(mockHandleCount + 1); + pSysfsAccess->mockReadSymLinkResult = true; + L0::Sysman::LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, false, 0); + EXPECT_EQ(errorType.size(), 0u); +} + +TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) { + std::set errorType = {}; + + pFsAccess->mockReadDirectoryFailure = true; + + L0::Sysman::LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, false, 0); + EXPECT_EQ(errorType.size(), 0u); +} + +TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) { + std::set errorType = {}; + pLinuxSysmanImp->pFwUtilInterface = nullptr; + + L0::Sysman::LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, false, 0); + EXPECT_EQ(errorType.size(), 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) { + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); + uint32_t testcount = count + 1; + result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(testcount, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) { + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4); + pRasFwUtilInterface->mockMemorySuccess = true; + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) { + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2); + pRasFwUtilInterface->mockMemorySuccess = true; + + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) { + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + pDrm->mockQuerySystemInfoReturnValue.push_back(true); + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadCorrectable = true; + pRasFwUtilInterface->mockMemorySuccess = false; + + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateForGtAfterClearThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadAfterClear = true; + + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + ze_bool_t clear = 0; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + } + } + correctable = true; + clear = 1; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForHbmThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + pRasFwUtilInterface->mockMemorySuccess = true; + + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForHbmWithClearThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + pRasFwUtilInterface->mockMemorySuccess = true; + + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + ze_bool_t clear = 0; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); + } + } + + correctable = true; + clear = 1; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) { + + pFsAccess->mockRootUser = true; + + auto handles = getRasHandles(mockHandleCount); + ze_bool_t clear = 1; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasGetState(handle, clear, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pFsAccess->mockReadFileFailure = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pPmuInterface->mockPerfEvent = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 1, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pFsAccess->mockReadVal = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) { + + pFsAccess->mockReadVal = true; + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); } - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; } TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) { - bool isSubDevice = false; - uint32_t subDeviceId = 0u; - L0::Sysman::RasImp *pTestRasImp = new L0::Sysman::RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - - auto handles = getRasHandles(mockHandleCount + 1); - + auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_config_t setConfig = {}; zes_ras_config_t getConfig = {}; setConfig.totalThreshold = 50; memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(handle, &setConfig)); EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(handle, &getConfig)); EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold); int compare = std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); EXPECT_EQ(0, compare); } - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; } TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) { - pFsAccess->mockRootUser = false; - bool isSubDevice = false; - uint32_t subDeviceId = 0u; - L0::Sysman::RasImp *pTestRasImp = new L0::Sysman::RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - auto handles = getRasHandles(mockHandleCount + 1); + pFsAccess->mockRootUser = true; + + auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_config_t setConfig = {}; @@ -159,14 +432,220 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPer memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(handle, &setConfig)); } - pSysmanDeviceImp->pRasHandleContext->releaseRasHandles(); } -TEST_F(SysmanRasFixture, GivenValidInstanceWhenOsRasImplementationIsNullThenDestructorIsCalledWithoutException) { +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { - L0::Sysman::RasImp *pTestRasImp = new L0::Sysman::RasImp(); - pTestRasImp->pOsRas = nullptr; - EXPECT_NO_THROW(delete pTestRasImp;); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) + pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); + + count = 0; + result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); +} + +struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { + protected: + std::unique_ptr pFsAccess; + std::unique_ptr pSysfsAccess; + std::unique_ptr pPmuInterface; + std::unique_ptr pRasFwUtilInterface; + MockRasNeoDrm *pDrm = nullptr; + L0::Sysman::FsAccess *pFsAccessOriginal = nullptr; + L0::Sysman::SysfsAccess *pSysfsAccessOriginal = nullptr; + L0::Sysman::PmuInterface *pOriginalPmuInterface = nullptr; + L0::Sysman::FirmwareUtil *pFwUtilOriginal = nullptr; + Drm *pOriginalDrm = nullptr; + L0::Sysman::SysmanDevice *device = nullptr; + + void SetUp() override { + SysmanMultiDeviceFixture::SetUp(); + pDrm = new MockRasNeoDrm(const_cast(pSysmanDeviceImp->getRootDeviceEnvironment())); + pDrm->setupIoctlHelper(pSysmanDeviceImp->getRootDeviceEnvironment().getHardwareInfo()->platform.eProductFamily); + + pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; + pFsAccess = std::make_unique(); + pLinuxSysmanImp->pFsAccess = pFsAccess.get(); + + pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; + pSysfsAccess = std::make_unique(); + pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); + + pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; + pRasFwUtilInterface = std::make_unique(); + pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); + + pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; + pPmuInterface = std::make_unique(pLinuxSysmanImp); + pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); + + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + auto &osInterface = pSysmanDeviceImp->getRootDeviceEnvironment().osInterface; + osInterface->setDriverModel(std::unique_ptr(pDrm)); + device = pSysmanDevice; + + pFsAccess->mockReadDirectoryForMultiDevice = true; + + pSysmanDeviceImp->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.isIntegratedDevice = false; + } + void TearDown() override { + pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; + pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; + pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; + SysmanMultiDeviceFixture::TearDown(); + } + std::vector getRasHandles(uint32_t count) { + std::vector handles(count, nullptr); + EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); + return handles; + } +}; +TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) { + L0::Sysman::RasHandleContext *pRasHandleContext = new L0::Sysman::RasHandleContext(pSysmanDeviceImp->pOsSysman); + uint32_t count = 0; + ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ((count > 0), true); + delete pRasHandleContext; +} +TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCountForSubDevice); + + uint32_t testcount = count + 1; + result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(testcount, mockHandleCountForSubDevice); + auto handles = getRasHandles(mockHandleCountForSubDevice); + for (auto handle : handles) { + EXPECT_NE(handle, nullptr); + } +} +TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) { + zes_ras_properties_t properties = {}; + bool isSubDevice = true; + uint32_t subDeviceId = 0u; + PublicLinuxRasImp *pLinuxRasImp = new PublicLinuxRasImp(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); + EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties)); + EXPECT_EQ(properties.subdeviceId, subDeviceId); + EXPECT_EQ(properties.onSubdevice, isSubDevice); + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + delete pLinuxRasImp; +} + +TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadTile = true; + pSysfsAccess->isMultiTileArch = true; + + auto handles = getRasHandles(mockHandleCountForSubDevice); + uint32_t handleIndex = 0u; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (handleIndex == 0u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctablel3Bank + initialCorrectableCacheErrorTile0); // No. of correctable error type for subdevice 0 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + } else if (handleIndex == 1u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0); // No. of uncorrectable error type for subdevice 0 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); + } else if (handleIndex == 2u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 1 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + } else if (handleIndex == 3u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1); // No. of uncorrectable error type for subdevice 1 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1); + } + handleIndex++; + } +} + +TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGetStateForHbmThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + pRasFwUtilInterface->mockMemorySuccess = true; + + auto handles = getRasHandles(mockHandleCountForSubDevice); + uint32_t handleIndex = 0u; + + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (handleIndex == 0u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 0 + } else if (handleIndex == 1u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 0 + } else if (handleIndex == 2u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 1 + } else if (handleIndex == 3u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 1 + } + handleIndex++; + } } } // namespace ult diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_prelim.cpp b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_prelim.cpp deleted file mode 100644 index 5d364800ba..0000000000 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_prelim.cpp +++ /dev/null @@ -1,653 +0,0 @@ -/* - * Copyright (C) 2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "shared/test/common/libult/linux/drm_mock.h" - -#include "level_zero/sysman/source/sysman_const.h" -#include "level_zero/sysman/test/unit_tests/sources/linux/mock_sysman_fixture.h" -#include "level_zero/sysman/test/unit_tests/sources/ras/linux/mock_fs_ras_prelim.h" - -class OsRas; - -namespace L0 { -namespace Sysman { -namespace ult { - -constexpr uint32_t mockHandleCount = 2u; -constexpr uint32_t mockHandleCountForSubDevice = 4u; -struct SysmanRasFixture : public SysmanDeviceFixture { - protected: - std::unique_ptr pFsAccess; - std::unique_ptr pSysfsAccess; - std::unique_ptr pPmuInterface; - std::unique_ptr pRasFwUtilInterface; - MockRasNeoDrm *pDrm = nullptr; - L0::Sysman::FsAccess *pFsAccessOriginal = nullptr; - L0::Sysman::SysfsAccess *pSysfsAccessOriginal = nullptr; - L0::Sysman::PmuInterface *pOriginalPmuInterface = nullptr; - L0::Sysman::FirmwareUtil *pFwUtilOriginal = nullptr; - L0::Sysman::SysmanDevice *device = nullptr; - - void SetUp() override { - SysmanDeviceFixture::SetUp(); - - pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; - pFsAccess = std::make_unique(); - pLinuxSysmanImp->pFsAccess = pFsAccess.get(); - - pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; - pSysfsAccess = std::make_unique(); - pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); - - pRasFwUtilInterface = std::make_unique(); - - pDrm = new MockRasNeoDrm(const_cast(pSysmanDeviceImp->getRootDeviceEnvironment())); - pDrm->setupIoctlHelper(pSysmanDeviceImp->getRootDeviceEnvironment().getHardwareInfo()->platform.eProductFamily); - - pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; - pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); - - pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; - pPmuInterface = std::make_unique(pLinuxSysmanImp); - pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); - - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); - auto &osInterface = pSysmanDeviceImp->getRootDeviceEnvironment().osInterface; - osInterface->setDriverModel(std::unique_ptr(pDrm)); - - pSysmanDeviceImp->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.isIntegratedDevice = false; - device = pSysmanDevice; - } - void TearDown() override { - pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; - pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; - pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; - pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; - SysmanDeviceFixture::TearDown(); - } - std::vector getRasHandles(uint32_t count) { - std::vector handles(count, nullptr); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); - return handles; - } -}; - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - - uint32_t testcount = count + 1; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(testcount, mockHandleCount); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - EXPECT_NE(handle, nullptr); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) { - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - - for (auto handle : handles) { - zes_ras_properties_t properties = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties)); - EXPECT_EQ(properties.pNext, nullptr); - EXPECT_EQ(properties.onSubdevice, false); - EXPECT_EQ(properties.subdeviceId, 0u); - if (correctable == true) { - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); - correctable = false; - } else { - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) { - std::set errorType = {}; - - pSysfsAccess->mockReadSymLinkResult = true; - - L0::Sysman::LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, false, 0); - EXPECT_EQ(errorType.size(), 0u); -} - -TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) { - std::set errorType = {}; - - pFsAccess->mockReadDirectoryFailure = true; - - L0::Sysman::LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, false, 0); - EXPECT_EQ(errorType.size(), 0u); -} - -TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) { - std::set errorType = {}; - pLinuxSysmanImp->pFwUtilInterface = nullptr; - - L0::Sysman::LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, false, 0); - EXPECT_EQ(errorType.size(), 0u); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) { - pFsAccess->mockReadDirectoryWithoutRasEvents = true; - - pLinuxSysmanImp->pFwUtilInterface = nullptr; - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 0u); - uint32_t testcount = count + 1; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(testcount, 0u); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) { - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4); - pRasFwUtilInterface->mockMemorySuccess = true; - pFsAccess->mockReadDirectoryWithoutRasEvents = true; - - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 0u); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) { - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2); - pRasFwUtilInterface->mockMemorySuccess = true; - - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) { - pFsAccess->mockReadDirectoryWithoutRasEvents = true; - pDrm->mockQuerySystemInfoReturnValue.push_back(true); - - pLinuxSysmanImp->pFwUtilInterface = nullptr; - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 0u); -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadCorrectable = true; - pRasFwUtilInterface->mockMemorySuccess = false; - - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateForGtAfterClearThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadAfterClear = true; - - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - ze_bool_t clear = 0; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - } - } - correctable = true; - clear = 1; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForHbmThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - pRasFwUtilInterface->mockMemorySuccess = true; - - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForHbmWithClearThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - pRasFwUtilInterface->mockMemorySuccess = true; - - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - ze_bool_t clear = 0; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); - } - } - - correctable = true; - clear = 1; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) { - - pFsAccess->mockRootUser = true; - - auto handles = getRasHandles(mockHandleCount); - ze_bool_t clear = 1; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasGetState(handle, clear, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pFsAccess->mockReadFileFailure = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pPmuInterface->mockPerfEvent = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 1, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pFsAccess->mockReadVal = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) { - - pFsAccess->mockReadVal = true; - - pLinuxSysmanImp->pFwUtilInterface = nullptr; - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) { - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_config_t setConfig = {}; - zes_ras_config_t getConfig = {}; - setConfig.totalThreshold = 50; - memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); - - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(handle, &setConfig)); - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(handle, &getConfig)); - EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold); - int compare = std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); - EXPECT_EQ(0, compare); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) { - - pFsAccess->mockRootUser = true; - - auto handles = getRasHandles(mockHandleCount); - - for (auto handle : handles) { - zes_ras_config_t setConfig = {}; - setConfig.totalThreshold = 50; - memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); - EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(handle, &setConfig)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(pOsSysman->getSubDeviceCount()); - - count = 0; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); -} - -struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { - protected: - std::unique_ptr pFsAccess; - std::unique_ptr pSysfsAccess; - std::unique_ptr pPmuInterface; - std::unique_ptr pRasFwUtilInterface; - MockRasNeoDrm *pDrm = nullptr; - L0::Sysman::FsAccess *pFsAccessOriginal = nullptr; - L0::Sysman::SysfsAccess *pSysfsAccessOriginal = nullptr; - L0::Sysman::PmuInterface *pOriginalPmuInterface = nullptr; - L0::Sysman::FirmwareUtil *pFwUtilOriginal = nullptr; - Drm *pOriginalDrm = nullptr; - L0::Sysman::SysmanDevice *device = nullptr; - - void SetUp() override { - SysmanMultiDeviceFixture::SetUp(); - pDrm = new MockRasNeoDrm(const_cast(pSysmanDeviceImp->getRootDeviceEnvironment())); - pDrm->setupIoctlHelper(pSysmanDeviceImp->getRootDeviceEnvironment().getHardwareInfo()->platform.eProductFamily); - - pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; - pFsAccess = std::make_unique(); - pLinuxSysmanImp->pFsAccess = pFsAccess.get(); - - pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; - pSysfsAccess = std::make_unique(); - pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); - - pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; - pRasFwUtilInterface = std::make_unique(); - pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); - - pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; - pPmuInterface = std::make_unique(pLinuxSysmanImp); - pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); - - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); - auto &osInterface = pSysmanDeviceImp->getRootDeviceEnvironment().osInterface; - osInterface->setDriverModel(std::unique_ptr(pDrm)); - device = pSysmanDevice; - - pFsAccess->mockReadDirectoryForMultiDevice = true; - - pSysmanDeviceImp->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.isIntegratedDevice = false; - } - void TearDown() override { - pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; - pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; - pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; - pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; - SysmanMultiDeviceFixture::TearDown(); - } - std::vector getRasHandles(uint32_t count) { - std::vector handles(count, nullptr); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); - return handles; - } -}; -TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) { - L0::Sysman::RasHandleContext *pRasHandleContext = new L0::Sysman::RasHandleContext(pSysmanDeviceImp->pOsSysman); - uint32_t count = 0; - ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ((count > 0), true); - delete pRasHandleContext; -} -TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCountForSubDevice); - - uint32_t testcount = count + 1; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(testcount, mockHandleCountForSubDevice); - auto handles = getRasHandles(mockHandleCountForSubDevice); - for (auto handle : handles) { - EXPECT_NE(handle, nullptr); - } -} -TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) { - zes_ras_properties_t properties = {}; - bool isSubDevice = true; - uint32_t subDeviceId = 0u; - PublicLinuxRasImp *pLinuxRasImp = new PublicLinuxRasImp(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, subDeviceId); - EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties)); - EXPECT_EQ(properties.subdeviceId, subDeviceId); - EXPECT_EQ(properties.onSubdevice, isSubDevice); - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); - delete pLinuxRasImp; -} - -TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadTile = true; - pSysfsAccess->isMultiTileArch = true; - - auto handles = getRasHandles(mockHandleCountForSubDevice); - uint32_t handleIndex = 0u; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (handleIndex == 0u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctablel3Bank + initialCorrectableCacheErrorTile0); // No. of correctable error type for subdevice 0 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - } else if (handleIndex == 1u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0); // No. of uncorrectable error type for subdevice 0 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); - } else if (handleIndex == 2u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 1 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - } else if (handleIndex == 3u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1); // No. of uncorrectable error type for subdevice 1 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1); - } - handleIndex++; - } -} - -TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGetStateForHbmThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - pRasFwUtilInterface->mockMemorySuccess = true; - - auto handles = getRasHandles(mockHandleCountForSubDevice); - uint32_t handleIndex = 0u; - - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (handleIndex == 0u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 0 - } else if (handleIndex == 1u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 0 - } else if (handleIndex == 2u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 1 - } else if (handleIndex == 3u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 1 - } - handleIndex++; - } -} - -} // namespace ult -} // namespace Sysman -} // namespace L0