From 4398e4297fae172a4bcea6a58c24a0e8df4433b7 Mon Sep 17 00:00:00 2001 From: Bellekallu Rajkiran Date: Tue, 14 Nov 2023 09:08:10 +0000 Subject: [PATCH] fix: Fix Sysman ULT failures - Merge Ras and globalOps prelim files with non-prelim files. Related-To: NEO-9521 Signed-off-by: Bellekallu Rajkiran --- .../global_operations/linux/CMakeLists.txt | 14 +- .../sysman_os_global_operations_helper.cpp | 20 +- ...man_os_global_operations_helper_prelim.cpp | 30 - .../global_operations/linux/CMakeLists.txt | 7 +- ... => test_zes_global_operations_helper.cpp} | 0 .../global_operations/linux/CMakeLists.txt | 14 +- .../linux/os_global_operations_helper.cpp | 22 +- .../os_global_operations_helper_prelim.cpp | 28 - .../sysman/memory/linux/os_memory_imp_dg1.cpp | 1 - .../source/sysman/ras/linux/CMakeLists.txt | 20 +- .../source/sysman/ras/linux/os_ras_imp.cpp | 60 +- .../source/sysman/ras/linux/os_ras_imp.h | 74 ++ .../source/sysman/ras/linux/os_ras_imp_gt.cpp | 2 +- .../sysman/ras/linux/os_ras_imp_hbm.cpp | 2 +- .../sysman/ras/linux/os_ras_imp_prelim.cpp | 105 --- .../sysman/ras/linux/os_ras_imp_prelim.h | 113 --- .../global_operations/linux/CMakeLists.txt | 9 +- ... => test_zes_global_operations_helper.cpp} | 0 .../sources/sysman/memory/linux/mock_memory.h | 1 + .../memory/linux/test_sysman_memory_dg1.cpp | 2 +- .../sources/sysman/ras/linux/CMakeLists.txt | 14 +- .../sources/sysman/ras/linux/mock_fs_ras.h | 26 - ...mock_fs_ras_prelim.h => mock_sysman_ras.h} | 2 +- .../sources/sysman/ras/linux/test_zes_ras.cpp | 777 +++++++++++++++-- .../sysman/ras/linux/test_zes_ras_prelim.cpp | 819 ------------------ .../scheduler/linux/mock_sysfs_scheduler.h | 2 +- .../scheduler/linux/test_zes_scheduler.cpp | 2 +- 27 files changed, 894 insertions(+), 1272 deletions(-) delete mode 100644 level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper_prelim.cpp rename level_zero/sysman/test/unit_tests/sources/global_operations/linux/{test_zes_global_operations_prelim.cpp => test_zes_global_operations_helper.cpp} (100%) delete mode 100644 level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper_prelim.cpp delete mode 100644 level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp delete mode 100644 level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h rename level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/{test_zes_global_operations_prelim.cpp => test_zes_global_operations_helper.cpp} (100%) delete mode 100644 level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras.h rename level_zero/tools/test/unit_tests/sources/sysman/ras/linux/{mock_fs_ras_prelim.h => mock_sysman_ras.h} (99%) delete mode 100644 level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp diff --git a/level_zero/sysman/source/api/global_operations/linux/CMakeLists.txt b/level_zero/sysman/source/api/global_operations/linux/CMakeLists.txt index 8a1659136b..c5e433708b 100644 --- a/level_zero/sysman/source/api/global_operations/linux/CMakeLists.txt +++ b/level_zero/sysman/source/api/global_operations/linux/CMakeLists.txt @@ -10,18 +10,6 @@ if(UNIX) ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_global_operations_imp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_global_operations_imp.h + ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_global_operations_helper.cpp ) - - if(NEO_ENABLE_i915_PRELIM_DETECTION) - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_global_operations_helper_prelim.cpp - ) - else() - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/sysman_os_global_operations_helper.cpp - ) - endif() - endif() diff --git a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper.cpp b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper.cpp index 84665d4ccd..60978c9567 100644 --- a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper.cpp +++ b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper.cpp @@ -6,9 +6,25 @@ */ #include "level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.h" +#include "level_zero/sysman/source/shared/firmware_util/sysman_firmware_util.h" namespace L0 { namespace Sysman { -void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) {} +void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) { + bool ifrStatus = false; + if (IGFX_PVC == pLinuxSysmanImp->getParentSysmanDeviceImp()->getProductFamily()) { + auto pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); + if (pFwInterface != nullptr) { + auto result = pFwInterface->fwIfrApplied(ifrStatus); + if (result == ZE_RESULT_SUCCESS) { + pState->repaired = ZES_REPAIR_STATUS_NOT_PERFORMED; + if (ifrStatus) { + pState->reset |= ZES_RESET_REASON_FLAG_REPAIR; + pState->repaired = ZES_REPAIR_STATUS_PERFORMED; + } + } + } + } +} } // namespace Sysman -} // namespace L0 \ No newline at end of file +} // namespace L0 diff --git a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper_prelim.cpp b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper_prelim.cpp deleted file mode 100644 index 60978c9567..0000000000 --- a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_helper_prelim.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.h" -#include "level_zero/sysman/source/shared/firmware_util/sysman_firmware_util.h" - -namespace L0 { -namespace Sysman { -void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) { - bool ifrStatus = false; - if (IGFX_PVC == pLinuxSysmanImp->getParentSysmanDeviceImp()->getProductFamily()) { - auto pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); - if (pFwInterface != nullptr) { - auto result = pFwInterface->fwIfrApplied(ifrStatus); - if (result == ZE_RESULT_SUCCESS) { - pState->repaired = ZES_REPAIR_STATUS_NOT_PERFORMED; - if (ifrStatus) { - pState->reset |= ZES_RESET_REASON_FLAG_REPAIR; - pState->repaired = ZES_REPAIR_STATUS_PERFORMED; - } - } - } - } -} -} // namespace Sysman -} // namespace L0 diff --git a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/CMakeLists.txt b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/CMakeLists.txt index 27930ea145..49b1ba4003 100644 --- a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/CMakeLists.txt +++ b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/CMakeLists.txt @@ -7,15 +7,10 @@ set(L0_TESTS_SYSMAN_GLOBAL_OPERATIONS_LINUX ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations_helper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mock_global_operations.h ) -if(NEO_ENABLE_i915_PRELIM_DETECTION) - list(APPEND L0_TESTS_SYSMAN_GLOBAL_OPERATIONS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations_prelim.cpp - ) -endif() - if(UNIX) target_sources(${TARGET_NAME} PRIVATE diff --git a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations_prelim.cpp b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations_helper.cpp similarity index 100% rename from level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations_prelim.cpp rename to level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations_helper.cpp diff --git a/level_zero/tools/source/sysman/global_operations/linux/CMakeLists.txt b/level_zero/tools/source/sysman/global_operations/linux/CMakeLists.txt index 8772872d20..8200303b11 100644 --- a/level_zero/tools/source/sysman/global_operations/linux/CMakeLists.txt +++ b/level_zero/tools/source/sysman/global_operations/linux/CMakeLists.txt @@ -10,18 +10,6 @@ if(UNIX) ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_imp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_imp.h + ${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_helper.cpp ) - - if(NEO_ENABLE_i915_PRELIM_DETECTION) - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_helper_prelim.cpp - ) - else() - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_helper.cpp - ) - endif() - endif() diff --git a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper.cpp b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper.cpp index de9fa57cbe..75491e5c67 100644 --- a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper.cpp +++ b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper.cpp @@ -1,12 +1,28 @@ /* - * Copyright (C) 2021 Intel Corporation + * Copyright (C) 2022-2023 Intel Corporation * * SPDX-License-Identifier: MIT * */ +#include "level_zero/tools/source/sysman/firmware_util/firmware_util.h" #include "level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h" namespace L0 { -void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) {} -} // namespace L0 \ No newline at end of file +void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) { + bool ifrStatus = false; + if (IGFX_PVC == SysmanDeviceImp::getProductFamily(pDevice)) { + auto pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); + if (pFwInterface != nullptr) { + auto result = pFwInterface->fwIfrApplied(ifrStatus); + if (result == ZE_RESULT_SUCCESS) { + pState->repaired = ZES_REPAIR_STATUS_NOT_PERFORMED; + if (ifrStatus) { + pState->reset |= ZES_RESET_REASON_FLAG_REPAIR; + pState->repaired = ZES_REPAIR_STATUS_PERFORMED; + } + } + } + } +} +} // namespace L0 diff --git a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper_prelim.cpp b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper_prelim.cpp deleted file mode 100644 index 60ff0e6f43..0000000000 --- a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_helper_prelim.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2022 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "level_zero/tools/source/sysman/firmware_util/firmware_util.h" -#include "level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h" - -namespace L0 { -void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) { - bool ifrStatus = false; - if (IGFX_PVC == SysmanDeviceImp::getProductFamily(pDevice)) { - auto pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); - if (pFwInterface != nullptr) { - auto result = pFwInterface->fwIfrApplied(ifrStatus); - if (result == ZE_RESULT_SUCCESS) { - pState->repaired = ZES_REPAIR_STATUS_NOT_PERFORMED; - if (ifrStatus) { - pState->reset |= ZES_RESET_REASON_FLAG_REPAIR; - pState->repaired = ZES_REPAIR_STATUS_PERFORMED; - } - } - } - } -} -} // namespace L0 diff --git a/level_zero/tools/source/sysman/memory/linux/os_memory_imp_dg1.cpp b/level_zero/tools/source/sysman/memory/linux/os_memory_imp_dg1.cpp index 55da3914d3..b0839ca379 100644 --- a/level_zero/tools/source/sysman/memory/linux/os_memory_imp_dg1.cpp +++ b/level_zero/tools/source/sysman/memory/linux/os_memory_imp_dg1.cpp @@ -59,7 +59,6 @@ ze_result_t LinuxMemoryImp::getState(zes_mem_state_t *pState) { pState->free = deviceRegions[subdeviceId].unallocatedSize; pState->size = deviceRegions[subdeviceId].probedSize; pState->health = ZES_MEM_HEALTH_OK; - return ZE_RESULT_SUCCESS; } diff --git a/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt b/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt index abae3bed0e..a93990bb36 100644 --- a/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt +++ b/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt @@ -8,21 +8,9 @@ if(UNIX) target_sources(${L0_STATIC_LIB_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp ) - - if(NEO_ENABLE_i915_PRELIM_DETECTION) - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h - ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp - ) - else() - target_sources(${L0_STATIC_LIB_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h - ) - endif() endif() diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp.cpp index c069e12523..6210ec889a 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp.cpp @@ -8,27 +8,45 @@ #include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h" #include "shared/source/debug_settings/debug_settings_manager.h" +#include "shared/source/helpers/string.h" +#include "shared/source/os_interface/linux/system_info.h" #include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" +#include "drm/intel_hwconfig_types.h" + namespace L0 { -void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {} +static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) { + uint32_t memType = pLinuxSysmanImp->getMemoryType(); + if (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2) { + return true; + } + return false; +} -ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { - return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) { + + constexpr auto maxErrorTypes = 2; + LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + if (errorType.size() < maxErrorTypes) { + auto pLinuxSysmanImp = static_cast(pOsSysman); + if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { + LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + } + } } ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) { config->totalThreshold = totalThreshold; - memcpy(config->detailedThresholds.category, categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t)); + memcpy_s(config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t), categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t)); return ZE_RESULT_SUCCESS; } ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) { if (pFsAccess->isRootUser() == true) { totalThreshold = config->totalThreshold; - memcpy(categoryThreshold, config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); + memcpy_s(categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t), config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); return ZE_RESULT_SUCCESS; } NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); @@ -42,9 +60,41 @@ ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) { properties.subdeviceId = subdeviceId; return ZE_RESULT_SUCCESS; } + +ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { + if (clear == true) { + if (pFsAccess->isRootUser() == false) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); + return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; + } + } + + ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + for (auto &rasSource : rasSources) { + zes_ras_state_t localState = {}; + ze_result_t localResult = rasSource->osRasGetState(localState, clear); + if (localResult != ZE_RESULT_SUCCESS) { + continue; + } + for (uint32_t i = 0; i < maxRasErrorCategoryCount; i++) { + state.category[i] += localState.category[i]; + } + result = ZE_RESULT_SUCCESS; + } + return result; +} + +void LinuxRasImp::initSources() { + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId)); + if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); + } +} + LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { pLinuxSysmanImp = static_cast(pOsSysman); pFsAccess = &pLinuxSysmanImp->getFsAccess(); + initSources(); } OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) { diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp.h b/level_zero/tools/source/sysman/ras/linux/os_ras_imp.h index 73550ab564..9b85a3e90f 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp.h +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp.h @@ -11,9 +11,20 @@ #include "level_zero/tools/source/sysman/ras/os_ras.h" #include "level_zero/tools/source/sysman/sysman_const.h" +#include +#include +#include +#include + namespace L0 { class FsAccess; +class SysfsAccess; +class PmuInterface; class LinuxSysmanImp; +class LinuxRasSources; +class FirmwareUtil; +struct Device; + class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { public: ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override; @@ -28,12 +39,75 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { zes_ras_error_type_t osRasErrorType = {}; FsAccess *pFsAccess = nullptr; LinuxSysmanImp *pLinuxSysmanImp = nullptr; + std::vector> rasSources = {}; private: + void initSources(); bool isSubdevice = false; uint32_t subdeviceId = 0; uint64_t totalThreshold = 0; uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0}; }; +class LinuxRasSources : NEO::NonCopyableOrMovableClass { + public: + virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0; + virtual ~LinuxRasSources() = default; +}; + +class LinuxRasSourceGt : public LinuxRasSources { + public: + ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); + LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); + LinuxRasSourceGt() = default; + ~LinuxRasSourceGt() override; + + protected: + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + zes_ras_error_type_t osRasErrorType = {}; + PmuInterface *pPmuInterface = nullptr; + FsAccess *pFsAccess = nullptr; + SysfsAccess *pSysfsAccess = nullptr; + + private: + void initRasErrors(ze_bool_t clear); + ze_result_t getPmuConfig( + const std::string &eventDirectory, + const std::vector &listOfEvents, + const std::string &errorFileToGetConfig, + std::string &pmuConfig); + ze_result_t getBootUpErrorCountFromSysfs( + std::string nameOfError, + const std::string &errorCounterDir, + uint64_t &errorVal); + void closeFds(); + int64_t groupFd = -1; + std::vector memberFds = {}; + uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0}; + std::map errorCategoryToEventCount; + uint64_t totalEventCount = 0; + bool isSubdevice = false; + uint32_t subdeviceId = 0; +}; + +class LinuxRasSourceHbm : public LinuxRasSources { + public: + ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); + LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId); + LinuxRasSourceHbm() = default; + ~LinuxRasSourceHbm() override{}; + + protected: + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + zes_ras_error_type_t osRasErrorType = {}; + FirmwareUtil *pFwInterface = nullptr; + Device *pDevice = nullptr; + + private: + uint64_t errorBaseline = 0; + uint32_t subdeviceId = 0; +}; + } // namespace L0 diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp index f2c1097574..97880e4635 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp @@ -8,7 +8,7 @@ #include "shared/source/debug_settings/debug_settings_manager.h" #include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" -#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" +#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h" #include "level_zero/tools/source/sysman/sysman_imp.h" namespace L0 { diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp index a618912f4e..b51c13f0fc 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp @@ -9,7 +9,7 @@ #include "level_zero/tools/source/sysman/firmware_util/firmware_util.h" #include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" -#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" +#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h" namespace L0 { diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp deleted file mode 100644 index af9e6debfe..0000000000 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2020-2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" - -#include "shared/source/debug_settings/debug_settings_manager.h" -#include "shared/source/helpers/string.h" -#include "shared/source/os_interface/linux/system_info.h" - -#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" - -#include "drm/intel_hwconfig_types.h" - -namespace L0 { - -static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) { - uint32_t memType = pLinuxSysmanImp->getMemoryType(); - if (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2) { - return true; - } - return false; -} - -void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) { - - constexpr auto maxErrorTypes = 2; - LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); - if (errorType.size() < maxErrorTypes) { - auto pLinuxSysmanImp = static_cast(pOsSysman); - if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { - LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); - } - } -} - -ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) { - config->totalThreshold = totalThreshold; - memcpy_s(config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t), categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t)); - return ZE_RESULT_SUCCESS; -} - -ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) { - if (pFsAccess->isRootUser() == true) { - totalThreshold = config->totalThreshold; - memcpy_s(categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t), config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); - return ZE_RESULT_SUCCESS; - } - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; -} - -ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) { - properties.pNext = nullptr; - properties.type = osRasErrorType; - properties.onSubdevice = isSubdevice; - properties.subdeviceId = subdeviceId; - return ZE_RESULT_SUCCESS; -} - -ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { - if (clear == true) { - if (pFsAccess->isRootUser() == false) { - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; - } - } - - ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; - for (auto &rasSource : rasSources) { - zes_ras_state_t localState = {}; - ze_result_t localResult = rasSource->osRasGetState(localState, clear); - if (localResult != ZE_RESULT_SUCCESS) { - continue; - } - for (uint32_t i = 0; i < maxRasErrorCategoryCount; i++) { - state.category[i] += localState.category[i]; - } - result = ZE_RESULT_SUCCESS; - } - return result; -} - -void LinuxRasImp::initSources() { - rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId)); - if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { - rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); - } -} - -LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { - pLinuxSysmanImp = static_cast(pOsSysman); - pFsAccess = &pLinuxSysmanImp->getFsAccess(); - initSources(); -} - -OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) { - LinuxRasImp *pLinuxRasImp = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId); - return static_cast(pLinuxRasImp); -} - -} // namespace L0 diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h deleted file mode 100644 index 9b85a3e90f..0000000000 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (C) 2020-2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#pragma once -#include "shared/source/helpers/non_copyable_or_moveable.h" - -#include "level_zero/tools/source/sysman/ras/os_ras.h" -#include "level_zero/tools/source/sysman/sysman_const.h" - -#include -#include -#include -#include - -namespace L0 { -class FsAccess; -class SysfsAccess; -class PmuInterface; -class LinuxSysmanImp; -class LinuxRasSources; -class FirmwareUtil; -struct Device; - -class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { - public: - ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override; - ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; - ze_result_t osRasGetConfig(zes_ras_config_t *config) override; - ze_result_t osRasSetConfig(const zes_ras_config_t *config) override; - LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); - LinuxRasImp() = default; - ~LinuxRasImp() override = default; - - protected: - zes_ras_error_type_t osRasErrorType = {}; - FsAccess *pFsAccess = nullptr; - LinuxSysmanImp *pLinuxSysmanImp = nullptr; - std::vector> rasSources = {}; - - private: - void initSources(); - bool isSubdevice = false; - uint32_t subdeviceId = 0; - uint64_t totalThreshold = 0; - uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0}; -}; - -class LinuxRasSources : NEO::NonCopyableOrMovableClass { - public: - virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0; - virtual ~LinuxRasSources() = default; -}; - -class LinuxRasSourceGt : public LinuxRasSources { - public: - ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; - static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); - LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); - LinuxRasSourceGt() = default; - ~LinuxRasSourceGt() override; - - protected: - LinuxSysmanImp *pLinuxSysmanImp = nullptr; - zes_ras_error_type_t osRasErrorType = {}; - PmuInterface *pPmuInterface = nullptr; - FsAccess *pFsAccess = nullptr; - SysfsAccess *pSysfsAccess = nullptr; - - private: - void initRasErrors(ze_bool_t clear); - ze_result_t getPmuConfig( - const std::string &eventDirectory, - const std::vector &listOfEvents, - const std::string &errorFileToGetConfig, - std::string &pmuConfig); - ze_result_t getBootUpErrorCountFromSysfs( - std::string nameOfError, - const std::string &errorCounterDir, - uint64_t &errorVal); - void closeFds(); - int64_t groupFd = -1; - std::vector memberFds = {}; - uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0}; - std::map errorCategoryToEventCount; - uint64_t totalEventCount = 0; - bool isSubdevice = false; - uint32_t subdeviceId = 0; -}; - -class LinuxRasSourceHbm : public LinuxRasSources { - public: - ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; - static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); - LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId); - LinuxRasSourceHbm() = default; - ~LinuxRasSourceHbm() override{}; - - protected: - LinuxSysmanImp *pLinuxSysmanImp = nullptr; - zes_ras_error_type_t osRasErrorType = {}; - FirmwareUtil *pFwInterface = nullptr; - Device *pDevice = nullptr; - - private: - uint64_t errorBaseline = 0; - uint32_t subdeviceId = 0; -}; - -} // namespace L0 diff --git a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/CMakeLists.txt b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/CMakeLists.txt index 9a9bfca3a1..690fefbe3b 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/CMakeLists.txt +++ b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2020-2022 Intel Corporation +# Copyright (C) 2020-2023 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -7,15 +7,10 @@ set(L0_TESTS_TOOLS_SYSMAN_GLOBAL_OPERATIONS_LINUX ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations_helper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mock_global_operations.h ) -if(NEO_ENABLE_i915_PRELIM_DETECTION) - list(APPEND L0_TESTS_TOOLS_SYSMAN_GLOBAL_OPERATIONS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations_prelim.cpp - ) -endif() - if(UNIX) target_sources(${TARGET_NAME} PRIVATE diff --git a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations_prelim.cpp b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations_helper.cpp similarity index 100% rename from level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations_prelim.cpp rename to level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations_helper.cpp diff --git a/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/mock_memory.h b/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/mock_memory.h index 7414bc8dea..ff0b354bda 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/mock_memory.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/mock_memory.h @@ -31,6 +31,7 @@ struct MockMemoryManagerSysman : public MemoryManagerMock { }; struct MockMemoryNeoDrm : public Drm { + using Drm::ioctlHelper; using Drm::memoryInfo; const int mockFd = 33; MockMemoryNeoDrm(RootDeviceEnvironment &rootDeviceEnvironment) : Drm(std::make_unique(mockFd, ""), rootDeviceEnvironment) {} diff --git a/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/test_sysman_memory_dg1.cpp b/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/test_sysman_memory_dg1.cpp index d4718ef740..3eff2aeae0 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/test_sysman_memory_dg1.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/memory/linux/test_sysman_memory_dg1.cpp @@ -33,6 +33,7 @@ class SysmanDeviceMemoryFixture : public SysmanDeviceFixture { device->getDriverHandle()->setMemoryManager(pMemoryManager); pDrm = new MockMemoryNeoDrm(const_cast(neoDevice->getRootDeviceEnvironment())); + pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); pSysmanDevice = device->getSysmanHandle(); pSysmanDeviceImp = static_cast(pSysmanDevice); @@ -194,7 +195,6 @@ TEST_F(SysmanDeviceMemoryFixture, GivenValidMemoryHandleWhenGettingStateThenCall for (auto handle : handles) { zes_mem_state_t state; - ze_result_t result = zesMemoryGetState(handle, &state); EXPECT_EQ(result, ZE_RESULT_SUCCESS); diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt index 1b5eb7dd1d..3938a1d8d2 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt @@ -6,20 +6,10 @@ set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mock_sysman_ras.h ) -if(NEO_ENABLE_i915_PRELIM_DETECTION) - list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_prelim.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_prelim.h - ) -else() - list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h - ) -endif() - if(UNIX) target_sources(${TARGET_NAME} PRIVATE diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras.h b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras.h deleted file mode 100644 index 8fd80ff5f1..0000000000 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) 2020-2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#pragma once -#include "level_zero/tools/source/sysman/linux/fs_access.h" -#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h" -#include "level_zero/tools/source/sysman/ras/ras.h" -#include "level_zero/tools/source/sysman/ras/ras_imp.h" - -namespace L0 { -namespace ult { - -struct MockRasFsAccess : public FsAccess { - bool mockRootUser = true; - bool isRootUser() override { - return mockRootUser; - } - MockRasFsAccess() = default; -}; - -} // namespace ult -} // namespace L0 diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h similarity index 99% rename from level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h rename to level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h index f30d2998b0..50d013108a 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h @@ -13,7 +13,7 @@ #include "level_zero/tools/source/sysman/linux/fs_access.h" #include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" #include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h" -#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" +#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h" #include "level_zero/tools/source/sysman/ras/ras.h" #include "level_zero/tools/source/sysman/ras/ras_imp.h" diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp index 97dbe7b7a8..deca312482 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2022-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,29 +7,62 @@ #include "level_zero/tools/source/sysman/sysman_const.h" #include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h" - -#include "mock_fs_ras.h" +#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h" extern bool sysmanUltsEnable; +class OsRas; namespace L0 { namespace ult { - -constexpr uint32_t mockHandleCount = 0; +constexpr uint32_t mockHandleCount = 2u; +constexpr uint32_t mockHandleCountForSubDevice = 4u; struct SysmanRasFixture : public SysmanDeviceFixture { protected: std::unique_ptr pFsAccess; - std::vector deviceHandles; + std::unique_ptr pSysfsAccess; + std::unique_ptr pPmuInterface; + std::unique_ptr pRasFwUtilInterface; + std::unique_ptr pDrm; + MemoryManager *pMemoryManagerOriginal = nullptr; + std::unique_ptr pMemoryManager; FsAccess *pFsAccessOriginal = nullptr; + Drm *pOriginalDrm = nullptr; + SysfsAccess *pSysfsAccessOriginal = nullptr; + PmuInterface *pOriginalPmuInterface = nullptr; + FirmwareUtil *pFwUtilOriginal = nullptr; + std::vector deviceHandles; + void SetUp() override { if (!sysmanUltsEnable) { GTEST_SKIP(); } SysmanDeviceFixture::SetUp(); + pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager(); + pMemoryManager = std::make_unique(*neoDevice->getExecutionEnvironment()); + pMemoryManager->localMemorySupported[0] = true; + device->getDriverHandle()->setMemoryManager(pMemoryManager.get()); pFsAccess = std::make_unique(); + pSysfsAccess = std::make_unique(); + pRasFwUtilInterface = std::make_unique(); + pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); + pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; + pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; + pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; + pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; + pOriginalDrm = pLinuxSysmanImp->pDrm; pLinuxSysmanImp->pFsAccess = pFsAccess.get(); - pFsAccess->mockRootUser = true; + pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); + pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); + pPmuInterface = std::make_unique(pLinuxSysmanImp); + pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + pLinuxSysmanImp->pDrm = pDrm.get(); + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); uint32_t subDeviceCount = 0; Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr); @@ -44,10 +77,14 @@ struct SysmanRasFixture : public SysmanDeviceFixture { if (!sysmanUltsEnable) { GTEST_SKIP(); } + device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal); pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; + pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; + pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; + pLinuxSysmanImp->pDrm = pOriginalDrm; SysmanDeviceFixture::TearDown(); } - std::vector getRasHandles(uint32_t count) { std::vector handles(count, nullptr); EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); @@ -55,16 +92,7 @@ struct SysmanRasFixture : public SysmanDeviceFixture { } }; -TEST_F(SysmanRasFixture, GivenValidRasContextWhenRetrievingRasHandlesThenSuccessIsReturned) { - uint32_t count = 0; - RasHandleContext *pRasHandleContext = new RasHandleContext(pSysmanDeviceImp->pOsSysman); - ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - delete pRasHandleContext; -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRasErrorSetsThenCorrectCountIsReported) { +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) { uint32_t count = 0; ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); EXPECT_EQ(ZE_RESULT_SUCCESS, result); @@ -74,102 +102,717 @@ TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRasErrorSetsThenCorrectCountI result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(testcount, mockHandleCount); - - count = 0; - std::vector handles(count, nullptr); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); - EXPECT_EQ(count, mockHandleCount); - - RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle()); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS); - EXPECT_EQ(count, mockHandleCount + 1); - - testcount = count; - - handles.resize(testcount); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, handles.data()), ZE_RESULT_SUCCESS); - EXPECT_EQ(testcount, mockHandleCount + 1); - EXPECT_NE(nullptr, handles.data()); - - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + EXPECT_NE(handle, nullptr); + } } TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) { - RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle()); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - - auto handles = getRasHandles(mockHandleCount + 1); + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; for (auto handle : handles) { + ASSERT_NE(nullptr, handle); zes_ras_properties_t properties = {}; EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties)); EXPECT_EQ(properties.pNext, nullptr); EXPECT_EQ(properties.onSubdevice, false); EXPECT_EQ(properties.subdeviceId, 0u); - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + if (correctable == true) { + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + correctable = false; + } else { + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); + } } - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; } -TEST_F(SysmanRasFixture, GivenValidRasHandleWhileCallingZesRasGetStateThenFailureIsReturned) { - RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle()); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); +TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) { + std::set errorType = {}; - auto handles = getRasHandles(mockHandleCount + 1); + pSysfsAccess->mockReadSymLinkResult = true; + LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, device->toHandle()); + EXPECT_EQ(errorType.size(), 0u); +} + +TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) { + std::set errorType = {}; + + pFsAccess->mockReadDirectoryFailure = true; + + LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, device); + EXPECT_EQ(errorType.size(), 0u); +} + +TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) { + std::set errorType = {}; + pLinuxSysmanImp->pFwUtilInterface = nullptr; + + LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, device); + EXPECT_EQ(errorType.size(), 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) { + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); + uint32_t testcount = count + 1; + result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(testcount, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) { + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4); + pRasFwUtilInterface->mockMemorySuccess = true; + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) { + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2); + pRasFwUtilInterface->mockMemorySuccess = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) { + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + pDrm->mockQuerySystemInfoReturnValue.push_back(true); + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadCorrectable = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalMdfiEastCount + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterClearThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadAfterClear = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + ze_bool_t clear = 0; + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + } + } + correctable = true; + clear = 1; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + pRasFwUtilInterface->mockMemorySuccess = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithClearThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + pRasFwUtilInterface->mockMemorySuccess = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + bool correctable = true; + ze_bool_t clear = 0; + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); + } + } + + correctable = true; + clear = 1; + for (auto handle : handles) { + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); + if (correctable == true) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + correctable = false; + } else { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + } + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) { + + pFsAccess->mockRootUser = true; + + auto handles = getRasHandles(mockHandleCount); + ze_bool_t clear = 1; + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasGetState(handle, clear, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pFsAccess->mockReadFileFailure = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pPmuInterface->mockPerfEvent = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 1, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pFsAccess->mockReadVal = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) { + + pFsAccess->mockReadVal = true; + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); zes_ras_state_t state = {}; EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); } - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; } TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) { - RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle()); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - - auto handles = getRasHandles(mockHandleCount + 1); - + auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { + ASSERT_NE(nullptr, handle); zes_ras_config_t setConfig = {}; zes_ras_config_t getConfig = {}; setConfig.totalThreshold = 50; memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(handle, &setConfig)); EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(handle, &getConfig)); EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold); int compare = std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); EXPECT_EQ(0, compare); } - pSysmanDeviceImp->pRasHandleContext->handleList.pop_back(); - delete pTestRasImp; } TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) { - pFsAccess->mockRootUser = false; - RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle()); - pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp); - auto handles = getRasHandles(mockHandleCount + 1); + pFsAccess->mockRootUser = true; + + auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { + ASSERT_NE(nullptr, handle); zes_ras_config_t setConfig = {}; setConfig.totalThreshold = 50; memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(handle, &setConfig)); } - pSysmanDeviceImp->pRasHandleContext->releaseRasHandles(); } -TEST_F(SysmanRasFixture, GivenValidInstanceWhenOsRasImplementationIsNullThenDestructorIsCalledWithoutException) { +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { - RasImp *pTestRasImp = new RasImp(); - pTestRasImp->pOsRas = nullptr; - EXPECT_NO_THROW(delete pTestRasImp;); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) + pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { + + pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCount); + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); + } +} + +TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + + pLinuxSysmanImp->reInitSysmanDeviceResources(); + + count = 0; + result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); +} + +struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { + protected: + std::unique_ptr pFsAccess; + std::unique_ptr pSysfsAccess; + std::unique_ptr pPmuInterface; + MemoryManager *pMemoryManagerOriginal = nullptr; + std::unique_ptr pMemoryManager; + std::unique_ptr pRasFwUtilInterface; + std::unique_ptr pDrm; + FsAccess *pFsAccessOriginal = nullptr; + SysfsAccess *pSysfsAccessOriginal = nullptr; + PmuInterface *pOriginalPmuInterface = nullptr; + FirmwareUtil *pFwUtilOriginal = nullptr; + Drm *pOriginalDrm = nullptr; + std::vector deviceHandles; + + void SetUp() override { + if (!sysmanUltsEnable) { + GTEST_SKIP(); + } + SysmanMultiDeviceFixture::SetUp(); + pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager(); + pMemoryManager = std::make_unique(*neoDevice->getExecutionEnvironment()); + pMemoryManager->localMemorySupported[0] = true; + device->getDriverHandle()->setMemoryManager(pMemoryManager.get()); + pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); + pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); + pFsAccess = std::make_unique(); + pSysfsAccess = std::make_unique(); + pRasFwUtilInterface = std::make_unique(); + pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; + pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; + pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; + pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; + pOriginalDrm = pLinuxSysmanImp->pDrm; + pLinuxSysmanImp->pFsAccess = pFsAccess.get(); + pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); + pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); + pPmuInterface = std::make_unique(pLinuxSysmanImp); + pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + pLinuxSysmanImp->pDrm = pDrm.get(); + + pFsAccess->mockReadDirectoryForMultiDevice = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t subDeviceCount = 0; + Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr); + if (subDeviceCount == 0) { + deviceHandles.resize(1, device->toHandle()); + } else { + deviceHandles.resize(subDeviceCount, nullptr); + Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data()); + } + } + void TearDown() override { + if (!sysmanUltsEnable) { + GTEST_SKIP(); + } + device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal); + pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; + pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; + pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; + pLinuxSysmanImp->pDrm = pOriginalDrm; + SysmanMultiDeviceFixture::TearDown(); + } + std::vector getRasHandles(uint32_t count) { + std::vector handles(count, nullptr); + EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); + return handles; + } +}; +TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) { + RasHandleContext *pRasHandleContext = new RasHandleContext(pSysmanDeviceImp->pOsSysman); + uint32_t count = 0; + ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ((count > 0), true); + delete pRasHandleContext; +} +TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCountForSubDevice); + + uint32_t testcount = count + 1; + result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(testcount, mockHandleCountForSubDevice); + auto handles = getRasHandles(mockHandleCountForSubDevice); + for (auto handle : handles) { + EXPECT_NE(handle, nullptr); + } +} +TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) { + for (auto deviceHandle : deviceHandles) { + zes_ras_properties_t properties = {}; + ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + Device::fromHandle(deviceHandle)->getProperties(&deviceProperties); + bool isSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE; + PublicLinuxRasImp *pLinuxRasImp = new PublicLinuxRasImp(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, deviceProperties.subdeviceId); + EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties)); + EXPECT_EQ(properties.subdeviceId, deviceProperties.subdeviceId); + EXPECT_EQ(properties.onSubdevice, isSubDevice); + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + delete pLinuxRasImp; + } +} + +TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadTile = true; + pSysfsAccess->isMultiTileArch = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCountForSubDevice); + uint32_t handleIndex = 0u; + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (handleIndex == 0u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctablel3Bank + initialCorrectableCacheErrorTile0); // No. of correctable error type for subdevice 0 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + } else if (handleIndex == 1u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0); // No. of uncorrectable error type for subdevice 0 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); + } else if (handleIndex == 2u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 1 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); + } else if (handleIndex == 3u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1); // No. of uncorrectable error type for subdevice 1 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1); + } + handleIndex++; + } +} + +TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) { + + pPmuInterface->mockPmuReadResult = true; + pRasFwUtilInterface->mockMemorySuccess = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + auto handles = getRasHandles(mockHandleCountForSubDevice); + uint32_t handleIndex = 0u; + + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_state_t state = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); + if (handleIndex == 0u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 0 + } else if (handleIndex == 1u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 0 + } else if (handleIndex == 2u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 1 + } else if (handleIndex == 3u) { + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 1 + } + handleIndex++; + } +} + +class SysmanRasAffinityMaskFixture : public SysmanRasMultiDeviceFixture { + void SetUp() override { + if (!sysmanUltsEnable) { + GTEST_SKIP(); + } + NEO::DebugManager.flags.ZE_AFFINITY_MASK.set("0.1"); + SysmanRasMultiDeviceFixture::SetUp(); + } + + void TearDown() override { + if (!sysmanUltsEnable) { + GTEST_SKIP(); + } + SysmanRasMultiDeviceFixture::TearDown(); + } + DebugManagerStateRestore restorer; +}; + +TEST_F(SysmanRasAffinityMaskFixture, GivenAffinityMaskIsSetWhenCallingRasPropertiesThenPropertiesAreReturnedForTheSubDevicesAccordingToAffinityMask) { + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); + auto handles = getRasHandles(mockHandleCount); + uint32_t handleIndex = 0u; + for (auto handle : handles) { + ASSERT_NE(nullptr, handle); + zes_ras_properties_t properties = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties)); + EXPECT_EQ(properties.pNext, nullptr); + EXPECT_EQ(properties.onSubdevice, true); + EXPECT_EQ(properties.subdeviceId, 1u); // Affinity mask 0.1 is set which means only subdevice 1 is exposed + if (handleIndex == 0u) { + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); + + } else if (handleIndex == 1u) { + EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); + } + handleIndex++; + } } } // namespace ult diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp deleted file mode 100644 index 3e66fbc04f..0000000000 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp +++ /dev/null @@ -1,819 +0,0 @@ -/* - * Copyright (C) 2022-2023 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "level_zero/tools/source/sysman/sysman_const.h" -#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h" -#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h" - -extern bool sysmanUltsEnable; - -class OsRas; -namespace L0 { -namespace ult { -constexpr uint32_t mockHandleCount = 2u; -constexpr uint32_t mockHandleCountForSubDevice = 4u; -struct SysmanRasFixture : public SysmanDeviceFixture { - protected: - std::unique_ptr pFsAccess; - std::unique_ptr pSysfsAccess; - std::unique_ptr pPmuInterface; - std::unique_ptr pRasFwUtilInterface; - std::unique_ptr pDrm; - MemoryManager *pMemoryManagerOriginal = nullptr; - std::unique_ptr pMemoryManager; - FsAccess *pFsAccessOriginal = nullptr; - Drm *pOriginalDrm = nullptr; - SysfsAccess *pSysfsAccessOriginal = nullptr; - PmuInterface *pOriginalPmuInterface = nullptr; - FirmwareUtil *pFwUtilOriginal = nullptr; - std::vector deviceHandles; - - void SetUp() override { - if (!sysmanUltsEnable) { - GTEST_SKIP(); - } - SysmanDeviceFixture::SetUp(); - pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager(); - pMemoryManager = std::make_unique(*neoDevice->getExecutionEnvironment()); - pMemoryManager->localMemorySupported[0] = true; - device->getDriverHandle()->setMemoryManager(pMemoryManager.get()); - pFsAccess = std::make_unique(); - pSysfsAccess = std::make_unique(); - pRasFwUtilInterface = std::make_unique(); - pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); - pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); - pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; - pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; - pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; - pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; - pOriginalDrm = pLinuxSysmanImp->pDrm; - pLinuxSysmanImp->pFsAccess = pFsAccess.get(); - pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); - pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); - pPmuInterface = std::make_unique(pLinuxSysmanImp); - pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); - pLinuxSysmanImp->pDrm = pDrm.get(); - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - uint32_t subDeviceCount = 0; - Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr); - if (subDeviceCount == 0) { - deviceHandles.resize(1, device->toHandle()); - } else { - deviceHandles.resize(subDeviceCount, nullptr); - Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data()); - } - } - void TearDown() override { - if (!sysmanUltsEnable) { - GTEST_SKIP(); - } - device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal); - pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; - pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; - pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; - pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; - pLinuxSysmanImp->pDrm = pOriginalDrm; - SysmanDeviceFixture::TearDown(); - } - std::vector getRasHandles(uint32_t count) { - std::vector handles(count, nullptr); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); - return handles; - } -}; - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - - uint32_t testcount = count + 1; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(testcount, mockHandleCount); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - EXPECT_NE(handle, nullptr); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) { - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_properties_t properties = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties)); - EXPECT_EQ(properties.pNext, nullptr); - EXPECT_EQ(properties.onSubdevice, false); - EXPECT_EQ(properties.subdeviceId, 0u); - if (correctable == true) { - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); - correctable = false; - } else { - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) { - std::set errorType = {}; - - pSysfsAccess->mockReadSymLinkResult = true; - - LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, device->toHandle()); - EXPECT_EQ(errorType.size(), 0u); -} - -TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) { - std::set errorType = {}; - - pFsAccess->mockReadDirectoryFailure = true; - - LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, device); - EXPECT_EQ(errorType.size(), 0u); -} - -TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) { - std::set errorType = {}; - pLinuxSysmanImp->pFwUtilInterface = nullptr; - - LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, device); - EXPECT_EQ(errorType.size(), 0u); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) { - pFsAccess->mockReadDirectoryWithoutRasEvents = true; - - pLinuxSysmanImp->pFwUtilInterface = nullptr; - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 0u); - uint32_t testcount = count + 1; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(testcount, 0u); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) { - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4); - pRasFwUtilInterface->mockMemorySuccess = true; - pFsAccess->mockReadDirectoryWithoutRasEvents = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 0u); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) { - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2); - pRasFwUtilInterface->mockMemorySuccess = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); -} - -TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) { - pFsAccess->mockReadDirectoryWithoutRasEvents = true; - pDrm->mockQuerySystemInfoReturnValue.push_back(true); - - pLinuxSysmanImp->pFwUtilInterface = nullptr; - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 0u); -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadCorrectable = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalMdfiEastCount + initialUncorrectableNonComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterClearThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadAfterClear = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - ze_bool_t clear = 0; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); - } - } - correctable = true; - clear = 1; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - pRasFwUtilInterface->mockMemorySuccess = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithClearThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - pRasFwUtilInterface->mockMemorySuccess = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - bool correctable = true; - ze_bool_t clear = 0; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); - } - } - - correctable = true; - clear = 1; - for (auto handle : handles) { - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clear, &state)); - if (correctable == true) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - correctable = false; - } else { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - } - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) { - - pFsAccess->mockRootUser = true; - - auto handles = getRasHandles(mockHandleCount); - ze_bool_t clear = 1; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasGetState(handle, clear, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pFsAccess->mockReadFileFailure = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pPmuInterface->mockPerfEvent = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 1, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pFsAccess->mockReadVal = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) { - - pFsAccess->mockReadVal = true; - - pLinuxSysmanImp->pFwUtilInterface = nullptr; - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) { - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_config_t setConfig = {}; - zes_ras_config_t getConfig = {}; - setConfig.totalThreshold = 50; - memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); - - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(handle, &setConfig)); - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(handle, &getConfig)); - EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold); - int compare = std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t)); - EXPECT_EQ(0, compare); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) { - - pFsAccess->mockRootUser = true; - - auto handles = getRasHandles(mockHandleCount); - - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_config_t setConfig = {}; - setConfig.totalThreshold = 50; - memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t)); - EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(handle, &setConfig)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) { - - pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCount); - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state)); - } -} - -TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - - pLinuxSysmanImp->reInitSysmanDeviceResources(); - - count = 0; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); -} - -struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { - protected: - std::unique_ptr pFsAccess; - std::unique_ptr pSysfsAccess; - std::unique_ptr pPmuInterface; - MemoryManager *pMemoryManagerOriginal = nullptr; - std::unique_ptr pMemoryManager; - std::unique_ptr pRasFwUtilInterface; - std::unique_ptr pDrm; - FsAccess *pFsAccessOriginal = nullptr; - SysfsAccess *pSysfsAccessOriginal = nullptr; - PmuInterface *pOriginalPmuInterface = nullptr; - FirmwareUtil *pFwUtilOriginal = nullptr; - Drm *pOriginalDrm = nullptr; - std::vector deviceHandles; - - void SetUp() override { - if (!sysmanUltsEnable) { - GTEST_SKIP(); - } - SysmanMultiDeviceFixture::SetUp(); - pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager(); - pMemoryManager = std::make_unique(*neoDevice->getExecutionEnvironment()); - pMemoryManager->localMemorySupported[0] = true; - device->getDriverHandle()->setMemoryManager(pMemoryManager.get()); - pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); - pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); - pFsAccess = std::make_unique(); - pSysfsAccess = std::make_unique(); - pRasFwUtilInterface = std::make_unique(); - pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; - pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; - pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; - pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; - pOriginalDrm = pLinuxSysmanImp->pDrm; - pLinuxSysmanImp->pFsAccess = pFsAccess.get(); - pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); - pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); - pPmuInterface = std::make_unique(pLinuxSysmanImp); - pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); - pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); - pLinuxSysmanImp->pDrm = pDrm.get(); - - pFsAccess->mockReadDirectoryForMultiDevice = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - uint32_t subDeviceCount = 0; - Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr); - if (subDeviceCount == 0) { - deviceHandles.resize(1, device->toHandle()); - } else { - deviceHandles.resize(subDeviceCount, nullptr); - Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data()); - } - } - void TearDown() override { - if (!sysmanUltsEnable) { - GTEST_SKIP(); - } - device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal); - pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; - pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; - pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; - pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; - pLinuxSysmanImp->pDrm = pOriginalDrm; - SysmanMultiDeviceFixture::TearDown(); - } - std::vector getRasHandles(uint32_t count) { - std::vector handles(count, nullptr); - EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS); - return handles; - } -}; -TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) { - RasHandleContext *pRasHandleContext = new RasHandleContext(pSysmanDeviceImp->pOsSysman); - uint32_t count = 0; - ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ((count > 0), true); - delete pRasHandleContext; -} -TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCountForSubDevice); - - uint32_t testcount = count + 1; - result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(testcount, mockHandleCountForSubDevice); - auto handles = getRasHandles(mockHandleCountForSubDevice); - for (auto handle : handles) { - EXPECT_NE(handle, nullptr); - } -} -TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) { - for (auto deviceHandle : deviceHandles) { - zes_ras_properties_t properties = {}; - ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; - Device::fromHandle(deviceHandle)->getProperties(&deviceProperties); - bool isSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE; - PublicLinuxRasImp *pLinuxRasImp = new PublicLinuxRasImp(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, deviceProperties.subdeviceId); - EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties)); - EXPECT_EQ(properties.subdeviceId, deviceProperties.subdeviceId); - EXPECT_EQ(properties.onSubdevice, isSubDevice); - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); - delete pLinuxRasImp; - } -} - -TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadTile = true; - pSysfsAccess->isMultiTileArch = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCountForSubDevice); - uint32_t handleIndex = 0u; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (handleIndex == 0u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctablel3Bank + initialCorrectableCacheErrorTile0); // No. of correctable error type for subdevice 0 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - } else if (handleIndex == 1u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0); // No. of uncorrectable error type for subdevice 0 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); - } else if (handleIndex == 2u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 1 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); - } else if (handleIndex == 3u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1); // No. of uncorrectable error type for subdevice 1 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1); - } - handleIndex++; - } -} - -TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) { - - pPmuInterface->mockPmuReadResult = true; - pRasFwUtilInterface->mockMemorySuccess = true; - - for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { - delete handle; - } - pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - auto handles = getRasHandles(mockHandleCountForSubDevice); - uint32_t handleIndex = 0u; - - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_state_t state = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state)); - if (handleIndex == 0u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 0 - } else if (handleIndex == 1u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 0 - } else if (handleIndex == 2u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount); // No. of correctable error type for subdevice 1 - } else if (handleIndex == 3u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount); // No. of uncorrectable error type for subdevice 1 - } - handleIndex++; - } -} - -class SysmanRasAffinityMaskFixture : public SysmanRasMultiDeviceFixture { - void SetUp() override { - if (!sysmanUltsEnable) { - GTEST_SKIP(); - } - NEO::DebugManager.flags.ZE_AFFINITY_MASK.set("0.1"); - SysmanRasMultiDeviceFixture::SetUp(); - } - - void TearDown() override { - if (!sysmanUltsEnable) { - GTEST_SKIP(); - } - SysmanRasMultiDeviceFixture::TearDown(); - } - DebugManagerStateRestore restorer; -}; - -TEST_F(SysmanRasAffinityMaskFixture, GivenAffinityMaskIsSetWhenCallingRasPropertiesThenPropertiesAreReturnedForTheSubDevicesAccordingToAffinityMask) { - uint32_t count = 0; - ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, mockHandleCount); - auto handles = getRasHandles(mockHandleCount); - uint32_t handleIndex = 0u; - for (auto handle : handles) { - ASSERT_NE(nullptr, handle); - zes_ras_properties_t properties = {}; - EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties)); - EXPECT_EQ(properties.pNext, nullptr); - EXPECT_EQ(properties.onSubdevice, true); - EXPECT_EQ(properties.subdeviceId, 1u); // Affinity mask 0.1 is set which means only subdevice 1 is exposed - if (handleIndex == 0u) { - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE); - - } else if (handleIndex == 1u) { - EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); - } - handleIndex++; - } -} - -} // namespace ult -} // namespace L0 diff --git a/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/mock_sysfs_scheduler.h b/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/mock_sysfs_scheduler.h index e06e2f4424..b43ab20f34 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/mock_sysfs_scheduler.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/mock_sysfs_scheduler.h @@ -170,7 +170,6 @@ struct MockSchedulerSysfsAccess : public SysfsAccess { } ze_result_t write(const std::string file, const uint64_t val) override { - if (mockWriteFileStatus != ZE_RESULT_SUCCESS) { return mockWriteFileStatus; } @@ -280,6 +279,7 @@ struct MockSchedulerSysfsAccess : public SysfsAccess { engineDirectoryPermissions = permission; } + ADDMETHOD_NOBASE(write, ze_result_t, ZE_RESULT_SUCCESS, (const std::string file, const int val)); MockSchedulerSysfsAccess() = default; private: diff --git a/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/test_zes_scheduler.cpp b/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/test_zes_scheduler.cpp index b5d99b9957..39f895e48e 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/test_zes_scheduler.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/scheduler/linux/test_zes_scheduler.cpp @@ -560,7 +560,7 @@ TEST_F(SysmanDeviceSchedulerFixture, GivenValidDeviceHandleWhenCallingzesSchedul } TEST_F(SysmanDeviceSchedulerFixture, GivenValidDeviceHandleWhenCallingzesSchedulerSetComputeUnitDebugModeThenUnsupportedFeatureIsReturned) { - pSysfsAccess->mockWriteFileStatus = ZE_RESULT_ERROR_NOT_AVAILABLE; + pSysfsAccess->writeResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; auto handles = getSchedHandles(handleComponentCount); for (auto handle : handles) { ze_bool_t needReload;