mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
fix: Fix Sysman ULT failures
- Merge the RAS and global-operations prelim files into the non-prelim files.
Related-To: NEO-9521
Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
55ef9516f2
commit
4398e4297f
@@ -10,18 +10,6 @@ if(UNIX)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_imp.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_imp.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_helper.cpp
|
||||
)
|
||||
|
||||
if(NEO_ENABLE_i915_PRELIM_DETECTION)
|
||||
target_sources(${L0_STATIC_LIB_NAME}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_helper_prelim.cpp
|
||||
)
|
||||
else()
|
||||
target_sources(${L0_STATIC_LIB_NAME}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_global_operations_helper.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
@@ -1,12 +1,28 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Intel Corporation
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/tools/source/sysman/firmware_util/firmware_util.h"
|
||||
#include "level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h"
|
||||
|
||||
namespace L0 {
|
||||
// Stub implementation: the body is empty, so *pState is left exactly as the caller passed it.
void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) {}
|
||||
} // namespace L0
|
||||
// Fills the repair-related fields of *pState.
//
// Only acts when the product family is IGFX_PVC, a firmware-util interface is
// available and fwIfrApplied() succeeds; in every other case *pState is left
// untouched. On success `repaired` is set to PERFORMED / NOT_PERFORMED according
// to the flag reported by fwIfrApplied(), and ZES_RESET_REASON_FLAG_REPAIR is
// OR-ed into `reset` when the repair was performed.
void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) {
    if (SysmanDeviceImp::getProductFamily(pDevice) != IGFX_PVC) {
        return;
    }

    auto fwUtil = pLinuxSysmanImp->getFwUtilInterface();
    if (fwUtil == nullptr) {
        return;
    }

    bool repairApplied = false;
    if (fwUtil->fwIfrApplied(repairApplied) != ZE_RESULT_SUCCESS) {
        return;
    }

    if (repairApplied) {
        pState->reset |= ZES_RESET_REASON_FLAG_REPAIR;
        pState->repaired = ZES_REPAIR_STATUS_PERFORMED;
    } else {
        pState->repaired = ZES_REPAIR_STATUS_NOT_PERFORMED;
    }
}
|
||||
} // namespace L0
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/tools/source/sysman/firmware_util/firmware_util.h"
|
||||
#include "level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h"
|
||||
|
||||
namespace L0 {
|
||||
// Reports whether an in-field repair was applied, via *pState.
//
// Nothing is written unless the device is IGFX_PVC, getFwUtilInterface()
// returns a non-null interface and fwIfrApplied() returns ZE_RESULT_SUCCESS.
void LinuxGlobalOperationsImp::getRepairStatus(zes_device_state_t *pState) {
    const bool isPvc = (SysmanDeviceImp::getProductFamily(pDevice) == IGFX_PVC);
    if (!isPvc) {
        return;
    }

    auto firmwareUtil = pLinuxSysmanImp->getFwUtilInterface();
    if (firmwareUtil == nullptr) {
        return;
    }

    bool ifrApplied = false;
    const auto queryResult = firmwareUtil->fwIfrApplied(ifrApplied);
    if (queryResult != ZE_RESULT_SUCCESS) {
        return;
    }

    // Default to "not performed"; upgraded below when the firmware reports a repair.
    pState->repaired = ZES_REPAIR_STATUS_NOT_PERFORMED;
    if (ifrApplied) {
        pState->repaired = ZES_REPAIR_STATUS_PERFORMED;
        pState->reset |= ZES_RESET_REASON_FLAG_REPAIR;
    }
}
|
||||
} // namespace L0
|
||||
@@ -59,7 +59,6 @@ ze_result_t LinuxMemoryImp::getState(zes_mem_state_t *pState) {
|
||||
pState->free = deviceRegions[subdeviceId].unallocatedSize;
|
||||
pState->size = deviceRegions[subdeviceId].probedSize;
|
||||
pState->health = ZES_MEM_HEALTH_OK;
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,21 +8,9 @@ if(UNIX)
|
||||
target_sources(${L0_STATIC_LIB_NAME}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
|
||||
)
|
||||
|
||||
if(NEO_ENABLE_i915_PRELIM_DETECTION)
|
||||
target_sources(${L0_STATIC_LIB_NAME}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
|
||||
)
|
||||
else()
|
||||
target_sources(${L0_STATIC_LIB_NAME}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -8,27 +8,45 @@
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
|
||||
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/os_interface/linux/system_info.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h"
|
||||
|
||||
#include "drm/intel_hwconfig_types.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
// Stub implementation: the body is empty, so nothing is inserted into errorType.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {}
|
||||
// Returns true when the memory type reported by pLinuxSysmanImp is HBM2 or HBM2e.
static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) {
    const uint32_t memoryType = pLinuxSysmanImp->getMemoryType();
    return memoryType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || memoryType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2;
}
|
||||
|
||||
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
|
||||
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
|
||||
|
||||
constexpr auto maxErrorTypes = 2;
|
||||
LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
if (errorType.size() < maxErrorTypes) {
|
||||
auto pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the currently stored RAS thresholds into *config.
//
// config->totalThreshold receives totalThreshold and the per-category array
// receives categoryThreshold. The array is copied once, through the
// bounds-checked memcpy_s; the earlier unchecked memcpy of the same bytes was
// redundant and has been dropped.
//
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    config->totalThreshold = totalThreshold;
    memcpy_s(config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t), categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t));
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
|
||||
if (pFsAccess->isRootUser() == true) {
|
||||
totalThreshold = config->totalThreshold;
|
||||
memcpy(categoryThreshold, config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t));
|
||||
memcpy_s(categoryThreshold, maxRasErrorCategoryCount * sizeof(uint64_t), config->detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t));
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
|
||||
@@ -42,9 +60,41 @@ ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
|
||||
properties.subdeviceId = subdeviceId;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
// Aggregates the RAS counters of every registered source into `state`.
//
// When `clear` is requested by a non-root user the call is rejected with
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS before any source is queried.
// Sources that fail are ignored. Counters of successful sources are added on
// top of whatever `state.category` already holds.
//
// Returns ZE_RESULT_SUCCESS if at least one source succeeded, otherwise
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if ((clear == true) && (pFsAccess->isRootUser() == false)) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    bool anySourceSucceeded = false;
    for (const auto &source : rasSources) {
        zes_ras_state_t sourceState = {};
        if (source->osRasGetState(sourceState, clear) == ZE_RESULT_SUCCESS) {
            for (uint32_t category = 0; category < maxRasErrorCategoryCount; ++category) {
                state.category[category] += sourceState.category[category];
            }
            anySourceSucceeded = true;
        }
    }

    return anySourceSucceeded ? ZE_RESULT_SUCCESS : ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
|
||||
|
||||
void LinuxRasImp::initSources() {
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a RAS handle implementation for the given error type and (sub)device.
// Caches the LinuxSysmanImp and its FsAccess, then registers the RAS sources.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) {
    auto linuxSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = linuxSysman;
    pFsAccess = &linuxSysman->getFsAccess();
    // Must run last: initSources() reads pLinuxSysmanImp and the fields set above.
    initSources();
}
|
||||
|
||||
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
|
||||
|
||||
@@ -11,9 +11,20 @@
|
||||
#include "level_zero/tools/source/sysman/ras/os_ras.h"
|
||||
#include "level_zero/tools/source/sysman/sysman_const.h"
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace L0 {
|
||||
class FsAccess;
|
||||
class SysfsAccess;
|
||||
class PmuInterface;
|
||||
class LinuxSysmanImp;
|
||||
class LinuxRasSources;
|
||||
class FirmwareUtil;
|
||||
struct Device;
|
||||
|
||||
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
|
||||
@@ -28,12 +39,75 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
std::vector<std::unique_ptr<L0::LinuxRasSources>> rasSources = {};
|
||||
|
||||
private:
|
||||
void initSources();
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
uint64_t totalThreshold = 0;
|
||||
uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0};
|
||||
};
|
||||
|
||||
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
|
||||
virtual ~LinuxRasSources() = default;
|
||||
};
|
||||
|
||||
class LinuxRasSourceGt : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
|
||||
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasSourceGt() = default;
|
||||
~LinuxRasSourceGt() override;
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
PmuInterface *pPmuInterface = nullptr;
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
SysfsAccess *pSysfsAccess = nullptr;
|
||||
|
||||
private:
|
||||
void initRasErrors(ze_bool_t clear);
|
||||
ze_result_t getPmuConfig(
|
||||
const std::string &eventDirectory,
|
||||
const std::vector<std::string> &listOfEvents,
|
||||
const std::string &errorFileToGetConfig,
|
||||
std::string &pmuConfig);
|
||||
ze_result_t getBootUpErrorCountFromSysfs(
|
||||
std::string nameOfError,
|
||||
const std::string &errorCounterDir,
|
||||
uint64_t &errorVal);
|
||||
void closeFds();
|
||||
int64_t groupFd = -1;
|
||||
std::vector<int64_t> memberFds = {};
|
||||
uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};
|
||||
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
|
||||
uint64_t totalEventCount = 0;
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
class LinuxRasSourceHbm : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
|
||||
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
|
||||
LinuxRasSourceHbm() = default;
|
||||
~LinuxRasSourceHbm() override{};
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FirmwareUtil *pFwInterface = nullptr;
|
||||
Device *pDevice = nullptr;
|
||||
|
||||
private:
|
||||
uint64_t errorBaseline = 0;
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
|
||||
#include "level_zero/tools/source/sysman/sysman_imp.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
#include "level_zero/tools/source/sysman/firmware_util/firmware_util.h"
|
||||
#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/os_interface/linux/system_info.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h"
|
||||
|
||||
#include "drm/intel_hwconfig_types.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
// True when pLinuxSysmanImp reports an HBM2e or HBM2 memory type.
static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) {
    const uint32_t reportedType = pLinuxSysmanImp->getMemoryType();
    const bool isHbm2e = (reportedType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e);
    const bool isHbm2 = (reportedType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2);
    return isHbm2e || isHbm2;
}
|
||||
|
||||
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
|
||||
|
||||
constexpr auto maxErrorTypes = 2;
|
||||
LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
if (errorType.size() < maxErrorTypes) {
|
||||
auto pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the stored total and per-category thresholds into *config.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    constexpr auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);

    memcpy_s(config->detailedThresholds.category, thresholdBytes, categoryThreshold, thresholdBytes);
    config->totalThreshold = totalThreshold;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Stores the thresholds from *config. Requires pFsAccess->isRootUser();
// otherwise logs a debug message and returns
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS without changing any state.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    if (!pFsAccess->isRootUser()) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    constexpr auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    totalThreshold = config->totalThreshold;
    memcpy_s(categoryThreshold, thresholdBytes, config->detailedThresholds.category, thresholdBytes);
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Fills `properties` from this object's error type and sub-device fields and
// clears the pNext pointer. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    properties.type = osRasErrorType;
    properties.onSubdevice = isSubdevice;
    properties.subdeviceId = subdeviceId;
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Sums the per-category counters of all RAS sources into `state`.
//
// A clear request from a non-root user is refused up front with
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS. Failing sources are skipped; the
// result is ZE_RESULT_SUCCESS once any source has succeeded and
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE otherwise. Counters are added to the
// existing contents of state.category.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    const bool clearRequested = (clear == true);
    if (clearRequested && !pFsAccess->isRootUser()) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    ze_result_t overallResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    for (const auto &source : rasSources) {
        zes_ras_state_t perSourceState = {};
        if (source->osRasGetState(perSourceState, clear) == ZE_RESULT_SUCCESS) {
            for (uint32_t idx = 0; idx < maxRasErrorCategoryCount; ++idx) {
                state.category[idx] += perSourceState.category[idx];
            }
            overallResult = ZE_RESULT_SUCCESS;
        }
    }
    return overallResult;
}
|
||||
|
||||
void LinuxRasImp::initSources() {
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
|
||||
}
|
||||
}
|
||||
|
||||
// Stores the error type and sub-device info, caches the LinuxSysmanImp and its
// FsAccess, then creates the RAS sources.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) {
    auto sysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    pFsAccess = &sysmanImp->getFsAccess();
    pLinuxSysmanImp = sysmanImp;
    // initSources() depends on pLinuxSysmanImp, so it is called after the assignments.
    initSources();
}
|
||||
|
||||
// Factory: heap-allocates a LinuxRasImp and returns it as an OsRas pointer.
// The returned object was created with `new`.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    return new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
}
|
||||
|
||||
} // namespace L0
|
||||
@@ -1,113 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/helpers/non_copyable_or_moveable.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/ras/os_ras.h"
|
||||
#include "level_zero/tools/source/sysman/sysman_const.h"
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace L0 {
|
||||
class FsAccess;
|
||||
class SysfsAccess;
|
||||
class PmuInterface;
|
||||
class LinuxSysmanImp;
|
||||
class LinuxRasSources;
|
||||
class FirmwareUtil;
|
||||
struct Device;
|
||||
|
||||
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
|
||||
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
|
||||
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasImp() = default;
|
||||
~LinuxRasImp() override = default;
|
||||
|
||||
protected:
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
std::vector<std::unique_ptr<L0::LinuxRasSources>> rasSources = {};
|
||||
|
||||
private:
|
||||
void initSources();
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
uint64_t totalThreshold = 0;
|
||||
uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0};
|
||||
};
|
||||
|
||||
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
|
||||
virtual ~LinuxRasSources() = default;
|
||||
};
|
||||
|
||||
class LinuxRasSourceGt : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
|
||||
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasSourceGt() = default;
|
||||
~LinuxRasSourceGt() override;
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
PmuInterface *pPmuInterface = nullptr;
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
SysfsAccess *pSysfsAccess = nullptr;
|
||||
|
||||
private:
|
||||
void initRasErrors(ze_bool_t clear);
|
||||
ze_result_t getPmuConfig(
|
||||
const std::string &eventDirectory,
|
||||
const std::vector<std::string> &listOfEvents,
|
||||
const std::string &errorFileToGetConfig,
|
||||
std::string &pmuConfig);
|
||||
ze_result_t getBootUpErrorCountFromSysfs(
|
||||
std::string nameOfError,
|
||||
const std::string &errorCounterDir,
|
||||
uint64_t &errorVal);
|
||||
void closeFds();
|
||||
int64_t groupFd = -1;
|
||||
std::vector<int64_t> memberFds = {};
|
||||
uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};
|
||||
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
|
||||
uint64_t totalEventCount = 0;
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
class LinuxRasSourceHbm : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
|
||||
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
|
||||
LinuxRasSourceHbm() = default;
|
||||
~LinuxRasSourceHbm() override{};
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FirmwareUtil *pFwInterface = nullptr;
|
||||
Device *pDevice = nullptr;
|
||||
|
||||
private:
|
||||
uint64_t errorBaseline = 0;
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (C) 2020-2022 Intel Corporation
|
||||
# Copyright (C) 2020-2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
@@ -7,15 +7,10 @@
|
||||
set(L0_TESTS_TOOLS_SYSMAN_GLOBAL_OPERATIONS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations_helper.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_global_operations.h
|
||||
)
|
||||
|
||||
if(NEO_ENABLE_i915_PRELIM_DETECTION)
|
||||
list(APPEND L0_TESTS_TOOLS_SYSMAN_GLOBAL_OPERATIONS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_global_operations_prelim.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
if(UNIX)
|
||||
target_sources(${TARGET_NAME}
|
||||
PRIVATE
|
||||
|
||||
@@ -31,6 +31,7 @@ struct MockMemoryManagerSysman : public MemoryManagerMock {
|
||||
};
|
||||
|
||||
struct MockMemoryNeoDrm : public Drm {
|
||||
using Drm::ioctlHelper;
|
||||
using Drm::memoryInfo;
|
||||
const int mockFd = 33;
|
||||
MockMemoryNeoDrm(RootDeviceEnvironment &rootDeviceEnvironment) : Drm(std::make_unique<HwDeviceIdDrm>(mockFd, ""), rootDeviceEnvironment) {}
|
||||
|
||||
@@ -33,6 +33,7 @@ class SysmanDeviceMemoryFixture : public SysmanDeviceFixture {
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager);
|
||||
|
||||
pDrm = new MockMemoryNeoDrm(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment()));
|
||||
pDrm->ioctlHelper = static_cast<std::unique_ptr<NEO::IoctlHelper>>(std::make_unique<IoctlHelperUpstream>(*pDrm));
|
||||
|
||||
pSysmanDevice = device->getSysmanHandle();
|
||||
pSysmanDeviceImp = static_cast<SysmanDeviceImp *>(pSysmanDevice);
|
||||
@@ -194,7 +195,6 @@ TEST_F(SysmanDeviceMemoryFixture, GivenValidMemoryHandleWhenGettingStateThenCall
|
||||
|
||||
for (auto handle : handles) {
|
||||
zes_mem_state_t state;
|
||||
|
||||
ze_result_t result = zesMemoryGetState(handle, &state);
|
||||
|
||||
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
|
||||
|
||||
@@ -6,20 +6,10 @@
|
||||
|
||||
set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_sysman_ras.h
|
||||
)
|
||||
|
||||
if(NEO_ENABLE_i915_PRELIM_DETECTION)
|
||||
list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_prelim.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_prelim.h
|
||||
)
|
||||
else()
|
||||
list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h
|
||||
)
|
||||
endif()
|
||||
|
||||
if(UNIX)
|
||||
target_sources(${TARGET_NAME}
|
||||
PRIVATE
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "level_zero/tools/source/sysman/linux/fs_access.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/ras.h"
|
||||
#include "level_zero/tools/source/sysman/ras/ras_imp.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
struct MockRasFsAccess : public FsAccess {
|
||||
bool mockRootUser = true;
|
||||
bool isRootUser() override {
|
||||
return mockRootUser;
|
||||
}
|
||||
MockRasFsAccess() = default;
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "level_zero/tools/source/sysman/linux/fs_access.h"
|
||||
#include "level_zero/tools/source/sysman/linux/os_sysman_imp.h"
|
||||
#include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/ras.h"
|
||||
#include "level_zero/tools/source/sysman/ras/ras_imp.h"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -7,29 +7,62 @@
|
||||
|
||||
#include "level_zero/tools/source/sysman/sysman_const.h"
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h"
|
||||
|
||||
#include "mock_fs_ras.h"
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h"
|
||||
|
||||
extern bool sysmanUltsEnable;
|
||||
|
||||
class OsRas;
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
constexpr uint32_t mockHandleCount = 0;
|
||||
constexpr uint32_t mockHandleCount = 2u;
|
||||
constexpr uint32_t mockHandleCountForSubDevice = 4u;
|
||||
struct SysmanRasFixture : public SysmanDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<MockRasFsAccess> pFsAccess;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
std::unique_ptr<MockRasSysfsAccess> pSysfsAccess;
|
||||
std::unique_ptr<MockRasPmuInterfaceImp> pPmuInterface;
|
||||
std::unique_ptr<MockRasFwInterface> pRasFwUtilInterface;
|
||||
std::unique_ptr<MockRasNeoDrm> pDrm;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
Drm *pOriginalDrm = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pFwUtilOriginal = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<MockMemoryManagerInRasSysman>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pFsAccess = std::make_unique<MockRasFsAccess>();
|
||||
pSysfsAccess = std::make_unique<MockRasSysfsAccess>();
|
||||
pRasFwUtilInterface = std::make_unique<MockRasFwInterface>();
|
||||
pDrm = std::make_unique<MockRasNeoDrm>(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment()));
|
||||
pDrm->ioctlHelper = static_cast<std::unique_ptr<NEO::IoctlHelper>>(std::make_unique<IoctlHelperPrelim20>(*pDrm));
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pOriginalDrm = pLinuxSysmanImp->pDrm;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pFsAccess->mockRootUser = true;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
|
||||
pPmuInterface = std::make_unique<MockRasPmuInterfaceImp>(pLinuxSysmanImp);
|
||||
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
|
||||
pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e);
|
||||
pLinuxSysmanImp->pDrm = pDrm.get();
|
||||
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
@@ -44,10 +77,14 @@ struct SysmanRasFixture : public SysmanDeviceFixture {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
|
||||
pLinuxSysmanImp->pDrm = pOriginalDrm;
|
||||
SysmanDeviceFixture::TearDown();
|
||||
}
|
||||
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
@@ -55,16 +92,7 @@ struct SysmanRasFixture : public SysmanDeviceFixture {
|
||||
}
|
||||
};
|
||||
|
||||
// A freshly constructed RasHandleContext must report mockHandleCount handles
// when queried for the count only (null handle array).
TEST_F(SysmanRasFixture, GivenValidRasContextWhenRetrievingRasHandlesThenSuccessIsReturned) {
    auto rasContext = std::make_unique<RasHandleContext>(pSysmanDeviceImp->pOsSysman);

    uint32_t handleCount = 0;
    const ze_result_t queryResult = rasContext->rasGet(&handleCount, nullptr);

    EXPECT_EQ(ZE_RESULT_SUCCESS, queryResult);
    EXPECT_EQ(handleCount, mockHandleCount);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRasErrorSetsThenCorrectCountIsReported) {
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) {
|
||||
uint32_t count = 0;
|
||||
ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
@@ -74,102 +102,717 @@ TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRasErrorSetsThenCorrectCountI
|
||||
result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, NULL);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(testcount, mockHandleCount);
|
||||
|
||||
count = 0;
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
EXPECT_EQ(count, mockHandleCount);
|
||||
|
||||
RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle());
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS);
|
||||
EXPECT_EQ(count, mockHandleCount + 1);
|
||||
|
||||
testcount = count;
|
||||
|
||||
handles.resize(testcount);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, handles.data()), ZE_RESULT_SUCCESS);
|
||||
EXPECT_EQ(testcount, mockHandleCount + 1);
|
||||
EXPECT_NE(nullptr, handles.data());
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.pop_back();
|
||||
delete pTestRasImp;
|
||||
auto handles = getRasHandles(mockHandleCount);
|
||||
for (auto handle : handles) {
|
||||
EXPECT_NE(handle, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
// Enumerates mockHandleCount RAS handles and checks the properties reported for each one.
// The first handle is expected to be the correctable error set; every later handle is
// expected to be uncorrectable.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    // `handles` is declared exactly once, using the handles enumerated through the API
    // as the other tests of this fixture do (no manually injected RasImp).
    auto handles = getRasHandles(mockHandleCount);
    bool correctable = true;

    for (auto handle : handles) {
        ASSERT_NE(nullptr, handle);
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(properties.pNext, nullptr);
        EXPECT_EQ(properties.onSubdevice, false);
        EXPECT_EQ(properties.subdeviceId, 0u);
        // The error type depends on the handle position, so it is only checked here
        // (an unconditional CORRECTABLE check would contradict the else branch).
        if (correctable == true) {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
            correctable = false;
        } else {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
        }
    }
}
|
||||
|
||||
// Checks that LinuxRasSourceGt::getSupportedRasErrorTypes() adds nothing to the set
// when the sysfs mock has mockReadSymLinkResult enabled (per the test name, this is the
// "readSymLink fails" scenario).
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) {
    std::set<zes_ras_error_type_t> errorType = {};

    pSysfsAccess->mockReadSymLinkResult = true;

    LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, device->toHandle());
    EXPECT_EQ(errorType.size(), 0u);
}
|
||||
|
||||
// With mockReadDirectoryFailure enabled on the fs-access mock, the GT RAS source is
// expected to report no supported error types.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) {
    pFsAccess->mockReadDirectoryFailure = true;

    std::set<zes_ras_error_type_t> supportedTypes{};
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);

    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
// With no firmware-utility interface installed, the HBM RAS source is expected to
// report no supported error types.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) {
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    std::set<zes_ras_error_type_t> supportedTypes{};
    LinuxRasSourceHbm::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);

    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
// With mockReadDirectoryWithoutRasEvents set and no firmware interface, enumerating
// RAS error sets must succeed and report zero handles, even when the caller passes
// a non-zero count.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) {
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, 0u);

    uint32_t oversizedCount = handleCount + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr));
    EXPECT_EQ(oversizedCount, 0u);
}
|
||||
|
||||
// Memory type is set to LPDDR4 (not HBM) and the fs mock has
// mockReadDirectoryWithoutRasEvents set; enumeration must succeed with zero handles.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) {
    pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4);
    pRasFwUtilInterface->mockMemorySuccess = true;
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, 0u);
}
|
||||
|
||||
// Memory type is set to HBM2 and the firmware mock has mockMemorySuccess set;
// enumeration must succeed and report mockHandleCount handles.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) {
    pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2);
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, mockHandleCount);
}
|
||||
|
||||
// Queues a `true` return value on the DRM mock's mockQuerySystemInfoReturnValue,
// disables RAS events and the firmware interface, then expects enumeration to
// succeed with zero handles.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) {
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;
    pDrm->mockQuerySystemInfoReturnValue.push_back(true);
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, 0u);
}
|
||||
|
||||
// Reads the RAS state (no clear) of every enumerated handle with mockPmuReadCorrectable
// set on the PMU mock. The first handle is checked against the correctable counters,
// all following handles against the uncorrectable counters.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadCorrectable = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (size_t idx = 0; idx < rasHandles.size(); ++idx) {
        auto rasHandle = rasHandles[idx];
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));

        if (idx != 0) {
            // Uncorrectable error set.
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalMdfiEastCount + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            continue;
        }

        // Correctable error set (first handle).
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
    }
}
|
||||
|
||||
// Two passes over the enumerated handles with mockPmuReadAfterClear set on the PMU mock:
//   1. clear = 0: the first handle must report the correctable counters, the rest the
//      uncorrectable counters.
//   2. clear = 1: every category of every handle must read back as zero.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterClearThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadAfterClear = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);

    // Pass 1: read without clearing.
    ze_bool_t clear = 0;
    for (size_t idx = 0; idx < rasHandles.size(); ++idx) {
        auto rasHandle = rasHandles[idx];
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clear, &rasState));

        if (idx != 0) {
            // Uncorrectable error set.
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            continue;
        }

        // Correctable error set (first handle).
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
    }

    // Pass 2: read with clear requested; correctable and uncorrectable sets share the
    // same all-zero expectation.
    clear = 1;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clear, &rasState));
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
    }
}
|
||||
|
||||
// With mockPmuReadResult set on the PMU mock and mockMemorySuccess set on the firmware
// mock, the non-compute category must equal the HBM error counts: correctable for the
// first handle, uncorrectable for the rest.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (size_t idx = 0; idx < rasHandles.size(); ++idx) {
        auto rasHandle = rasHandles[idx];
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        if (idx == 0) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        } else {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
        }
    }
}
|
||||
|
||||
// HBM variant of the read-then-clear check: the first pass (clear = 0) must return the
// HBM correctable/uncorrectable counts, the second pass (clear = 1) must return zero
// in the non-compute category for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithClearThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);

    // Pass 1: read without clearing.
    ze_bool_t clear = 0;
    for (size_t idx = 0; idx < rasHandles.size(); ++idx) {
        auto rasHandle = rasHandles[idx];
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clear, &rasState));
        if (idx == 0) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        } else {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
        }
    }

    // Pass 2: read with clear requested; both error sets expect zero.
    clear = 1;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clear, &rasState));
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
    }
}
|
||||
|
||||
// Requesting a clear through zesRasGetState() with the fs mock's mockRootUser flag set
// must be rejected with ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) {
    pFsAccess->mockRootUser = true;

    const ze_bool_t clear = 1;
    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasGetState(rasHandle, clear, &rasState));
    }
}
|
||||
|
||||
// With mockReadFileFailure set on the fs mock, zesRasGetState() must return
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockReadFileFailure = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
|
||||
|
||||
// With mockPerfEvent set on the PMU mock, zesRasGetState() must return
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPerfEvent = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
|
||||
|
||||
// With mockPmuReadResult set on the PMU mock, a non-clearing zesRasGetState() must
// return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPmuReadResult = true;

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
|
||||
|
||||
// Same as the non-clearing PMU-read-failure test, but zesRasGetState() is called with
// clear = 1; ZE_RESULT_ERROR_UNSUPPORTED_FEATURE is still expected for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPmuReadResult = true;

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 1, &rasState));
    }
}
|
||||
|
||||
// With mockReadVal set on the fs mock, zesRasGetState() must return
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockReadVal = true;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
|
||||
|
||||
// With mockReadVal set on the fs mock and no firmware-utility interface installed,
// zesRasGetState() must return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly
// enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) {

    pFsAccess->mockReadVal = true;

    pLinuxSysmanImp->pFwUtilInterface = nullptr;
    for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
        delete handle;
    }
    pSysmanDeviceImp->pRasHandleContext->handleList.clear();
    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state));
    }
    // No manual handle was pushed by this test, so nothing is popped or deleted here;
    // the handles created through enumeration stay owned by the handle context.
}
|
||||
|
||||
// Writes a RAS config (totalThreshold = 50, every detailed-threshold byte set to 1)
// through zesRasSetConfig() and checks that zesRasGetConfig() returns the same values
// for every enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) {
    // `handles` is declared exactly once, from the handles enumerated through the API.
    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        ASSERT_NE(nullptr, handle);
        zes_ras_config_t setConfig = {};
        zes_ras_config_t getConfig = {};
        setConfig.totalThreshold = 50;
        // Byte-wise fill: each uint64_t category threshold becomes 0x0101010101010101.
        memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t));

        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(handle, &setConfig));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(handle, &getConfig));
        EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold);
        int compare = std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, maxRasErrorCategoryCount * sizeof(uint64_t));
        EXPECT_EQ(0, compare);
    }
}
|
||||
|
||||
// zesRasSetConfig() must be rejected with ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS for
// every enumerated handle when the fs mock's mockRootUser flag is set.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) {
    // Same flag value as the insufficient-permission zesRasGetState test in this file.
    // NOTE(review): presumably the mock treats `true` as "simulate a non-root caller" — confirm against MockRasFsAccess.
    pFsAccess->mockRootUser = true;

    auto handles = getRasHandles(mockHandleCount);

    for (auto handle : handles) {
        ASSERT_NE(nullptr, handle);
        zes_ras_config_t setConfig = {};
        setConfig.totalThreshold = 50;
        // Byte-wise fill of all detailed category thresholds.
        memset(setConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t));
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(handle, &setConfig));
    }
    pSysmanDeviceImp->pRasHandleContext->releaseRasHandles();
}
|
||||
|
||||
// Deleting a RasImp whose pOsRas pointer is null must not throw.
// NOTE(review): these lines were fused with the following test; kept as a separate test
// on the assumption that RasImp is still default-constructible — confirm.
TEST_F(SysmanRasFixture, GivenValidInstanceWhenOsRasImplementationIsNullThenDestructorIsCalledWithoutException) {
    RasImp *pTestRasImp = new RasImp();
    pTestRasImp->pOsRas = nullptr;
    EXPECT_NO_THROW(delete pTestRasImp;); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks)
}

// With the sysfs mock's mockReadSymLinkStatus set to ZE_RESULT_ERROR_NOT_AVAILABLE,
// zesRasGetState() must return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly
// enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {

    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
        delete handle;
    }
    pSysmanDeviceImp->pRasHandleContext->handleList.clear();
    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(handle, 0, &state));
    }
}
|
||||
|
||||
// With the sysfs mock's mockReadSymLinkStatus set to ZE_RESULT_ERROR_NOT_AVAILABLE,
// zesRasGetState() must return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly
// enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
|
||||
|
||||
// With the fs mock's mockListDirectoryStatus set to ZE_RESULT_ERROR_NOT_AVAILABLE,
// zesRasGetState() must return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every freshly
// enumerated handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Drop any existing handles before enumerating again.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t rasState{};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
|
||||
|
||||
// Enumerates once, wipes the handle list, calls reInitSysmanDeviceResources() and
// checks that a second enumeration again reports mockHandleCount handles.
TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) {
    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, mockHandleCount);

    // Delete and forget every handle so the context starts from zero.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &rasHandle : rasHandleList) {
        delete rasHandle;
    }
    rasHandleList.clear();

    pLinuxSysmanImp->reInitSysmanDeviceResources();

    handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, mockHandleCount);
}
|
||||
|
||||
struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<MockRasFsAccess> pFsAccess;
|
||||
std::unique_ptr<MockRasSysfsAccess> pSysfsAccess;
|
||||
std::unique_ptr<MockRasPmuInterfaceImp> pPmuInterface;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
std::unique_ptr<MockRasFwInterface> pRasFwUtilInterface;
|
||||
std::unique_ptr<MockRasNeoDrm> pDrm;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pFwUtilOriginal = nullptr;
|
||||
Drm *pOriginalDrm = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanMultiDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<MockMemoryManagerInRasSysman>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pDrm = std::make_unique<MockRasNeoDrm>(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment()));
|
||||
pDrm->ioctlHelper = static_cast<std::unique_ptr<NEO::IoctlHelper>>(std::make_unique<IoctlHelperPrelim20>(*pDrm));
|
||||
pFsAccess = std::make_unique<MockRasFsAccess>();
|
||||
pSysfsAccess = std::make_unique<MockRasSysfsAccess>();
|
||||
pRasFwUtilInterface = std::make_unique<MockRasFwInterface>();
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pOriginalDrm = pLinuxSysmanImp->pDrm;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
|
||||
pPmuInterface = std::make_unique<MockRasPmuInterfaceImp>(pLinuxSysmanImp);
|
||||
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
|
||||
pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e);
|
||||
pLinuxSysmanImp->pDrm = pDrm.get();
|
||||
|
||||
pFsAccess->mockReadDirectoryForMultiDevice = true;
|
||||
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
if (subDeviceCount == 0) {
|
||||
deviceHandles.resize(1, device->toHandle());
|
||||
} else {
|
||||
deviceHandles.resize(subDeviceCount, nullptr);
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
|
||||
}
|
||||
}
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
|
||||
pLinuxSysmanImp->pDrm = pOriginalDrm;
|
||||
SysmanMultiDeviceFixture::TearDown();
|
||||
}
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
return handles;
|
||||
}
|
||||
};
|
||||
// Builds a standalone RasHandleContext for the multi-device fixture and checks that a
// count-only rasGet() query succeeds and reports at least one handle.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) {
    // Owned by a unique_ptr so the context is released on every exit path.
    auto pRasHandleContext = std::make_unique<RasHandleContext>(pSysmanDeviceImp->pOsSysman);

    uint32_t count = 0;
    ze_result_t result = pRasHandleContext->rasGet(&count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    // EXPECT_GT prints the actual count on failure, unlike comparing a bool to true.
    EXPECT_GT(count, 0u);
}
|
||||
// Count-only enumeration must report mockHandleCountForSubDevice, an oversized count
// must be corrected to the same value, and every enumerated handle must be non-null.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) {
    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, mockHandleCountForSubDevice);

    uint32_t oversizedCount = handleCount + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr));
    EXPECT_EQ(oversizedCount, mockHandleCountForSubDevice);

    auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
    }
}
|
||||
// For each device handle collected by the fixture, constructs a PublicLinuxRasImp for
// the correctable error type and checks that osRasGetProperties() reports the same
// sub-device flag and id as the device's own properties.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    for (auto deviceHandle : deviceHandles) {
        zes_ras_properties_t properties = {};
        ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
        Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
        bool isSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;

        // Scoped ownership: the object is released at the end of each iteration.
        auto pLinuxRasImp = std::make_unique<PublicLinuxRasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, deviceProperties.subdeviceId);
        EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties));
        EXPECT_EQ(properties.subdeviceId, deviceProperties.subdeviceId);
        EXPECT_EQ(properties.onSubdevice, isSubDevice);
        EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
    }
}
|
||||
|
||||
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    // Select the per-tile PMU read path of the mock and mark the sysfs mock as multi-tile.
    pPmuInterface->mockPmuReadTile = true;
    pSysfsAccess->isMultiTileArch = true;

    // Drop the handles created earlier so that enumeration rebuilds them with the flags above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    auto handles = getRasHandles(mockHandleCountForSubDevice);
    for (uint32_t handleIndex = 0u; handleIndex < handles.size(); ++handleIndex) {
        auto handle = handles[handleIndex];
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));

        switch (handleIndex) {
        case 0u: {
            // No. of correctable error type for subdevice 0
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctablel3Bank + initialCorrectableCacheErrorTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        }
        case 1u: {
            // No. of uncorrectable error type for subdevice 0
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0);
            break;
        }
        case 2u: {
            // No. of correctable error type for subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        }
        case 3u: {
            // No. of uncorrectable error type for subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1);
            break;
        }
        default:
            // Handles beyond the fourth carry no expectations in this test.
            break;
        }
    }
}
|
||||
|
||||
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    // Mock flags for this scenario: PMU read result and firmware memory success.
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    auto handles = getRasHandles(mockHandleCountForSubDevice);
    for (uint32_t handleIndex = 0u; handleIndex < handles.size(); ++handleIndex) {
        auto handle = handles[handleIndex];
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));

        switch (handleIndex) {
        case 0u: // No. of correctable error type for subdevice 0
        case 2u: // No. of correctable error type for subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
            break;
        case 1u: // No. of uncorrectable error type for subdevice 0
        case 3u: // No. of uncorrectable error type for subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
            break;
        default:
            break;
        }
    }
}
|
||||
|
||||
class SysmanRasAffinityMaskFixture : public SysmanRasMultiDeviceFixture {
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
NEO::DebugManager.flags.ZE_AFFINITY_MASK.set("0.1");
|
||||
SysmanRasMultiDeviceFixture::SetUp();
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanRasMultiDeviceFixture::TearDown();
|
||||
}
|
||||
DebugManagerStateRestore restorer;
|
||||
};
|
||||
|
||||
TEST_F(SysmanRasAffinityMaskFixture, GivenAffinityMaskIsSetWhenCallingRasPropertiesThenPropertiesAreReturnedForTheSubDevicesAccordingToAffinityMask) {
    // With the affinity mask applied the enumeration must report mockHandleCount sets.
    uint32_t count = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr));
    EXPECT_EQ(count, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    for (uint32_t handleIndex = 0u; handleIndex < handles.size(); ++handleIndex) {
        auto handle = handles[handleIndex];
        ASSERT_NE(nullptr, handle);
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(properties.pNext, nullptr);
        EXPECT_EQ(properties.onSubdevice, true);
        EXPECT_EQ(properties.subdeviceId, 1u); // Affinity mask 0.1 is set which means only subdevice 1 is exposed

        // First handle is expected to be the correctable set, second the uncorrectable one.
        switch (handleIndex) {
        case 0u:
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
            break;
        case 1u:
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
            break;
        default:
            break;
        }
    }
}
|
||||
|
||||
} // namespace ult
|
||||
|
||||
@@ -1,819 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/tools/source/sysman/sysman_const.h"
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h"
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h"
|
||||
|
||||
extern bool sysmanUltsEnable;
|
||||
|
||||
class OsRas;
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
constexpr uint32_t mockHandleCount = 2u;
|
||||
constexpr uint32_t mockHandleCountForSubDevice = 4u;
|
||||
struct SysmanRasFixture : public SysmanDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<MockRasFsAccess> pFsAccess;
|
||||
std::unique_ptr<MockRasSysfsAccess> pSysfsAccess;
|
||||
std::unique_ptr<MockRasPmuInterfaceImp> pPmuInterface;
|
||||
std::unique_ptr<MockRasFwInterface> pRasFwUtilInterface;
|
||||
std::unique_ptr<MockRasNeoDrm> pDrm;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
Drm *pOriginalDrm = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pFwUtilOriginal = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<MockMemoryManagerInRasSysman>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pFsAccess = std::make_unique<MockRasFsAccess>();
|
||||
pSysfsAccess = std::make_unique<MockRasSysfsAccess>();
|
||||
pRasFwUtilInterface = std::make_unique<MockRasFwInterface>();
|
||||
pDrm = std::make_unique<MockRasNeoDrm>(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment()));
|
||||
pDrm->ioctlHelper = static_cast<std::unique_ptr<NEO::IoctlHelper>>(std::make_unique<IoctlHelperPrelim20>(*pDrm));
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pOriginalDrm = pLinuxSysmanImp->pDrm;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
|
||||
pPmuInterface = std::make_unique<MockRasPmuInterfaceImp>(pLinuxSysmanImp);
|
||||
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
|
||||
pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e);
|
||||
pLinuxSysmanImp->pDrm = pDrm.get();
|
||||
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
if (subDeviceCount == 0) {
|
||||
deviceHandles.resize(1, device->toHandle());
|
||||
} else {
|
||||
deviceHandles.resize(subDeviceCount, nullptr);
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
|
||||
}
|
||||
}
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
|
||||
pLinuxSysmanImp->pDrm = pOriginalDrm;
|
||||
SysmanDeviceFixture::TearDown();
|
||||
}
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
return handles;
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) {
    // Zero count, no buffer: the number of available RAS sets must be reported.
    uint32_t count = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr));
    EXPECT_EQ(count, mockHandleCount);

    // A count larger than available must be brought back to the real number.
    uint32_t testcount = count + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr));
    EXPECT_EQ(testcount, mockHandleCount);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        EXPECT_NE(rasHandles[i], nullptr);
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    // The first handle is expected to be the correctable set, every later one uncorrectable.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        auto rasHandle = rasHandles[i];
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &properties));
        EXPECT_EQ(properties.pNext, nullptr);
        EXPECT_EQ(properties.onSubdevice, false);
        EXPECT_EQ(properties.subdeviceId, 0u);
        if (i != 0u) {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
        } else {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
        }
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) {
    // Mock flag for the read-symlink scenario named in the test.
    pSysfsAccess->mockReadSymLinkResult = true;

    // The GT source must leave the set of supported error types empty.
    std::set<zes_ras_error_type_t> supportedTypes;
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device->toHandle());
    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) {
    // Mock flag for the directory-read failure scenario named in the test.
    pFsAccess->mockReadDirectoryFailure = true;

    // The GT source must leave the set of supported error types empty.
    std::set<zes_ras_error_type_t> supportedTypes;
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);
    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) {
    // Remove the firmware utility interface altogether.
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Without it the HBM source must leave the set of supported error types empty.
    std::set<zes_ras_error_type_t> supportedTypes;
    LinuxRasSourceHbm::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);
    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) {
    // Directory listing without RAS events, and no firmware utility interface.
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Drop existing handles so that enumeration rebuilds them under these conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    uint32_t count = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr));
    EXPECT_EQ(count, 0u);

    // A non-zero requested count must also come back as zero.
    uint32_t testcount = count + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr));
    EXPECT_EQ(testcount, 0u);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) {
    // Non-HBM memory type reported by the DRM mock and a directory listing without RAS events.
    pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4);
    pRasFwUtilInterface->mockMemorySuccess = true;
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;

    // Drop existing handles so that enumeration rebuilds them under these conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    uint32_t count = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr));
    EXPECT_EQ(count, 0u);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) {
    // HBM2 memory type reported by the DRM mock, firmware memory calls succeeding.
    pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2);
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Drop existing handles so that enumeration rebuilds them under these conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    uint32_t count = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr));
    EXPECT_EQ(count, mockHandleCount);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) {
    // No RAS events in the directory listing, one queued `true` return value for the DRM mock's
    // system-info query, and no firmware utility interface.
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;
    pDrm->mockQuerySystemInfoReturnValue.push_back(true);
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Drop existing handles so that enumeration rebuilds them under these conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    uint32_t count = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr));
    EXPECT_EQ(count, 0u);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    // Mock flag selecting the PMU read path used by this scenario.
    pPmuInterface->mockPmuReadCorrectable = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    auto handles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < handles.size(); ++i) {
        auto handle = handles[i];
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));

        if (i != 0u) {
            // Every handle after the first is checked against the uncorrectable counters.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalMdfiEastCount + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
        } else {
            // The first handle is checked against the correctable counters.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        }
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterClearThenSuccessIsReturned) {
    // Mock flag selecting the PMU read path used by the read-after-clear scenario.
    pPmuInterface->mockPmuReadAfterClear = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    auto handles = getRasHandles(mockHandleCount);

    // Pass 1: read without clearing; counters must hold the mocked values.
    const ze_bool_t keepCounters = 0;
    for (uint32_t i = 0u; i < handles.size(); ++i) {
        auto handle = handles[i];
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, keepCounters, &state));

        if (i != 0u) {
            // Every handle after the first is checked against the uncorrectable counters.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
        } else {
            // The first handle is checked against the correctable counters.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        }
    }

    // Pass 2: read with the clear flag set; every category of every handle must read zero.
    const ze_bool_t clearCounters = 1;
    for (auto handle : handles) {
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clearCounters, &state));
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    // Mock flags for this scenario: PMU read result and firmware memory success.
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    auto handles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < handles.size(); ++i) {
        auto handle = handles[i];
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        // First handle: correctable HBM count; any later handle: uncorrectable HBM count.
        if (i != 0u) {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
        } else {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        }
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithClearThenSuccessIsReturned) {
    // Mock flags for this scenario: PMU read result and firmware memory success.
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    auto handles = getRasHandles(mockHandleCount);

    // Pass 1: read without clearing; the HBM counts must be reported.
    const ze_bool_t keepCounters = 0;
    for (uint32_t i = 0u; i < handles.size(); ++i) {
        auto handle = handles[i];
        ASSERT_NE(nullptr, handle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, keepCounters, &state));
        if (i != 0u) {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
        } else {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        }
    }

    // Pass 2: read with the clear flag set; the non-compute category must read zero for all handles.
    const ze_bool_t clearCounters = 1;
    for (auto handle : handles) {
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, clearCounters, &state));
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) {
    // Mock flag that drives the permission check exercised by this scenario.
    pFsAccess->mockRootUser = true;

    // Requesting a clear must be rejected with INSUFFICIENT_PERMISSIONS for every handle.
    const ze_bool_t clearCounters = 1;
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasGetState(rasHandles[i], clearCounters, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Mock flag for the file-read failure scenario named in the test.
    pFsAccess->mockReadFileFailure = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    // Reading the state must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Mock flag for the perf-event-open failure scenario named in the test.
    pPmuInterface->mockPerfEvent = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    // Reading the state must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Mock flag for the PMU-read failure scenario named in the test.
    pPmuInterface->mockPmuReadResult = true;

    // Reading the state (no clear) must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Mock flag for the PMU-read failure scenario named in the test.
    pPmuInterface->mockPmuReadResult = true;

    // Reading the state with the clear flag set must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 1, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Mock flag on the filesystem read used by this scenario.
    pFsAccess->mockReadVal = true;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    // Reading the state must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) {
    // Mock flag on the filesystem read, and no firmware utility interface at all.
    pFsAccess->mockReadVal = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Recreate the RAS handles so they are built under these conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    // Reading the state must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) {
    // Number of bytes of detailed thresholds that are filled and later compared.
    const auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        auto rasHandle = rasHandles[i];
        ASSERT_NE(nullptr, rasHandle);

        // Configuration written through the API: total threshold 50, every threshold byte set to 1.
        zes_ras_config_t setConfig = {};
        setConfig.totalThreshold = 50;
        memset(setConfig.detailedThresholds.category, 1, thresholdBytes);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(rasHandle, &setConfig));

        // Reading it back must return exactly what was written.
        zes_ras_config_t getConfig = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(rasHandle, &getConfig));
        EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold);
        EXPECT_EQ(0, std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, thresholdBytes));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) {
    // Mock flag that drives the permission check exercised by this scenario.
    pFsAccess->mockRootUser = true;

    // Setting a configuration must be rejected with INSUFFICIENT_PERMISSIONS for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_config_t requestedConfig = {};
        requestedConfig.totalThreshold = 50;
        memset(requestedConfig.detailedThresholds.category, 1, maxRasErrorCategoryCount * sizeof(uint64_t));
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(rasHandles[i], &requestedConfig));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Status the sysfs mock is configured with for symlink reads.
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    // Reading the state must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    // Status the sysfs mock is configured with for symlink reads.
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Recreate the RAS handles so they are built with the mock state configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();

    // Reading the state must be reported as unsupported for every handle.
    auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t i = 0u; i < rasHandles.size(); ++i) {
        ASSERT_NE(nullptr, rasHandles[i]);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandles[i], 0, &state));
    }
}
|
||||
|
||||
// Forces the filesystem mock's listDirectory status to ZE_RESULT_ERROR_NOT_AVAILABLE,
// recreates the RAS handles and expects zesRasGetState() to return
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Free and forget the previously created handles before enumerating again.
    auto &existingHandles = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : existingHandles) {
        delete staleHandle;
    }
    existingHandles.clear();

    const auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);

        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
|
||||
|
||||
// Verifies that zesDeviceEnumRasErrorSets() reports mockHandleCount handles both
// before and after the existing handle list is destroyed and
// reInitSysmanDeviceResources() is invoked.
TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) {
    uint32_t count = 0;
    // A null handle array with count == 0 only queries the number of handles.
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Release every handle and empty the list so the context holds zero handles.
    for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
        delete handle;
    }
    pSysmanDeviceImp->pRasHandleContext->handleList.clear();

    pLinuxSysmanImp->reInitSysmanDeviceResources();

    // Query the handle count again after re-initialization.
    count = 0;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);
}
|
||||
|
||||
struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<MockRasFsAccess> pFsAccess;
|
||||
std::unique_ptr<MockRasSysfsAccess> pSysfsAccess;
|
||||
std::unique_ptr<MockRasPmuInterfaceImp> pPmuInterface;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
std::unique_ptr<MockRasFwInterface> pRasFwUtilInterface;
|
||||
std::unique_ptr<MockRasNeoDrm> pDrm;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pFwUtilOriginal = nullptr;
|
||||
Drm *pOriginalDrm = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanMultiDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<MockMemoryManagerInRasSysman>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pDrm = std::make_unique<MockRasNeoDrm>(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment()));
|
||||
pDrm->ioctlHelper = static_cast<std::unique_ptr<NEO::IoctlHelper>>(std::make_unique<IoctlHelperPrelim20>(*pDrm));
|
||||
pFsAccess = std::make_unique<MockRasFsAccess>();
|
||||
pSysfsAccess = std::make_unique<MockRasSysfsAccess>();
|
||||
pRasFwUtilInterface = std::make_unique<MockRasFwInterface>();
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pOriginalDrm = pLinuxSysmanImp->pDrm;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
|
||||
pPmuInterface = std::make_unique<MockRasPmuInterfaceImp>(pLinuxSysmanImp);
|
||||
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
|
||||
pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e);
|
||||
pLinuxSysmanImp->pDrm = pDrm.get();
|
||||
|
||||
pFsAccess->mockReadDirectoryForMultiDevice = true;
|
||||
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
if (subDeviceCount == 0) {
|
||||
deviceHandles.resize(1, device->toHandle());
|
||||
} else {
|
||||
deviceHandles.resize(subDeviceCount, nullptr);
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
|
||||
}
|
||||
}
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
|
||||
pLinuxSysmanImp->pDrm = pOriginalDrm;
|
||||
SysmanMultiDeviceFixture::TearDown();
|
||||
}
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
return handles;
|
||||
}
|
||||
};
|
||||
// Creates a standalone RasHandleContext on top of the fixture's OsSysman and
// checks that rasGet() succeeds and reports at least one handle.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) {
    // Owned by a unique_ptr so the context is released on every exit path.
    auto pRasHandleContext = std::make_unique<RasHandleContext>(pSysmanDeviceImp->pOsSysman);

    uint32_t count = 0;
    ze_result_t result = pRasHandleContext->rasGet(&count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    // EXPECT_GT prints the actual count on failure instead of just "false != true".
    EXPECT_GT(count, 0u);
}
|
||||
// Verifies RAS handle enumeration on the multi-device fixture: the count query
// returns mockHandleCountForSubDevice, an oversized count is clamped back to
// that value, and every returned handle is non-null.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) {
    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCountForSubDevice);

    // Ask for one more handle than exists; the count is expected to be corrected.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCountForSubDevice);

    auto handles = getRasHandles(mockHandleCountForSubDevice);
    for (auto handle : handles) {
        EXPECT_NE(handle, nullptr);
    }
}
|
||||
// For every device handle collected by the fixture, builds a correctable-error
// PublicLinuxRasImp and checks that osRasGetProperties() echoes back the
// sub-device flag, the sub-device id and the error type it was constructed with.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    for (auto deviceHandle : deviceHandles) {
        zes_ras_properties_t properties = {};
        ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
        Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
        bool isSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;

        // unique_ptr releases the object at the end of each iteration without a manual delete.
        auto pLinuxRasImp = std::make_unique<PublicLinuxRasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, deviceProperties.subdeviceId);

        EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties));
        EXPECT_EQ(properties.subdeviceId, deviceProperties.subdeviceId);
        EXPECT_EQ(properties.onSubdevice, isSubDevice);
        EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
    }
}
|
||||
|
||||
// Enables the tile-aware PMU read mock and the multi-tile sysfs mock, rebuilds
// the RAS handles and compares every error category returned by
// zesRasGetState() with the expected per-handle totals.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadTile = true;
    pSysfsAccess->isMultiTileArch = true;

    // Recreate the handles so they are built with the mock settings above.
    auto &existingHandles = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : existingHandles) {
        delete staleHandle;
    }
    existingHandles.clear();

    const auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    uint32_t handleIndex = 0u;
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &state));

        switch (handleIndex) {
        case 0u: {
            // Correctable errors, subdevice 0
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctablel3Bank + initialCorrectableCacheErrorTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        }
        case 1u: {
            // Uncorrectable errors, subdevice 0
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0);
            break;
        }
        case 2u: {
            // Correctable errors, subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        }
        case 3u: {
            // Uncorrectable errors, subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1);
            break;
        }
        default:
            // Handles beyond the fourth are only checked for a successful query.
            break;
        }
        handleIndex++;
    }
}
|
||||
|
||||
// Enables the PMU read-result mock and the firmware mock's memory-success flag,
// rebuilds the RAS handles and checks the non-compute error count reported by
// zesRasGetState() against the HBM correctable/uncorrectable mock counts.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Recreate the handles so they are built with the mock settings above.
    auto &existingHandles = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : existingHandles) {
        delete staleHandle;
    }
    existingHandles.clear();

    const auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    uint32_t handleIndex = 0u;
    for (auto rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &state));

        switch (handleIndex) {
        case 0u: // correctable errors, subdevice 0
        case 2u: // correctable errors, subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
            break;
        case 1u: // uncorrectable errors, subdevice 0
        case 3u: // uncorrectable errors, subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
            break;
        default:
            // Handles beyond the fourth are only checked for a successful query.
            break;
        }
        handleIndex++;
    }
}
|
||||
|
||||
class SysmanRasAffinityMaskFixture : public SysmanRasMultiDeviceFixture {
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
NEO::DebugManager.flags.ZE_AFFINITY_MASK.set("0.1");
|
||||
SysmanRasMultiDeviceFixture::SetUp();
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanRasMultiDeviceFixture::TearDown();
|
||||
}
|
||||
DebugManagerStateRestore restorer;
|
||||
};
|
||||
|
||||
// With the affinity mask fixture active, verifies that mockHandleCount RAS
// handles are enumerated and that each one reports sub-device 1, with the first
// handle being the correctable set and the second the uncorrectable set.
TEST_F(SysmanRasAffinityMaskFixture, GivenAffinityMaskIsSetWhenCallingRasPropertiesThenPropertiesAreReturnedForTheSubDevicesAccordingToAffinityMask) {
    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    uint32_t handleIndex = 0u;
    for (auto handle : handles) {
        ASSERT_NE(nullptr, handle);
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(properties.pNext, nullptr);
        EXPECT_EQ(properties.onSubdevice, true);
        // The fixture sets affinity mask "0.1", so only sub-device 1 is expected to be exposed.
        EXPECT_EQ(properties.subdeviceId, 1u);

        if (handleIndex == 0u) {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
        } else if (handleIndex == 1u) {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
        }
        handleIndex++;
    }
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
@@ -170,7 +170,6 @@ struct MockSchedulerSysfsAccess : public SysfsAccess {
|
||||
}
|
||||
|
||||
ze_result_t write(const std::string file, const uint64_t val) override {
|
||||
|
||||
if (mockWriteFileStatus != ZE_RESULT_SUCCESS) {
|
||||
return mockWriteFileStatus;
|
||||
}
|
||||
@@ -280,6 +279,7 @@ struct MockSchedulerSysfsAccess : public SysfsAccess {
|
||||
engineDirectoryPermissions = permission;
|
||||
}
|
||||
|
||||
ADDMETHOD_NOBASE(write, ze_result_t, ZE_RESULT_SUCCESS, (const std::string file, const int val));
|
||||
MockSchedulerSysfsAccess() = default;
|
||||
|
||||
private:
|
||||
|
||||
@@ -560,7 +560,7 @@ TEST_F(SysmanDeviceSchedulerFixture, GivenValidDeviceHandleWhenCallingzesSchedul
|
||||
}
|
||||
|
||||
TEST_F(SysmanDeviceSchedulerFixture, GivenValidDeviceHandleWhenCallingzesSchedulerSetComputeUnitDebugModeThenUnsupportedFeatureIsReturned) {
|
||||
pSysfsAccess->mockWriteFileStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
pSysfsAccess->writeResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
auto handles = getSchedHandles(handleComponentCount);
|
||||
for (auto handle : handles) {
|
||||
ze_bool_t needReload;
|
||||
|
||||
Reference in New Issue
Block a user