Added RAS ULTs

Related-To: LOCI-3759

Signed-off-by: Bari, Pratik <pratik.bari@intel.com>
This commit is contained in:
Bari, Pratik 2022-12-14 14:46:47 +00:00 committed by Compute-Runtime-Automation
parent a60b5898dc
commit 072963d0f7
7 changed files with 1964 additions and 21 deletions

View File

@ -6,12 +6,17 @@
set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_fs_ras.h
)
if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL ""))
list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
if(NEO_ENABLE_i915_PRELIM_DETECTION)
list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_fabric_prelim.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_prelim.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_fabric_prelim.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_prelim.h
)
else()
list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h
)
@ -23,5 +28,4 @@ if(UNIX)
${L0_TESTS_TOOLS_SYSMAN_RAS_LINUX}
)
endif()
add_subdirectories()

View File

@ -16,16 +16,13 @@ namespace L0 {
namespace ult {
class RasFsAccess : public FsAccess {};
template <>
struct Mock<RasFsAccess> : public RasFsAccess {
MOCK_METHOD(bool, isRootUser, (), (override));
bool userIsRoot() {
return true;
struct MockRasFsAccess : public RasFsAccess {
bool mockRootUser = true;
bool isRootUser() override {
return mockRootUser;
}
bool userIsNotRoot() {
return false;
}
Mock<RasFsAccess>() = default;
MockRasFsAccess() = default;
};
} // namespace ult

View File

@ -0,0 +1,86 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h"
#include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h"
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/fs_access.h"
#include "sysman/linux/os_sysman_imp.h"
#include "sysman/ras/ras.h"
#include "sysman/ras/ras_imp.h"
#include <map>
namespace L0 {
namespace ult {
/**
 * In-memory replacement for FsAccess used by the RAS fabric tests.
 *
 * Tests register a set of readable nodes (path -> counter value) and a set of
 * existing directories; every query is answered from those two containers and
 * never touches the real file system.
 */
class MockRasFabricFsAccess : public FsAccess {
  public:
    ~MockRasFabricFsAccess() override = default;

    /// Returns ZE_RESULT_SUCCESS only for paths registered through setAccessibleNodes().
    ze_result_t canRead(const std::string file) override {
        if (accessibleNodes.find(file) != accessibleNodes.end()) {
            return ZE_RESULT_SUCCESS;
        }
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    /// The mock always reports a root user.
    bool isRootUser() override {
        return true;
    }

    /// Copies the registered value of @p file into @p val; @p val is left
    /// untouched and ZE_RESULT_ERROR_UNKNOWN is returned for unknown paths.
    ze_result_t read(const std::string file, uint64_t &val) override {
        // Single lookup: avoids operator[], which would default-insert on a miss.
        const auto node = accessibleNodes.find(file);
        if (node == accessibleNodes.end()) {
            return ZE_RESULT_ERROR_UNKNOWN;
        }
        val = node->second;
        return ZE_RESULT_SUCCESS;
    }

    /// Replaces the set of readable nodes (the map is copied).
    void setAccessibleNodes(const std::map<std::string, uint64_t> &nodes) {
        accessibleNodes = nodes;
    }

    /// Replaces the set of directories reported as existing (the vector is copied).
    void setAccessibleDirectories(const std::vector<std::string> &dirs) {
        accessibleDirectories = dirs;
    }

    /// True when @p path exactly matches one of the registered directories.
    bool directoryExists(const std::string path) override {
        return std::find(accessibleDirectories.begin(), accessibleDirectories.end(), path) != accessibleDirectories.end();
    }

  private:
    std::map<std::string, uint64_t> accessibleNodes = {};
    std::vector<std::string> accessibleDirectories = {};
};
/**
 * SysfsAccess replacement for the RAS fabric tests: symlink resolution always
 * fails, and real-path resolution yields a fixed mock path together with a
 * status the test can choose.
 */
class MockRasFabricSysFsAccess : public SysfsAccess {
  public:
    // Status handed back by getRealPath(); tests flip it to simulate a failure.
    ze_result_t mockRealPathStatus = ZE_RESULT_SUCCESS;

    // Symlinks are never resolvable through this mock.
    ze_result_t readSymLink(const std::string path, std::string &buf) override {
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    // Appends (does not assign) the mock path to buf, whatever the returned status is.
    ze_result_t getRealPath(const std::string path, std::string &buf) override {
        buf += "/mockRealPath";
        return mockRealPathStatus;
    }
};
// Memory-manager mock used by the RAS sysman fixtures; it only forwards the
// execution environment to MemoryManagerMock.
struct MockMemoryManagerInRasSysman : public MemoryManagerMock {
    // The parameter is already a non-const reference, so it is forwarded as-is
    // (the former const_cast was a no-op).
    MockMemoryManagerInRasSysman(NEO::ExecutionEnvironment &executionEnvironment) : MemoryManagerMock(executionEnvironment) {}
};
} // namespace ult
} // namespace L0

View File

@ -0,0 +1,572 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h"
#include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h"
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/fs_access.h"
#include "sysman/linux/os_sysman_imp.h"
#include "sysman/ras/ras.h"
#include "sysman/ras/ras_imp.h"
using namespace NEO;
namespace L0 {
namespace ult {
// Sysfs link name that the mocked readSymLink() resolves successfully.
const std::string deviceDir("device");
// Events directory that the mocked listDirectory() implementations recognise.
const std::string eventsDir("/sys/devices/i915_0000_03_00.0/events");
// File descriptor returned by the mocked perfEventOpen() on success.
constexpr int64_t mockPmuFd = 10;
// Counter values written by the mocked PMU reads (non-tile variants).
constexpr uint64_t correctableGrfErrorCount = 100u;
constexpr uint64_t correctableEuErrorCount = 75u;
constexpr uint64_t fatalEuErrorCount = 50u;
constexpr uint64_t fatalTlb = 3u;
constexpr uint64_t fatalEngineResetCount = 45u;
// Counter values written by the mocked PMU reads for tile 0.
constexpr uint64_t correctableGrfErrorCountTile0 = 90u;
constexpr uint64_t correctableEuErrorCountTile0 = 70u;
constexpr uint64_t fatalEuErrorCountTile0 = 55u;
constexpr uint64_t fatalEngineResetCountTile0 = 72u;
// Counter values written by the mocked PMU reads for tile 1.
constexpr uint64_t correctableSamplerErrorCountTile1 = 30u;
constexpr uint64_t fatalGucErrorCountTile1 = 40u;
constexpr uint64_t fatalIdiParityErrorCountTile1 = 60u;
constexpr uint64_t correctableGucErrorCountTile1 = 25u;
constexpr uint64_t fatalEngineResetCountTile1 = 85u;
// SoC counter values written by the mocked PMU reads (non-tile, tile 0, tile 1).
constexpr uint64_t socCorrectableFabricSs0_0Count = 2u;
constexpr uint64_t socFatalMdfiEastCount = 3u;
constexpr uint64_t socNonFatalPsfCsc0Count = 5u;
constexpr uint64_t socCorrectableHbmSs0_1CountTile0 = 6u;
constexpr uint64_t socNonFatalPsfCsc0CountTile0 = 6u;
constexpr uint64_t socFatalHbmSs1_15CountTile0 = 7u;
constexpr uint64_t socCorrectableFabricSs1_0CountTile1 = 8u;
constexpr uint64_t socNonFatalPunitCountTile1 = 9u;
constexpr uint64_t socFatalMdfiWestCountTile1 = 0u;
// Additional tile 0 values written by the mocked uncorrectable PMU read.
constexpr uint64_t fatalFpuTile0 = 1u;
constexpr uint64_t FatalL3FabricTile0 = 4u;
// EU-attention values written by the mocked uncorrectable PMU reads.
constexpr uint64_t euAttention = 10u;
constexpr uint64_t euAttentionTile0 = 5u;
constexpr uint64_t euAttentionTile1 = 2u;
// Driver error values written by the mocked uncorrectable PMU reads.
constexpr uint64_t driverMigration = 2u;
constexpr uint64_t driverGgtt = 1u;
constexpr uint64_t driverRps = 2u;
constexpr uint64_t driverEngineOther = 3u;
// NOTE(review): the initial* constants are not referenced in this header;
// presumably they are expected values consumed by the tests - confirm against the test file.
constexpr uint64_t initialCorrectableCacheErrors = 6u;
constexpr uint64_t initialUncorrectableCacheErrors = 7u;
constexpr uint64_t initialEngineReset = 2u;
constexpr uint64_t initialProgrammingErrors = 7u;
constexpr uint64_t initialCorrectableNonComputeErrors = 6u;
constexpr uint64_t initialUncorrectableNonComputeErrors = 13u;
constexpr uint64_t initialUncorrectableComputeErrors = 5u;
constexpr uint64_t initialUncorrectableDriverErrors = 5u;
constexpr uint64_t initialCorrectableCacheErrorsTile1 = 5u;
constexpr uint64_t initialUncorrectableCacheErrorsTile1 = 7u;
constexpr uint64_t initialEngineResetTile1 = 4u;
constexpr uint64_t initialProgrammingErrorsTile1 = 5u;
constexpr uint64_t initialCorrectableNonComputeErrorsTile1 = 4u;
constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 5u;
constexpr uint64_t initialUncorrectableDriverErrorsTile1 = 4u;
// Value stored in data[1] by every populated mocked PMU read.
constexpr uint64_t timeStamp = 1000u;
// Value returned by the mocked FsAccess read(file, uint32_t &) overload.
constexpr uint32_t pmuDriverType = 16u;
// Counts reported by the mocked firmware memory-error query, per error type.
constexpr uint64_t hbmCorrectableErrorCount = 2;
constexpr uint64_t hbmUncorrectableErrorCount = 3;
// Memory-manager mock used by the RAS sysman fixtures; it only forwards the
// execution environment to MemoryManagerMock.
struct MockMemoryManagerInRasSysman : public MemoryManagerMock {
    // The parameter is already a non-const reference, so it is forwarded as-is
    // (the former const_cast was a no-op).
    MockMemoryManagerInRasSysman(NEO::ExecutionEnvironment &executionEnvironment) : MemoryManagerMock(executionEnvironment) {}
};
// Test subclass of PmuInterfaceImp that re-exports perfEventOpen as public so
// that the Mock<> specialization (and tests) can reach it.
class MockPmuInterfaceImpForRas : public PmuInterfaceImp {
  public:
    MockPmuInterfaceImpForRas(LinuxSysmanImp *pLinuxSysmanImp)
        : PmuInterfaceImp(pLinuxSysmanImp) {}

    using PmuInterfaceImp::perfEventOpen;
};
template <>
struct Mock<MockPmuInterfaceImpForRas> : public MockPmuInterfaceImpForRas {
int32_t mockPmuReadCount = 0;
int32_t mockPmuReadCountAfterClear = 0;
int32_t mockPmuReadTileCount = 0;
bool mockPmuReadCorrectable = false;
bool mockPmuReadAfterClear = false;
bool mockPmuReadResult = false;
bool mockPerfEvent = false;
bool mockPmuReadTile = false;
Mock<MockPmuInterfaceImpForRas>(LinuxSysmanImp *pLinuxSysmanImp) : MockPmuInterfaceImpForRas(pLinuxSysmanImp) {}
int64_t perfEventOpen(perf_event_attr *attr, pid_t pid, int cpu, int groupFd, uint64_t flags) override {
if (mockPerfEvent == true) {
return mockedPerfEventOpenAndFailureReturn(attr, pid, cpu, groupFd, flags);
}
return mockPmuFd;
}
int64_t mockedPerfEventOpenAndFailureReturn(perf_event_attr *attr, pid_t pid, int cpu, int groupFd, uint64_t flags) {
return -1;
}
int mockedPmuReadForCorrectableAndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
data[1] = timeStamp;
data[2] = socCorrectableFabricSs0_0Count;
data[3] = 0;
data[4] = correctableGrfErrorCount;
data[5] = correctableEuErrorCount;
return 0;
}
int mockedPmuReadForUncorrectableAndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
data[1] = timeStamp;
data[2] = fatalEngineResetCount;
data[3] = euAttention;
data[4] = driverMigration;
data[5] = driverGgtt;
data[6] = driverRps;
data[7] = 0;
data[8] = 0;
data[9] = socFatalMdfiEastCount;
data[10] = socNonFatalPsfCsc0Count;
data[11] = 0;
data[12] = fatalEuErrorCount;
data[13] = fatalTlb;
return 0;
}
int mockedPmuReadForCorrectableTile0AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
data[1] = timeStamp;
data[2] = 0;
data[3] = socCorrectableHbmSs0_1CountTile0;
data[4] = correctableGrfErrorCountTile0;
data[5] = correctableEuErrorCountTile0;
return 0;
}
int mockedPmuReadForUncorrectableTile0AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
data[1] = timeStamp;
data[2] = fatalEngineResetCountTile0;
data[3] = euAttentionTile0;
data[4] = driverMigration;
data[5] = driverGgtt;
data[6] = driverRps;
data[7] = fatalFpuTile0;
data[8] = FatalL3FabricTile0;
data[9] = socFatalHbmSs1_15CountTile0;
data[10] = 0;
data[11] = socNonFatalPsfCsc0CountTile0;
data[12] = fatalEuErrorCountTile0;
data[13] = 0;
return 0;
}
int mockedPmuReadForCorrectableTile1AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
data[1] = timeStamp;
data[2] = socCorrectableFabricSs1_0CountTile1;
data[3] = correctableGucErrorCountTile1;
data[4] = correctableSamplerErrorCountTile1;
return 0;
}
int mockedPmuReadForUncorrectableTile1AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
data[1] = timeStamp;
data[2] = fatalEngineResetCountTile1;
data[3] = euAttentionTile1;
data[4] = driverMigration;
data[5] = driverEngineOther;
data[6] = socFatalMdfiWestCountTile1;
data[7] = socNonFatalPunitCountTile1;
data[8] = fatalGucErrorCountTile1;
data[9] = fatalIdiParityErrorCountTile1;
return 0;
}
int mockedPmuReadAfterClearAndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
memset(data, 0, sizeOfdata);
return 0;
}
int mockedPmuReadAndFailureReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
return -1;
}
int pmuRead(int fd, uint64_t *data, ssize_t sizeOfdata) override {
if (mockPmuReadResult == true) {
return mockedPmuReadAndFailureReturn(fd, data, sizeOfdata);
}
if (mockPmuReadCorrectable == true) {
if (mockPmuReadCount == 0) {
mockPmuReadCount++;
return mockedPmuReadForCorrectableAndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadCount == 1) {
mockPmuReadCount++;
return mockedPmuReadForUncorrectableAndSuccessReturn(fd, data, sizeOfdata);
}
}
if (mockPmuReadAfterClear == true) {
if (mockPmuReadCountAfterClear == 0) {
mockPmuReadCountAfterClear++;
return mockedPmuReadForCorrectableAndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadCountAfterClear == 1) {
mockPmuReadCountAfterClear++;
return mockedPmuReadForUncorrectableAndSuccessReturn(fd, data, sizeOfdata);
}
else {
mockPmuReadCountAfterClear++;
return mockedPmuReadAfterClearAndSuccessReturn(fd, data, sizeOfdata);
}
}
if (mockPmuReadTile == true) {
if (mockPmuReadTileCount == 0) {
mockPmuReadTileCount++;
return mockedPmuReadForCorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadTileCount == 1) {
mockPmuReadTileCount++;
return mockedPmuReadForUncorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadTileCount == 2) {
mockPmuReadTileCount++;
return mockedPmuReadForCorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadTileCount == 3) {
mockPmuReadTileCount++;
return mockedPmuReadForUncorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
}
}
return 0;
}
};
// Empty FsAccess subclass; serves as the type that Mock<RasFsAccess> specializes and derives from.
class RasFsAccess : public FsAccess {};
// Empty SysfsAccess subclass; serves as the type that Mock<RasSysfsAccess> specializes and derives from.
class RasSysfsAccess : public SysfsAccess {};
/**
 * Sysfs mock for the RAS tests: resolves the "device" symlink to a fixed PCI
 * path and answers reads of a fixed set of error-counter nodes with canned
 * values.
 */
template <>
struct Mock<RasSysfsAccess> : public RasSysfsAccess {
    // Any value other than ZE_RESULT_SUCCESS is returned verbatim by readSymLink().
    ze_result_t mockReadSymLinkStatus = ZE_RESULT_SUCCESS;
    // When true, readSymLink() is routed to the failure helper.
    bool mockReadSymLinkResult = false;

    ze_result_t readSymLink(const std::string file, std::string &val) override {
        if (mockReadSymLinkStatus != ZE_RESULT_SUCCESS) {
            return mockReadSymLinkStatus;
        }
        if (mockReadSymLinkResult) {
            return getValStringSymLinkFailure(file, val);
        }
        // Only the device directory link is known to the mock.
        if (file != deviceDir) {
            return ZE_RESULT_ERROR_NOT_AVAILABLE;
        }
        val = "/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0";
        return ZE_RESULT_SUCCESS;
    }

    ze_result_t getValStringSymLinkFailure(const std::string file, std::string &val) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    // Looks the node up in a fixed table; unknown nodes leave val untouched and
    // yield ZE_RESULT_ERROR_NOT_AVAILABLE.
    ze_result_t read(const std::string file, uint64_t &val) override {
        struct CounterNode {
            const char *path;
            uint64_t value;
        };
        static constexpr CounterNode counterNodes[] = {
            {"gt/gt0/error_counter/correctable_eu_grf", 5u},
            {"gt/gt0/error_counter/correctable_eu_ic", 1u},
            {"gt/gt0/error_counter/fatal_eu_ic", 5u},
            {"gt/gt0/error_counter/fatal_tlb", 2u},
            {"gt/gt0/error_counter/engine_reset", 2u},
            {"gt/gt1/error_counter/correctable_sampler", 2u},
            {"gt/gt1/error_counter/fatal_guc", 6u},
            {"gt/gt1/error_counter/fatal_idi_parity", 1u},
            {"gt/gt1/error_counter/correctable_guc", 3u},
            {"gt/gt1/error_counter/engine_reset", 4u},
            {"gt/gt0/error_counter/eu_attention", 7u},
            {"gt/gt1/error_counter/eu_attention", 5u},
            {"gt/gt0/error_counter/soc_correctable_fabric_ss0_0", 1u},
            {"gt/gt0/error_counter/soc_fatal_mdfi_east", 5u},
            {"gt/gt0/error_counter/soc_nonfatal_psf_csc_0", 3u},
            {"gt/gt0/error_counter/soc_correctable_hbm_ss0_1", 5u},
            {"gt/gt0/error_counter/soc_fatal_hbm_ss1_15", 5u},
            {"gt/gt1/error_counter/soc_correctable_fabric_ss1_0", 4u},
            {"gt/gt1/error_counter/soc_nonfatal_punit", 3u},
            {"gt/gt1/error_counter/soc_fatal_mdfi_west", 2u},
            {"gt/gt0/error_counter/fatal_fpu", 2u},
            {"gt/gt0/error_counter/fatal_l3_fabric", 3u},
            {"gt/gt0/error_counter/driver_ggtt", 2u},
            {"gt/gt0/error_counter/driver_rps", 2u},
            {"error_counter/driver_object_migration", 1u},
            {"gt/gt1/error_counter/driver_engine_other", 3u},
        };

        for (const auto &node : counterNodes) {
            if (file == node.path) {
                val = node.value;
                return ZE_RESULT_SUCCESS;
            }
        }
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }
};
/**
 * File-system mock for the RAS tests.
 *
 * listDirectory() reports a canned list of PMU event names for eventsDir, and
 * the read() overloads return a fixed config string / PMU driver type. Each
 * behaviour can be switched to a failure (or alternative data set) through the
 * public mock* flags.
 */
template <>
struct Mock<RasFsAccess> : public RasFsAccess {
    // Any value other than ZE_RESULT_SUCCESS is returned verbatim by listDirectory().
    ze_result_t mockListDirectoryStatus = ZE_RESULT_SUCCESS;
    // listDirectory() fails with ZE_RESULT_ERROR_NOT_AVAILABLE.
    bool mockReadDirectoryFailure = false;
    // read(file, std::string &) fails with ZE_RESULT_ERROR_NOT_AVAILABLE.
    bool mockReadFileFailure = false;
    // listDirectory() reports only non-error ("-busy") events.
    bool mockReadDirectoryWithoutRasEvents = false;
    // NOTE: inverted meaning - when true, isRootUser() reports a NON-root user.
    bool mockRootUser = false;
    // read(file, uint32_t &) fails with ZE_RESULT_ERROR_NOT_AVAILABLE.
    bool mockReadVal = false;
    // listDirectory() reports the per-gt ("error-gtN--...") event names.
    bool mockReadDirectoryForMultiDevice = false;

    // Declared through the injected-class-name: naming a constructor with a
    // template-id ("Mock<...>()") is no longer valid as of C++20.
    Mock() = default;

    bool directoryExists(const std::string path) override {
        // disables fabric errors
        return false;
    }

    // Appends the mocked event names to @p events. The flags are evaluated in
    // the order: explicit status, forced failure, no-RAS-events, multi-device.
    ze_result_t listDirectory(const std::string directory, std::vector<std::string> &events) override {
        if (mockListDirectoryStatus != ZE_RESULT_SUCCESS) {
            return mockListDirectoryStatus;
        }
        if (mockReadDirectoryFailure) {
            return readDirectoryFailure(directory, events);
        }
        if (mockReadDirectoryWithoutRasEvents) {
            return readDirectoryWithoutRasEvents(directory, events);
        }
        if (mockReadDirectoryForMultiDevice) {
            return readDirectorySuccessForMultiDevice(directory, events);
        }
        if (directory.compare(eventsDir) == 0) {
            events.push_back("bcs0-busy");
            events.push_back("error--correctable-eu-grf");
            events.push_back("error--correctable-eu-ic");
            events.push_back("error--soc-correctable-fabric-ss0-0");
            events.push_back("error--soc-correctable-hbm-ss0-1");
            events.push_back("error--soc-fatal-hbm-ss1-15");
            events.push_back("error--soc-fatal-mdfi-east");
            events.push_back("error--soc-nonfatal-psf-csc-0");
            events.push_back("error--fatal-eu-ic");
            events.push_back("error--fatal-tlb");
            events.push_back("error--engine-reset");
            events.push_back("error--eu-attention");
            events.push_back("error--driver-object-migration");
            events.push_back("error--driver-ggtt");
            events.push_back("error--driver-rps");
            events.push_back("error--fatal-fpu");
            events.push_back("error--fatal-l3-fabric");
            events.push_back("ccs0-busy");
            return ZE_RESULT_SUCCESS;
        }
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    ze_result_t readDirectoryWithoutRasEvents(const std::string directory, std::vector<std::string> &events) {
        if (directory.compare(eventsDir) == 0) {
            events.push_back("bcs0-busy");
            events.push_back("ccs0-busy");
            return ZE_RESULT_SUCCESS;
        }
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    ze_result_t readDirectorySuccessForMultiDevice(const std::string directory, std::vector<std::string> &events) {
        if (directory.compare(eventsDir) == 0) {
            events.push_back("bcs0-busy");
            events.push_back("error-gt0--correctable-eu-grf");
            events.push_back("error-gt0--correctable-eu-ic");
            events.push_back("error-gt0--soc-correctable-hbm-ss0-1");
            events.push_back("error-gt0--soc-correctable-fabric-ss0-0");
            events.push_back("error-gt0--soc-nonfatal-psf-csc-0");
            events.push_back("error-gt0--soc-fatal-hbm-ss1-15");
            events.push_back("error-gt0--soc-fatal-mdfi-east");
            events.push_back("error-gt0--fatal-eu-ic");
            events.push_back("error-gt0--fatal-tlb");
            events.push_back("error-gt0--engine-reset");
            events.push_back("error-gt0--eu-attention");
            events.push_back("error-gt0--fatal-fpu");
            events.push_back("error-gt0--fatal-l3-fabric");
            events.push_back("error--driver-object-migration");
            events.push_back("error-gt0--driver-ggtt");
            events.push_back("error-gt0--driver-rps");
            events.push_back("error-gt1--correctable-sampler");
            events.push_back("error-gt1--soc-correctable-fabric-ss1-0");
            events.push_back("error-gt1--soc-nonfatal-punit");
            events.push_back("error-gt1--soc-fatal-mdfi-west");
            events.push_back("error-gt1--fatal-guc");
            events.push_back("error-gt1--fatal-idi-parity");
            events.push_back("error-gt1--correctable-guc");
            events.push_back("error-gt1--engine-reset");
            events.push_back("error-gt1--eu-attention");
            events.push_back("error-gt1--driver-engine-other");
            return ZE_RESULT_SUCCESS;
        }
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    ze_result_t readDirectoryFailure(const std::string directory, std::vector<std::string> &events) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    // Returns the same config string for every file unless mockReadFileFailure is set.
    ze_result_t read(const std::string file, std::string &config) override {
        if (mockReadFileFailure) {
            return readFileFailure(file, config);
        }
        config = "config=0x0000000000000001";
        return ZE_RESULT_SUCCESS;
    }

    ze_result_t readFileFailure(const std::string, std::string &config) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    // Returns pmuDriverType for every file unless mockReadVal is set.
    ze_result_t read(const std::string file, uint32_t &val) override {
        if (mockReadVal) {
            return readValFailure(file, val);
        }
        val = pmuDriverType;
        return ZE_RESULT_SUCCESS;
    }

    ze_result_t readValFailure(const std::string file, uint32_t &val) {
        val = 0;
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    // Root by default; setting mockRootUser switches to the non-root answer.
    bool isRootUser() override {
        if (mockRootUser) {
            return userIsNotRoot();
        }
        return true;
    }

    bool userIsNotRoot() {
        return false;
    }
};
// Empty FirmwareUtil subclass; serves as the type that Mock<RasFwInterface> specializes.
class RasFwInterface : public FirmwareUtil {};
/**
 * Firmware-utility mock for the RAS tests. Only fwGetMemoryErrorCount() has
 * hand-written behaviour; the remaining FirmwareUtil entry points are generated
 * with the ADDMETHOD_NOBASE helpers below.
 */
template <>
struct Mock<RasFwInterface> : public FirmwareUtil {
    // When true, fwGetMemoryErrorCount() succeeds and reports the hbm*ErrorCount constants.
    bool mockMemorySuccess = false;

    // Declared through the injected-class-name: naming a constructor with a
    // template-id ("Mock<...>()") is no longer valid as of C++20.
    Mock() = default;

    // Writes the canned count for the requested error type; for any other
    // category @p count is left untouched. Always returns ZE_RESULT_SUCCESS.
    ze_result_t mockGetMemoryErrorSuccess(zes_ras_error_type_t category, uint64_t subDeviceCount, uint64_t subDeviceId, uint64_t &count) {
        if (category == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            count = hbmCorrectableErrorCount;
        }
        if (category == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            count = hbmUncorrectableErrorCount;
        }
        return ZE_RESULT_SUCCESS;
    }

    // Reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE unless mockMemorySuccess is set.
    ze_result_t fwGetMemoryErrorCount(zes_ras_error_type_t category, uint32_t subDeviceCount, uint32_t subDeviceId, uint64_t &count) override {
        if (mockMemorySuccess) {
            return mockGetMemoryErrorSuccess(category, subDeviceCount, subDeviceId, count);
        }
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    ADDMETHOD_NOBASE(fwDeviceInit, ze_result_t, ZE_RESULT_SUCCESS, ());
    ADDMETHOD_NOBASE(getFirstDevice, ze_result_t, ZE_RESULT_SUCCESS, (igsc_device_info * info));
    ADDMETHOD_NOBASE(getFwVersion, ze_result_t, ZE_RESULT_SUCCESS, (std::string fwType, std::string &firmwareVersion));
    ADDMETHOD_NOBASE(flashFirmware, ze_result_t, ZE_RESULT_SUCCESS, (std::string fwType, void *pImage, uint32_t size));
    ADDMETHOD_NOBASE(fwIfrApplied, ze_result_t, ZE_RESULT_SUCCESS, (bool &ifrStatus));
    ADDMETHOD_NOBASE(fwSupportedDiagTests, ze_result_t, ZE_RESULT_SUCCESS, (std::vector<std::string> & supportedDiagTests));
    ADDMETHOD_NOBASE(fwRunDiagTests, ze_result_t, ZE_RESULT_SUCCESS, (std::string & osDiagType, zes_diag_result_t *pResult));
    ADDMETHOD_NOBASE_VOIDRETURN(getDeviceSupportedFwTypes, (std::vector<std::string> & fwTypes));
    ADDMETHOD_NOBASE(fwGetEccConfig, ze_result_t, ZE_RESULT_SUCCESS, (uint8_t * currentState, uint8_t *pendingState));
    ADDMETHOD_NOBASE(fwSetEccConfig, ze_result_t, ZE_RESULT_SUCCESS, (uint8_t newState, uint8_t *currentState, uint8_t *pendingState));
};
class PublicLinuxRasImp : public L0::LinuxRasImp {
public:
PublicLinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId) {}
using LinuxRasImp::pFsAccess;
};
} // namespace ult
} // namespace L0

View File

@ -13,7 +13,6 @@ extern bool sysmanUltsEnable;
using ::testing::_;
using ::testing::Matcher;
using ::testing::NiceMock;
namespace L0 {
namespace ult {
@ -21,7 +20,7 @@ namespace ult {
constexpr uint32_t mockHandleCount = 0;
struct SysmanRasFixture : public SysmanDeviceFixture {
protected:
std::unique_ptr<Mock<RasFsAccess>> pFsAccess;
std::unique_ptr<MockRasFsAccess> pFsAccess;
std::vector<ze_device_handle_t> deviceHandles;
FsAccess *pFsAccessOriginal = nullptr;
void SetUp() override {
@ -29,11 +28,10 @@ struct SysmanRasFixture : public SysmanDeviceFixture {
GTEST_SKIP();
}
SysmanDeviceFixture::SetUp();
pFsAccess = std::make_unique<NiceMock<Mock<RasFsAccess>>>();
pFsAccess = std::make_unique<MockRasFsAccess>();
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
ON_CALL(*pFsAccess.get(), isRootUser())
.WillByDefault(::testing::Invoke(pFsAccess.get(), &Mock<RasFsAccess>::userIsRoot));
pFsAccess->mockRootUser = true;
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
uint32_t subDeviceCount = 0;
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
@ -154,8 +152,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRa
}
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) {
ON_CALL(*pFsAccess.get(), isRootUser())
.WillByDefault(::testing::Invoke(pFsAccess.get(), &Mock<RasFsAccess>::userIsNotRoot));
pFsAccess->mockRootUser = false;
RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle());
pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp);

View File

@ -0,0 +1,529 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h"
#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_fabric_prelim.h"
namespace L0 {
namespace ult {
// Number of RAS handles the fabric tests expect zesDeviceEnumRasErrorSets() to report.
constexpr uint32_t mockHandleCount = 2u;
class TestRasFabricFixture : public SysmanDeviceFixture {
protected:
std::unique_ptr<MockRasFabricFsAccess> pFsAccess;
std::unique_ptr<MockRasFabricSysFsAccess> pSysfsAccess;
MemoryManager *pMemoryManagerOriginal = nullptr;
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
FsAccess *pFsAccessOriginal = nullptr;
SysfsAccess *pSysfsAccessOriginal = nullptr;
PmuInterface *pOriginalPmuInterface = nullptr;
FirmwareUtil *pOriginalFwUtilInterface = nullptr;
std::vector<ze_device_handle_t> deviceHandles;
void SetUp() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
SysmanDeviceFixture::SetUp();
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
pMemoryManager = std::make_unique<::testing::NiceMock<MockMemoryManagerInRasSysman>>(*neoDevice->getExecutionEnvironment());
pMemoryManager->localMemorySupported[0] = true;
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
pFsAccess = std::make_unique<MockRasFabricFsAccess>();
pSysfsAccess = std::make_unique<MockRasFabricSysFsAccess>();
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
pOriginalFwUtilInterface = pLinuxSysmanImp->pFwUtilInterface;
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
pLinuxSysmanImp->pPmuInterface = nullptr;
pLinuxSysmanImp->pFwUtilInterface = nullptr;
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
delete handle;
}
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
uint32_t subDeviceCount = 0;
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
if (subDeviceCount == 0) {
deviceHandles.resize(1, device->toHandle());
} else {
deviceHandles.resize(subDeviceCount, nullptr);
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
}
}
void TearDown() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
pLinuxSysmanImp->pFwUtilInterface = pOriginalFwUtilInterface;
SysmanDeviceFixture::TearDown();
}
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
std::vector<zes_ras_handle_t> handles(count, nullptr);
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
return handles;
}
};
// Registers a full set of fabric error nodes (platform "iaf.0" layout, ports 1-8)
// and checks that both RAS handles report the summed counters as compute errors.
TEST_F(TestRasFabricFixture, GivenValidRasFabricNodesThenGetStateIsSuccessful) {
    std::vector<std::string> dirs = {"/mockRealPath/iaf.0",
                                     "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/iaf.0/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.0/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.0/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.0/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.0/sd.0/port.1/link_degrades", 501},
        {"/mockRealPath/iaf.0/sd.0/port.2/link_failures", 601},
        {"/mockRealPath/iaf.0/sd.0/port.2/link_degrades", 701},
        {"/mockRealPath/iaf.0/sd.0/port.3/link_failures", 801},
        {"/mockRealPath/iaf.0/sd.0/port.3/link_degrades", 901},
        {"/mockRealPath/iaf.0/sd.0/port.4/link_failures", 1001},
        {"/mockRealPath/iaf.0/sd.0/port.4/link_degrades", 1101},
        {"/mockRealPath/iaf.0/sd.0/port.5/link_failures", 2101},
        {"/mockRealPath/iaf.0/sd.0/port.5/link_degrades", 3101},
        {"/mockRealPath/iaf.0/sd.0/port.6/link_failures", 4101},
        {"/mockRealPath/iaf.0/sd.0/port.6/link_degrades", 5101},
        {"/mockRealPath/iaf.0/sd.0/port.7/link_failures", 6101},
        {"/mockRealPath/iaf.0/sd.0/port.7/link_degrades", 7101},
        {"/mockRealPath/iaf.0/sd.0/port.8/link_failures", 8101},
        {"/mockRealPath/iaf.0/sd.0/port.8/link_degrades", 9101},
    };
    // pFsAccess already has the mock type, so no cast is needed.
    pFsAccess->setAccessibleDirectories(dirs);
    pFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking for more handles than exist must clamp the count back down.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        EXPECT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            // 27709 = fw_comm_errors (101) + all link_degrades values above.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 27709u);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            // 23710 = sd_failure (201) + fw_error (301) + all link_failures values above.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 23710u);
        }
    }
}
// With real-path resolution failing (and no nodes registered), enumeration
// still succeeds but reports zero RAS handles.
TEST_F(TestRasFabricFixture, GivenInValidRasFabricNodesThenEnumerationDoesNotReturnAnyHandles) {
    pSysfsAccess->mockRealPathStatus = ZE_RESULT_ERROR_UNKNOWN;
    uint32_t count = 0;
    EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS);
    EXPECT_EQ(count, 0u);
}
// Same scenario as the platform-driver test, but using the auxiliary-bus
// layout ("i915.iaf.0") with a single port registered.
TEST_F(TestRasFabricFixture, GivenValidRasFabricAuxiliaryNodesThenGetStateIsSuccessful) {
    std::vector<std::string> dirs = {"/mockRealPath/i915.iaf.0",
                                     "/sys/module/iaf/drivers/auxiliary:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/i915.iaf.0/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/i915.iaf.0/sd.0/sd_failure", 201},
        {"/mockRealPath/i915.iaf.0/sd.0/fw_error", 301},
        {"/mockRealPath/i915.iaf.0/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/i915.iaf.0/sd.0/port.1/link_degrades", 501},
    };
    // pFsAccess already has the mock type, so no cast is needed.
    pFsAccess->setAccessibleDirectories(dirs);
    pFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking for more handles than exist must clamp the count back down.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        EXPECT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            // 602 = fw_comm_errors (101) + port.1 link_degrades (501).
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 602u);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            // 903 = sd_failure (201) + fw_error (301) + port.1 link_failures (401).
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 903u);
        }
    }
}
// Provides only a subset of the per-port counter nodes (port.1 has no link_degrades
// node and port.8 has no link_failures node) and checks that the reported compute
// error totals are reduced by 501 and 8101 respectively.
TEST_F(TestRasFabricFixture, GivenSomeRasFabricNodesThenGetStateIsSuccessful) {
    std::vector<std::string> accessibleDirs = {"/mockRealPath/iaf.31",
                                               "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> counterNodes = {
        {"/mockRealPath/iaf.31/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.31/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.31/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.31/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.31/sd.0/port.2/link_failures", 601},
        {"/mockRealPath/iaf.31/sd.0/port.2/link_degrades", 701},
        {"/mockRealPath/iaf.31/sd.0/port.3/link_failures", 801},
        {"/mockRealPath/iaf.31/sd.0/port.3/link_degrades", 901},
        {"/mockRealPath/iaf.31/sd.0/port.4/link_failures", 1001},
        {"/mockRealPath/iaf.31/sd.0/port.4/link_degrades", 1101},
        {"/mockRealPath/iaf.31/sd.0/port.5/link_failures", 2101},
        {"/mockRealPath/iaf.31/sd.0/port.5/link_degrades", 3101},
        {"/mockRealPath/iaf.31/sd.0/port.6/link_failures", 4101},
        {"/mockRealPath/iaf.31/sd.0/port.6/link_degrades", 5101},
        {"/mockRealPath/iaf.31/sd.0/port.7/link_failures", 6101},
        {"/mockRealPath/iaf.31/sd.0/port.7/link_degrades", 7101},
        {"/mockRealPath/iaf.31/sd.0/port.8/link_degrades", 9101},
    };
    auto *pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(accessibleDirs);
    pMockFsAccess->setAccessibleNodes(counterNodes);

    uint32_t handleCount = 0;
    ze_result_t enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(handleCount, mockHandleCount);

    // Asking for more handles than exist (without an output array) must report the real count.
    uint32_t oversizedCount = handleCount + 1;
    enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(oversizedCount, mockHandleCount);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
        zes_ras_state_t rasState = {};
        zes_ras_properties_t rasProperties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &rasProperties));

        const auto &category = rasState.category;
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (rasProperties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 27709u - 501u);
        } else if (rasProperties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 23710u - 8101u);
        }
    }
}
// Reads the state once without clearing, then doubles every mocked counter value
// and reads again: the second read must report twice the original totals.
TEST_F(TestRasFabricFixture, GivenValidRasFabricNodesWhenGetStateIsCalledTwiceThenRasErrorCountIsDoubled) {
    std::vector<std::string> accessibleDirs = {"/mockRealPath/iaf.27",
                                               "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> firstReadNodes = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501},
    };
    std::map<std::string, uint64_t> secondReadNodes = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101 * 2},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201 * 2},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501 * 2},
    };
    auto *pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(accessibleDirs);
    pMockFsAccess->setAccessibleNodes(firstReadNodes);

    uint32_t handleCount = 0;
    ze_result_t enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(handleCount, mockHandleCount);

    // Asking for more handles than exist (without an output array) must report the real count.
    uint32_t oversizedCount = handleCount + 1;
    enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(oversizedCount, mockHandleCount);

    auto rasHandles = getRasHandles(mockHandleCount);

    // First read, no clear requested.
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
        zes_ras_state_t firstState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &firstState));
    }

    // Second read against the doubled counters.
    pMockFsAccess->setAccessibleNodes(secondReadNodes);
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
        zes_ras_state_t rasState = {};
        zes_ras_properties_t rasProperties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &rasProperties));

        const auto &category = rasState.category;
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (rasProperties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 602u * 2);
        } else if (rasProperties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 903u * 2);
        }
    }
}
// Reads the state once with the clear flag set, then doubles every mocked counter
// value and reads again without clear: the second read must report only the
// increase since the clearing read (602 correctable / 903 uncorrectable).
TEST_F(TestRasFabricFixture, GivenValidRasFabricNodesWhenGetStateIsCalledTwiceWithClearThenNewRasErrorCountIsRetrieved) {
    std::vector<std::string> accessibleDirs = {"/mockRealPath/iaf.27",
                                               "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> firstReadNodes = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501},
    };
    std::map<std::string, uint64_t> secondReadNodes = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101 * 2},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201 * 2},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501 * 2},
    };
    auto *pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(accessibleDirs);
    pMockFsAccess->setAccessibleNodes(firstReadNodes);

    uint32_t handleCount = 0;
    ze_result_t enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(handleCount, mockHandleCount);

    // Asking for more handles than exist (without an output array) must report the real count.
    uint32_t oversizedCount = handleCount + 1;
    enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(oversizedCount, mockHandleCount);

    auto rasHandles = getRasHandles(mockHandleCount);

    // First read with clear = 1.
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
        zes_ras_state_t clearedState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 1, &clearedState));
    }

    // Second read against the doubled counters, without clear.
    pMockFsAccess->setAccessibleNodes(secondReadNodes);
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
        zes_ras_state_t rasState = {};
        zes_ras_properties_t rasProperties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &rasProperties));

        const auto &category = rasState.category;
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (rasProperties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 602u);
        } else if (rasProperties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 903u);
        }
    }
}
// Test fixture built on MultiDeviceFixture that gives every device in the driver
// handle its own SysmanDeviceImp whose Linux sysfs/procfs/fs accessors are replaced
// by RAS-fabric mocks. Tests are skipped when sysmanUltsEnable is false.
class SysmanRasFabricMultiDeviceFixture : public MultiDeviceFixture, public ::testing::Test {
public:
// Creates the devices, then for each device installs a mock DRM driver model and a
// fresh sysman handle, initializes it and swaps its OS accessors for mocks.
void SetUp() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
MultiDeviceFixture::setUp();
for (auto &device : driverHandle->devices) {
auto neoDevice = device->getNEODevice();
// Replace the OS interface and give it a SysmanMockDrm driver model.
neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]->osInterface = std::make_unique<NEO::OSInterface>();
auto &osInterface = device->getOsInterface();
osInterface.setDriverModel(std::make_unique<SysmanMockDrm>(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment())));
setenv("ZES_ENABLE_SYSMAN", "1", 1);
// Drop whatever sysman handle the device already has and create a new one.
delete device->getSysmanHandle();
device->setSysmanHandle(new SysmanDeviceImp(device->toHandle()));
auto pSysmanDevice = device->getSysmanHandle();
// Sub-devices share the root device's sysman handle.
for (auto &subDevice : static_cast<DeviceImp *>(device)->subDevices) {
static_cast<DeviceImp *>(subDevice)->setSysmanHandle(pSysmanDevice);
}
auto pSysmanDeviceImp = static_cast<SysmanDeviceImp *>(pSysmanDevice);
auto pOsSysman = pSysmanDeviceImp->pOsSysman;
auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
pSysmanDeviceImp->init();
// After init(), delete the accessor objects currently held and install mocks instead;
// the firmware utility interface is left null.
delete pLinuxSysmanImp->pFwUtilInterface;
delete pLinuxSysmanImp->pSysfsAccess;
delete pLinuxSysmanImp->pProcfsAccess;
delete pLinuxSysmanImp->pFsAccess;
auto pProcfsAccess = new NiceMock<Mock<LinuxProcfsAccess>>();
auto pFsAccess = new MockRasFabricFsAccess();
auto pSysfsAccess = new MockRasFabricSysFsAccess();
pLinuxSysmanImp->pFwUtilInterface = nullptr;
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess;
pLinuxSysmanImp->pProcfsAccess = pProcfsAccess;
pLinuxSysmanImp->pFsAccess = pFsAccess;
}
}
// Deletes the mocks installed by SetUp, nulls the accessor pointers, destroys each
// device's sysman handle, unsets ZES_ENABLE_SYSMAN and tears down the base fixture.
void TearDown() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
for (auto &device : driverHandle->devices) {
auto pSysmanDevice = device->getSysmanHandle();
auto pSysmanDeviceImp = static_cast<SysmanDeviceImp *>(pSysmanDevice);
auto pOsSysman = pSysmanDeviceImp->pOsSysman;
auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
delete pLinuxSysmanImp->pSysfsAccess;
delete pLinuxSysmanImp->pProcfsAccess;
delete pLinuxSysmanImp->pFsAccess;
// Null the pointers before deleting the sysman device.
// NOTE(review): presumably this prevents a second delete in its teardown — confirm.
pLinuxSysmanImp->pFwUtilInterface = nullptr;
pLinuxSysmanImp->pSysfsAccess = nullptr;
pLinuxSysmanImp->pProcfsAccess = nullptr;
pLinuxSysmanImp->pFsAccess = nullptr;
delete pSysmanDevice;
// NOTE(review): only the root device's handle is reset here; the sub-devices that
// were given the same pointer in SetUp keep it — confirm nothing dereferences it later.
device->setSysmanHandle(nullptr);
}
unsetenv("ZES_ENABLE_SYSMAN");
MultiDeviceFixture::tearDown();
}
};
// Gives two root devices different mocked fabric counter values (per sub-device)
// and verifies that every RAS handle reports the totals belonging to its own
// device / sub-device pair.
TEST_F(SysmanRasFabricMultiDeviceFixture, GivenValidRasFabricNodesForMultipleDevicesThenGetStateReturnsErrorCountSpecificToEachOfDevice) {
    // Number of root devices and of sub-devices per root device covered by the mocked nodes.
    const uint32_t testUseRootDeviceCount = 2u;
    const uint32_t testUseSubDeviceCount = 2u;
    ASSERT_GE(numRootDevices, testUseRootDeviceCount);
    ASSERT_GE(numSubDevices, testUseSubDeviceCount);

    std::vector<std::string> dirs = {"/mockRealPath/iaf.27",
                                     "/sys/module/iaf/drivers/platform:iaf/"};
    {
        // Root device 0: every counter is 1 on sd.0 and 2 on sd.1.
        std::map<std::string, uint64_t> nodes = {
            {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 1},
            {"/mockRealPath/iaf.27/sd.0/sd_failure", 1},
            {"/mockRealPath/iaf.27/sd.0/fw_error", 1},
            {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 1},
            {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 1},
            {"/mockRealPath/iaf.27/sd.1/fw_comm_errors", 2},
            {"/mockRealPath/iaf.27/sd.1/sd_failure", 2},
            {"/mockRealPath/iaf.27/sd.1/fw_error", 2},
            {"/mockRealPath/iaf.27/sd.1/port.1/link_failures", 2},
            {"/mockRealPath/iaf.27/sd.1/port.1/link_degrades", 2},
        };
        auto pOsSysman = static_cast<SysmanDeviceImp *>(driverHandle->devices[0]->getSysmanHandle())->pOsSysman;
        auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
        auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pLinuxSysmanImp->pFsAccess);
        pMockFsAccess->setAccessibleDirectories(dirs);
        pMockFsAccess->setAccessibleNodes(nodes);
    }
    {
        // Root device 1: every counter is 3 on sd.0 and 4 on sd.1.
        std::map<std::string, uint64_t> nodes = {
            {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 3},
            {"/mockRealPath/iaf.27/sd.0/sd_failure", 3},
            {"/mockRealPath/iaf.27/sd.0/fw_error", 3},
            {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 3},
            {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 3},
            {"/mockRealPath/iaf.27/sd.1/fw_comm_errors", 4},
            {"/mockRealPath/iaf.27/sd.1/sd_failure", 4},
            {"/mockRealPath/iaf.27/sd.1/fw_error", 4},
            {"/mockRealPath/iaf.27/sd.1/port.1/link_failures", 4},
            {"/mockRealPath/iaf.27/sd.1/port.1/link_degrades", 4},
        };
        auto pOsSysman = static_cast<SysmanDeviceImp *>(driverHandle->devices[1]->getSysmanHandle())->pOsSysman;
        auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
        auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pLinuxSysmanImp->pFsAccess);
        pMockFsAccess->setAccessibleDirectories(dirs);
        pMockFsAccess->setAccessibleNodes(nodes);
    }

    // Expected {correctable, uncorrectable} compute error totals,
    // indexed by deviceIndex * testUseSubDeviceCount + subdeviceId.
    const std::vector<std::pair<uint32_t, uint32_t>> errorCounts{
        {2, 3},  // Device 0, subdevice 0
        {4, 6},  // Device 0, subdevice 1
        {6, 9},  // Device 1, subdevice 0
        {8, 12}, // Device 1, subdevice 1
    };

    // Iterate over ROOT devices; the bound is the root-device count, not the sub-device count.
    for (uint32_t deviceIndex = 0; deviceIndex < testUseRootDeviceCount; deviceIndex++) {
        uint32_t count = 0;
        auto hDevice = driverHandle->devices[deviceIndex]->toHandle();
        EXPECT_EQ(zesDeviceEnumRasErrorSets(hDevice, &count, nullptr), ZE_RESULT_SUCCESS);
        EXPECT_GT(count, 0u);
        std::vector<zes_ras_handle_t> handles(count, nullptr);
        EXPECT_EQ(zesDeviceEnumRasErrorSets(hDevice, &count, handles.data()), ZE_RESULT_SUCCESS);
        for (auto handle : handles) {
            zes_ras_state_t state = {};
            zes_ras_properties_t properties = {};
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
            // The fixture may expose more sub-devices than this test mocks nodes for;
            // such handles have no expected row and must not index into errorCounts.
            if (properties.subdeviceId >= testUseSubDeviceCount) {
                continue;
            }
            const auto accessIndex = deviceIndex * testUseSubDeviceCount + properties.subdeviceId;
            if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
                EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], errorCounts[accessIndex].first);
            }
            if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
                EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], errorCounts[accessIndex].second);
            }
        }
    }
}
} // namespace ult
} // namespace L0

View File

@ -0,0 +1,758 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h"
#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h"
// Flag defined in another translation unit; the fixture in this file skips
// SetUp/TearDown when it is false.
extern bool sysmanUltsEnable;
// GoogleMock names pulled into file scope.
using ::testing::_;
using ::testing::DoDefault;
using ::testing::Matcher;
using ::testing::NiceMock;
using ::testing::Return;
// Forward declaration at global scope.
// NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
class OsRas;
namespace L0 {
namespace ult {
// Number of RAS handles the tests below expect zesDeviceEnumRasErrorSets to report for the device.
constexpr uint32_t mockHandleCount = 2u;
// NOTE(review): presumably the expected handle count when sub-devices are present;
// it is not used in the visible part of this file — confirm against the later tests.
constexpr uint32_t mockHandleCountForSubDevice = 4u;
// Fixture for the RAS tests: on top of SysmanDeviceFixture it swaps the memory
// manager, fs/sysfs accessors, PMU interface and firmware utility interface for
// mocks, discards any RAS handles that already exist, and records the device
// handles used by tests to re-initialize the RAS handle context.
struct SysmanRasFixture : public SysmanDeviceFixture {
protected:
// Mocks owned by the fixture and installed into pLinuxSysmanImp during SetUp.
std::unique_ptr<Mock<RasFsAccess>> pFsAccess;
std::unique_ptr<Mock<RasSysfsAccess>> pSysfsAccess;
std::unique_ptr<Mock<MockPmuInterfaceImpForRas>> pPmuInterface;
std::unique_ptr<Mock<RasFwInterface>> pRasFwUtilInterface;
MemoryManager *pMemoryManagerOriginal = nullptr;
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
// Original (non-owned) pointers saved in SetUp and restored in TearDown.
FsAccess *pFsAccessOriginal = nullptr;
SysfsAccess *pSysfsAccessOriginal = nullptr;
PmuInterface *pOriginalPmuInterface = nullptr;
FirmwareUtil *pFwUtilOriginal = nullptr;
// Sub-device handles if the device has sub-devices, otherwise just the device handle.
std::vector<ze_device_handle_t> deviceHandles;
// Installs all mocks, clears the existing RAS handle list and fills deviceHandles.
void SetUp() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
SysmanDeviceFixture::SetUp();
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
pMemoryManager = std::make_unique<::testing::NiceMock<MockMemoryManagerInRasSysman>>(*neoDevice->getExecutionEnvironment());
pMemoryManager->localMemorySupported[0] = true;
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
pFsAccess = std::make_unique<NiceMock<Mock<RasFsAccess>>>();
pSysfsAccess = std::make_unique<NiceMock<Mock<RasSysfsAccess>>>();
pRasFwUtilInterface = std::make_unique<NiceMock<Mock<RasFwInterface>>>();
// Remember the originals so TearDown can put them back.
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
// The PMU mock is constructed from pLinuxSysmanImp after the accessor mocks are installed.
pPmuInterface = std::make_unique<NiceMock<Mock<MockPmuInterfaceImpForRas>>>(pLinuxSysmanImp);
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
// Destroy and forget the RAS handles currently in the handle context's list.
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
delete handle;
}
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
uint32_t subDeviceCount = 0;
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
if (subDeviceCount == 0) {
deviceHandles.resize(1, device->toHandle());
} else {
deviceHandles.resize(subDeviceCount, nullptr);
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
}
}
// Restores the original memory manager and accessor pointers, then tears down the base fixture.
void TearDown() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
SysmanDeviceFixture::TearDown();
}
// Enumerates `count` RAS handles for the device, expecting the call to succeed.
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
std::vector<zes_ras_handle_t> handles(count, nullptr);
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
return handles;
}
};
// Enumeration reports mockHandleCount handles, reports the same number when asked
// with a larger count, and returns only non-null handles.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) {
    uint32_t handleCount = 0;
    ze_result_t enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(handleCount, mockHandleCount);

    uint32_t oversizedCount = handleCount + 1;
    enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(oversizedCount, mockHandleCount);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
    }
}
// The first enumerated handle must be the correctable error set and every later
// one the uncorrectable set; none of them is tied to a sub-device.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    auto rasHandles = getRasHandles(mockHandleCount);
    bool expectCorrectable = true;
    for (auto rasHandle : rasHandles) {
        zes_ras_properties_t rasProperties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &rasProperties));
        EXPECT_EQ(rasProperties.pNext, nullptr);
        EXPECT_EQ(rasProperties.onSubdevice, false);
        EXPECT_EQ(rasProperties.subdeviceId, 0u);

        const auto expectedType = expectCorrectable ? ZES_RAS_ERROR_TYPE_CORRECTABLE
                                                    : ZES_RAS_ERROR_TYPE_UNCORRECTABLE;
        EXPECT_EQ(rasProperties.type, expectedType);
        expectCorrectable = false;
    }
}
// With the sysfs mock's mockReadSymLinkResult flag set, the GT RAS source must
// report no supported error types.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) {
    pSysfsAccess->mockReadSymLinkResult = true;

    std::set<zes_ras_error_type_t> supportedTypes;
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device->toHandle());
    EXPECT_EQ(supportedTypes.size(), 0u);
}
// With the fs mock's mockReadDirectoryFailure flag set, the GT RAS source must
// report no supported error types.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) {
    pFsAccess->mockReadDirectoryFailure = true;

    std::set<zes_ras_error_type_t> supportedTypes;
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);
    EXPECT_EQ(supportedTypes.size(), 0u);
}
// Without a firmware utility interface the HBM RAS source must report no
// supported error types.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) {
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    std::set<zes_ras_error_type_t> supportedTypes;
    LinuxRasSourceHbm::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);
    EXPECT_EQ(supportedTypes.size(), 0u);
}
// When the fs mock is told to list a directory without RAS events and there is
// no firmware interface, re-initializing the handle context must create no handles.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) {
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    uint32_t handleCount = 0;
    ze_result_t enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(handleCount, 0u);

    uint32_t oversizedCount = handleCount + 1;
    enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, enumResult);
    EXPECT_EQ(oversizedCount, 0u);
}
// With the PMU mock's mockPmuReadCorrectable flag set, checks the per-category
// counts reported for the correctable handle (first) and the uncorrectable handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadCorrectable = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    bool expectCorrectable = true;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        const auto &category = rasState.category;

        if (!expectCorrectable) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalEuErrorCount + fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], initialUncorrectableComputeErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socNonFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            continue;
        }

        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableCacheErrors);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableFabricSs0_0Count + initialCorrectableNonComputeErrors);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        expectCorrectable = false;
    }
}
// With the PMU mock's mockPmuReadAfterClear flag set: a read without clear reports
// the mocked per-category counts, and a following read with clear = 1 reports zero
// in every category for both handles.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterClearThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadAfterClear = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);

    // Pass 1: no clear requested; first handle is the correctable set.
    ze_bool_t clearCounters = 0;
    bool expectCorrectable = true;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clearCounters, &rasState));
        const auto &category = rasState.category;

        if (!expectCorrectable) {
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalEuErrorCount + fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], initialUncorrectableComputeErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socNonFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            continue;
        }

        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableCacheErrors);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableFabricSs0_0Count + initialCorrectableNonComputeErrors);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        expectCorrectable = false;
    }

    // Pass 2: clear requested; the expectations are identical for both handle types.
    clearCounters = 1;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clearCounters, &rasState));
        const auto &category = rasState.category;
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
    }
}
// With mockPmuReadResult set on the PMU mock and mockMemorySuccess set on the
// firmware mock, the non-compute category must carry the HBM error counts:
// correctable for the first handle, uncorrectable afterwards.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    bool expectCorrectable = true;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        if (!expectCorrectable) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
            continue;
        }
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        expectCorrectable = false;
    }
}
// Same mock setup as the HBM state test: a read without clear reports the HBM
// counts, and a following read with clear = 1 reports zero non-compute errors
// for both handles.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithClearThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);

    // Pass 1: no clear requested; first handle is the correctable set.
    ze_bool_t clearCounters = 0;
    bool expectCorrectable = true;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clearCounters, &rasState));
        if (!expectCorrectable) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
            continue;
        }
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        expectCorrectable = false;
    }

    // Pass 2: clear requested; the expectation is identical for both handle types.
    clearCounters = 1;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clearCounters, &rasState));
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
    }
}
// Requesting a clear after setting the fs mock's mockRootUser flag must be
// rejected with INSUFFICIENT_PERMISSIONS for every handle.
// NOTE(review): despite its name, setting mockRootUser appears to make the mock
// report a non-root user — confirm against mock_fs_ras_prelim.h.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) {
    pFsAccess->mockRootUser = true;

    const ze_bool_t clearCounters = 1;
    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, clearCounters, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, stateResult);
    }
}
// With the fs mock's mockReadFileFailure flag set before the handles are rebuilt,
// zesRasGetState must return UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockReadFileFailure = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
// With the PMU mock's mockPerfEvent flag set before the handles are rebuilt,
// zesRasGetState must return UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPerfEvent = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
// With the PMU mock's mockPmuReadResult flag set (handles are not rebuilt),
// zesRasGetState without clear must return UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPmuReadResult = true;

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
// With the PMU mock's mockPmuReadResult flag set (handles are not rebuilt),
// zesRasGetState with clear = 1 must return UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPmuReadResult = true;

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 1, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
// With the fs mock's mockReadVal flag set before the handles are rebuilt,
// zesRasGetState must return UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockReadVal = true;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
// With the fs mock's mockReadVal flag set and no firmware utility interface when
// the handles are rebuilt, zesRasGetState must return UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) {
    pFsAccess->mockReadVal = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Rebuild the RAS handle list under the mocked conditions.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (auto *pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        const ze_result_t stateResult = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, stateResult);
    }
}
// A configuration written with zesRasSetConfig must be returned unchanged by
// zesRasGetConfig (total threshold and the detailed per-category thresholds).
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) {
    auto rasHandles = getRasHandles(mockHandleCount);
    for (auto rasHandle : rasHandles) {
        zes_ras_config_t writtenConfig = {};
        writtenConfig.totalThreshold = 50;
        // Fill every byte of the detailed thresholds with 1 so the comparison is not against all-zero data.
        std::memset(writtenConfig.detailedThresholds.category, 1, sizeof(writtenConfig.detailedThresholds.category));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(rasHandle, &writtenConfig));

        zes_ras_config_t readBackConfig = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(rasHandle, &readBackConfig));
        EXPECT_EQ(writtenConfig.totalThreshold, readBackConfig.totalThreshold);
        EXPECT_EQ(0, std::memcmp(writtenConfig.detailedThresholds.category,
                                 readBackConfig.detailedThresholds.category,
                                 sizeof(writtenConfig.detailedThresholds.category)));
    }
}
// Verifies that zesRasSetConfig is rejected with
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS when the caller is not root.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) {
    // Model an unprivileged caller. The flag has to be cleared explicitly:
    // leaving it set describes a root user, which contradicts the
    // "WithoutPermission" scenario this test is named for.
    // NOTE(review): assumes the fs mock's isRootUser() returns mockRootUser — confirm against the mock header.
    pFsAccess->mockRootUser = false;

    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        zes_ras_config_t setConfig = {};
        setConfig.totalThreshold = 50;
        memset(setConfig.detailedThresholds.category, 1, sizeof(setConfig.detailedThresholds.category));
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasSetConfig(handle, &setConfig));
    }
}
// Configures the sysfs mock's symlink-read status to
// ZE_RESULT_ERROR_NOT_AVAILABLE, rebuilds the RAS handles under that
// condition, and expects every state query to report
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    for (const auto &rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
// Sets the sysfs mock's symlink-read status to ZE_RESULT_ERROR_NOT_AVAILABLE,
// re-initialises the RAS handles and expects every state query to report
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
// NOTE(review): the setup here is the same as in the "...FailsDuringInit..."
// test; confirm that this really exercises the failure inside the event-open
// path that the test name refers to.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    for (const auto &rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
// Configures the fs mock's directory-listing status to
// ZE_RESULT_ERROR_NOT_AVAILABLE, rebuilds the RAS handles under that
// condition, and expects every state query to report
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    for (const auto &rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasGetState(rasHandle, 0, &rasState));
    }
}
// Checks that the number of enumerated RAS error sets equals mockHandleCount
// both before and after the handle list is emptied and the sysman device
// resources are re-initialised.
TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) {
    uint32_t enumeratedCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &enumeratedCount, nullptr));
    EXPECT_EQ(enumeratedCount, mockHandleCount);

    // Throw away the existing handles, then let re-initialisation recreate them.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();
    pLinuxSysmanImp->reInitSysmanDeviceResources();

    enumeratedCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &enumeratedCount, nullptr));
    EXPECT_EQ(enumeratedCount, mockHandleCount);
}
// Fixture for multi-device RAS tests. SetUp() replaces the memory manager and
// the fs / sysfs / PMU / firmware-utility interfaces of the Linux sysman
// implementation with mocks, empties the RAS handle list and collects the
// device handles; TearDown() puts the original objects back.
struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture {
protected:
// Mocks owned by the fixture for the duration of a test.
std::unique_ptr<Mock<RasFsAccess>> pFsAccess;
std::unique_ptr<Mock<RasSysfsAccess>> pSysfsAccess;
std::unique_ptr<Mock<MockPmuInterfaceImpForRas>> pPmuInterface;
MemoryManager *pMemoryManagerOriginal = nullptr;
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
std::unique_ptr<Mock<RasFwInterface>> pRasFwUtilInterface;
// Non-owning copies of the original pointers, saved in SetUp() and restored in TearDown().
FsAccess *pFsAccessOriginal = nullptr;
SysfsAccess *pSysfsAccessOriginal = nullptr;
PmuInterface *pOriginalPmuInterface = nullptr;
FirmwareUtil *pFwUtilOriginal = nullptr;
// Sub-device handles of the device, or the device handle itself when it reports no sub-devices.
std::vector<ze_device_handle_t> deviceHandles;
// Installs the mocks and prepares deviceHandles; skipped when sysman ULTs are disabled.
void SetUp() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
SysmanMultiDeviceFixture::SetUp();
// Swap in a mock memory manager that reports local memory as supported for index 0.
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
pMemoryManager = std::make_unique<::testing::NiceMock<MockMemoryManagerInRasSysman>>(*neoDevice->getExecutionEnvironment());
pMemoryManager->localMemorySupported[0] = true;
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
pFsAccess = std::make_unique<NiceMock<Mock<RasFsAccess>>>();
pSysfsAccess = std::make_unique<NiceMock<Mock<RasSysfsAccess>>>();
pRasFwUtilInterface = std::make_unique<NiceMock<Mock<RasFwInterface>>>();
// Remember the original interfaces before overwriting them.
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
// The PMU mock is built from pLinuxSysmanImp only after the fs/sysfs/firmware mocks are installed.
pPmuInterface = std::make_unique<NiceMock<Mock<MockPmuInterfaceImpForRas>>>(pLinuxSysmanImp);
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
pFsAccess->mockReadDirectoryForMultiDevice = true;
// Delete any RAS handles that already exist and leave the list empty.
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
delete handle;
}
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
// Query the sub-device count first, then fetch the handles (or fall back to the device itself).
uint32_t subDeviceCount = 0;
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
if (subDeviceCount == 0) {
deviceHandles.resize(1, device->toHandle());
} else {
deviceHandles.resize(subDeviceCount, nullptr);
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
}
}
// Restores the original memory manager and interfaces before the base fixture tears down.
void TearDown() override {
if (!sysmanUltsEnable) {
GTEST_SKIP();
}
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
SysmanMultiDeviceFixture::TearDown();
}
// Enumerates RAS error sets into a vector of `count` entries (initialised to nullptr),
// expecting the enumeration call to succeed, and returns that vector.
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
std::vector<zes_ras_handle_t> handles(count, nullptr);
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
return handles;
}
};
// Builds a stand-alone RasHandleContext on top of the plain multi-device
// fixture (none of the RAS mocks are installed here) and checks that asking
// for the handle count succeeds.
TEST_F(SysmanMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) {
    // Owned by a unique_ptr so the context is released on every exit path.
    auto pRasHandleContext = std::make_unique<RasHandleContext>(pSysmanDeviceImp->pOsSysman);

    uint32_t count = 0;
    ze_result_t result = pRasHandleContext->rasGet(&count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    // NOTE(review): this fixture provides no mocked RAS sources, so presumably
    // no error sets can be discovered and the count stays at zero — confirm
    // against RasHandleContext::rasGet.
    EXPECT_EQ((count > 0), false);
}
// Enumerates RAS error sets three ways: with a zero count, with a count one
// larger than what was reported, and with a handle buffer. The reported count
// must equal mockHandleCountForSubDevice and every returned handle must be
// non-null.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) {
    uint32_t reportedCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &reportedCount, nullptr));
    EXPECT_EQ(reportedCount, mockHandleCountForSubDevice);

    // An over-sized request is expected to be trimmed back to the real count.
    uint32_t oversizedCount = reportedCount + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr));
    EXPECT_EQ(oversizedCount, mockHandleCountForSubDevice);

    for (const auto &rasHandle : getRasHandles(mockHandleCountForSubDevice)) {
        EXPECT_NE(rasHandle, nullptr);
    }
}
// For each device handle, creates a correctable-type Linux RAS object from the
// device's sub-device flag and id, and verifies that the properties it
// reports echo those values back.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    for (const auto &currentDevice : deviceHandles) {
        ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
        Device::fromHandle(currentDevice)->getProperties(&deviceProperties);
        bool isSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;

        auto pLinuxRasImp = std::make_unique<PublicLinuxRasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, deviceProperties.subdeviceId);

        zes_ras_properties_t rasProperties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(rasProperties));
        EXPECT_EQ(rasProperties.subdeviceId, deviceProperties.subdeviceId);
        EXPECT_EQ(rasProperties.onSubdevice, isSubDevice);
        EXPECT_EQ(rasProperties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
    }
}
// Rebuilds the RAS handles with the PMU mock's per-tile read mode enabled and
// checks the per-category error counters reported for each of the first four
// handles against the expected mock totals.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadTile = true;

    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    const auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    for (uint32_t handleIndex = 0u; handleIndex < rasHandles.size(); ++handleIndex) {
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandles[handleIndex], 0, &state));

        switch (handleIndex) {
        case 0u: {
            // No. of correctable error type for subdevice 0
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableGrfErrorCountTile0 + correctableEuErrorCountTile0 + initialCorrectableCacheErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableHbmSs0_1CountTile0 + initialCorrectableNonComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        }
        case 1u: {
            // No. of uncorrectable error type for subdevice 0
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalEuErrorCountTile0 + initialUncorrectableCacheErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile0 + initialEngineReset);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile0 + initialProgrammingErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalFpuTile0 + FatalL3FabricTile0 + initialUncorrectableComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socNonFatalPsfCsc0CountTile0 + socFatalHbmSs1_15CountTile0 + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            break;
        }
        case 2u: {
            // No. of correctable error type for subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableSamplerErrorCountTile1 + correctableGucErrorCountTile1 + initialCorrectableCacheErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableFabricSs1_0CountTile1 + initialCorrectableNonComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        }
        case 3u: {
            // No. of uncorrectable error type for subdevice 1
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalGucErrorCountTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socNonFatalPunitCountTile1 + initialUncorrectableNonComputeErrorsTile1);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1);
            break;
        }
        default:
            // Handles past the fourth one only have the success check above.
            break;
        }
    }
}
// Rebuilds the RAS handles with the PMU mock's mockPmuReadResult flag and the
// firmware mock's mockMemorySuccess flag raised, then checks the non-compute
// error counter of the first four handles against the expected HBM counts.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &staleHandle : rasHandleList) {
        delete staleHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    const auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    for (uint32_t handleIndex = 0u; handleIndex < rasHandles.size(); ++handleIndex) {
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandles[handleIndex], 0, &state));

        switch (handleIndex) {
        case 0u: // No. of correctable error type for subdevice 0
        case 2u: // No. of correctable error type for subdevice 1
        {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
            break;
        }
        case 1u: // No. of uncorrectable error type for subdevice 0
        case 3u: // No. of uncorrectable error type for subdevice 1
        {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
            break;
        }
        default:
            // Handles past the fourth one only have the success check above.
            break;
        }
    }
}
// Multi-device RAS fixture variant that sets the ZE_AFFINITY_MASK debug flag
// to "0.1" before running the base fixture's SetUp().
class SysmanRasAffinityMaskFixture : public SysmanRasMultiDeviceFixture {
    // Held as a member for the lifetime of the fixture.
    // NOTE(review): presumably restores the debug flags when destroyed — confirm.
    DebugManagerStateRestore restorer;

    void SetUp() override {
        if (!sysmanUltsEnable) {
            GTEST_SKIP();
        }
        // The mask has to be in place before the base fixture sets up the devices.
        NEO::DebugManager.flags.ZE_AFFINITY_MASK.set("0.1");
        SysmanRasMultiDeviceFixture::SetUp();
    }

    void TearDown() override {
        if (!sysmanUltsEnable) {
            GTEST_SKIP();
        }
        SysmanRasMultiDeviceFixture::TearDown();
    }
};
// With the affinity mask applied by the fixture, the enumerated handle count
// must equal mockHandleCount and every handle must describe sub-device 1; the
// first handle is expected to be the correctable set and the second the
// uncorrectable set.
TEST_F(SysmanRasAffinityMaskFixture, GivenAffinityMaskIsSetWhenCallingRasPropertiesThenPropertiesAreReturnedForTheSubDevicesAccordingToAffinityMask) {
    uint32_t enumeratedCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &enumeratedCount, nullptr));
    EXPECT_EQ(enumeratedCount, mockHandleCount);

    const auto rasHandles = getRasHandles(mockHandleCount);
    for (uint32_t handleIndex = 0u; handleIndex < rasHandles.size(); ++handleIndex) {
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandles[handleIndex], &properties));
        EXPECT_EQ(properties.pNext, nullptr);
        EXPECT_EQ(properties.onSubdevice, true);
        EXPECT_EQ(properties.subdeviceId, 1u); // Affinity mask 0.1 is set which means only subdevice 1 is exposed

        switch (handleIndex) {
        case 0u: {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
            break;
        }
        case 1u: {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
            break;
        }
        default:
            // No type expectation for any further handles.
            break;
        }
    }
}
} // namespace ult
} // namespace L0