Added RAS ULTs
Related-To: LOCI-3759
Signed-off-by: Bari, Pratik <pratik.bari@intel.com>
This commit is contained in:
parent
a60b5898dc
commit
072963d0f7
|
@ -6,12 +6,17 @@
|
|||
|
||||
set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
|
||||
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_ras.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_fs_ras.h
|
||||
)
|
||||
|
||||
if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL ""))
|
||||
list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
if(NEO_ENABLE_i915_PRELIM_DETECTION)
|
||||
list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_fabric_prelim.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_prelim.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_fabric_prelim.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras_prelim.h
|
||||
)
|
||||
else()
|
||||
list(APPEND L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h
|
||||
)
|
||||
|
@ -23,5 +28,4 @@ if(UNIX)
|
|||
${L0_TESTS_TOOLS_SYSMAN_RAS_LINUX}
|
||||
)
|
||||
endif()
|
||||
add_subdirectories()
|
||||
|
||||
|
|
|
@ -16,16 +16,13 @@ namespace L0 {
|
|||
namespace ult {
|
||||
|
||||
class RasFsAccess : public FsAccess {};
|
||||
template <>
|
||||
struct Mock<RasFsAccess> : public RasFsAccess {
|
||||
MOCK_METHOD(bool, isRootUser, (), (override));
|
||||
bool userIsRoot() {
|
||||
return true;
|
||||
|
||||
struct MockRasFsAccess : public RasFsAccess {
|
||||
bool mockRootUser = true;
|
||||
bool isRootUser() override {
|
||||
return mockRootUser;
|
||||
}
|
||||
bool userIsNotRoot() {
|
||||
return false;
|
||||
}
|
||||
Mock<RasFsAccess>() = default;
|
||||
MockRasFsAccess() = default;
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h"
|
||||
#include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
|
||||
#include "sysman/linux/fs_access.h"
|
||||
#include "sysman/linux/os_sysman_imp.h"
|
||||
#include "sysman/ras/ras.h"
|
||||
#include "sysman/ras/ras_imp.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
class MockRasFabricFsAccess : public FsAccess {
|
||||
public:
|
||||
ze_result_t canRead(const std::string file) override {
|
||||
if (accessibleNodes.find(file) != accessibleNodes.end()) {
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
~MockRasFabricFsAccess() override = default;
|
||||
|
||||
bool isRootUser() override {
|
||||
return true;
|
||||
}
|
||||
|
||||
ze_result_t read(const std::string file, uint64_t &val) override {
|
||||
if (canRead(file) == ZE_RESULT_SUCCESS) {
|
||||
val = accessibleNodes[file];
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
void setAccessibleNodes(std::map<std::string, uint64_t> &nodes) {
|
||||
accessibleNodes = nodes;
|
||||
}
|
||||
|
||||
void setAccessibleDirectories(std::vector<std::string> &dirs) {
|
||||
accessibleDirectories = dirs;
|
||||
}
|
||||
|
||||
bool directoryExists(const std::string path) override {
|
||||
if (std::find(accessibleDirectories.begin(), accessibleDirectories.end(), path) != accessibleDirectories.end()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::map<std::string, uint64_t> accessibleNodes = {};
|
||||
std::vector<std::string> accessibleDirectories = {};
|
||||
};
|
||||
|
||||
class MockRasFabricSysFsAccess : public SysfsAccess {
|
||||
public:
|
||||
ze_result_t readSymLink(const std::string path, std::string &buf) override {
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
ze_result_t getRealPath(const std::string path, std::string &buf) override {
|
||||
buf.append("/mockRealPath");
|
||||
return mockRealPathStatus;
|
||||
}
|
||||
|
||||
ze_result_t mockRealPathStatus = ZE_RESULT_SUCCESS;
|
||||
};
|
||||
|
||||
struct MockMemoryManagerInRasSysman : public MemoryManagerMock {
|
||||
MockMemoryManagerInRasSysman(NEO::ExecutionEnvironment &executionEnvironment) : MemoryManagerMock(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment)) {}
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
|
@ -0,0 +1,572 @@
|
|||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h"
|
||||
#include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
|
||||
#include "sysman/linux/fs_access.h"
|
||||
#include "sysman/linux/os_sysman_imp.h"
|
||||
#include "sysman/ras/ras.h"
|
||||
#include "sysman/ras/ras_imp.h"
|
||||
|
||||
using namespace NEO;
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
// Paths the sysfs / fs mocks recognise.
const std::string deviceDir = "device";
const std::string eventsDir = "/sys/devices/i915_0000_03_00.0/events";

// File descriptor reported by the mocked perfEventOpen().
constexpr int64_t mockPmuFd{10};

// Counter values served by the mocked PMU reads (device level).
constexpr uint64_t correctableGrfErrorCount{100u};
constexpr uint64_t correctableEuErrorCount{75u};
constexpr uint64_t fatalEuErrorCount{50u};
constexpr uint64_t fatalTlb{3u};
constexpr uint64_t fatalEngineResetCount{45u};

// Counter values served by the mocked PMU reads (tile 0).
constexpr uint64_t correctableGrfErrorCountTile0{90u};
constexpr uint64_t correctableEuErrorCountTile0{70u};
constexpr uint64_t fatalEuErrorCountTile0{55u};
constexpr uint64_t fatalEngineResetCountTile0{72u};

// Counter values served by the mocked PMU reads (tile 1).
constexpr uint64_t correctableSamplerErrorCountTile1{30u};
constexpr uint64_t fatalGucErrorCountTile1{40u};
constexpr uint64_t fatalIdiParityErrorCountTile1{60u};
constexpr uint64_t correctableGucErrorCountTile1{25u};
constexpr uint64_t fatalEngineResetCountTile1{85u};

// SoC counter values served by the mocked PMU reads.
constexpr uint64_t socCorrectableFabricSs0_0Count{2u};
constexpr uint64_t socFatalMdfiEastCount{3u};
constexpr uint64_t socNonFatalPsfCsc0Count{5u};
constexpr uint64_t socCorrectableHbmSs0_1CountTile0{6u};
constexpr uint64_t socNonFatalPsfCsc0CountTile0{6u};
constexpr uint64_t socFatalHbmSs1_15CountTile0{7u};
constexpr uint64_t socCorrectableFabricSs1_0CountTile1{8u};
constexpr uint64_t socNonFatalPunitCountTile1{9u};
constexpr uint64_t socFatalMdfiWestCountTile1{0u};

// Remaining compute / attention / driver counter values served by the mocked PMU reads.
constexpr uint64_t fatalFpuTile0{1u};
constexpr uint64_t FatalL3FabricTile0{4u};
constexpr uint64_t euAttention{10u};
constexpr uint64_t euAttentionTile0{5u};
constexpr uint64_t euAttentionTile1{2u};
constexpr uint64_t driverMigration{2u};
constexpr uint64_t driverGgtt{1u};
constexpr uint64_t driverRps{2u};
constexpr uint64_t driverEngineOther{3u};

// NOTE(review): the "initial*" values are not referenced in this header;
// presumably they are the baselines the tests add to the PMU deltas — confirm in the test sources.
constexpr uint64_t initialCorrectableCacheErrors{6u};
constexpr uint64_t initialUncorrectableCacheErrors{7u};
constexpr uint64_t initialEngineReset{2u};
constexpr uint64_t initialProgrammingErrors{7u};
constexpr uint64_t initialCorrectableNonComputeErrors{6u};
constexpr uint64_t initialUncorrectableNonComputeErrors{13u};
constexpr uint64_t initialUncorrectableComputeErrors{5u};
constexpr uint64_t initialUncorrectableDriverErrors{5u};
constexpr uint64_t initialCorrectableCacheErrorsTile1{5u};
constexpr uint64_t initialUncorrectableCacheErrorsTile1{7u};
constexpr uint64_t initialEngineResetTile1{4u};
constexpr uint64_t initialProgrammingErrorsTile1{5u};
constexpr uint64_t initialCorrectableNonComputeErrorsTile1{4u};
constexpr uint64_t initialUncorrectableNonComputeErrorsTile1{5u};
constexpr uint64_t initialUncorrectableDriverErrorsTile1{4u};

// Timestamp written into slot 1 of every successful mocked PMU read.
constexpr uint64_t timeStamp{1000u};

// Value produced by the mocked uint32_t file read.
constexpr uint32_t pmuDriverType{16u};

// Memory error counts reported by the mocked firmware interface.
constexpr uint64_t hbmCorrectableErrorCount{2};
constexpr uint64_t hbmUncorrectableErrorCount{3};
|
||||
|
||||
struct MockMemoryManagerInRasSysman : public MemoryManagerMock {
|
||||
MockMemoryManagerInRasSysman(NEO::ExecutionEnvironment &executionEnvironment) : MemoryManagerMock(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment)) {}
|
||||
};
|
||||
|
||||
class MockPmuInterfaceImpForRas : public PmuInterfaceImp {
|
||||
public:
|
||||
using PmuInterfaceImp::perfEventOpen;
|
||||
MockPmuInterfaceImpForRas(LinuxSysmanImp *pLinuxSysmanImp) : PmuInterfaceImp(pLinuxSysmanImp) {}
|
||||
};
|
||||
template <>
|
||||
struct Mock<MockPmuInterfaceImpForRas> : public MockPmuInterfaceImpForRas {
|
||||
|
||||
int32_t mockPmuReadCount = 0;
|
||||
int32_t mockPmuReadCountAfterClear = 0;
|
||||
int32_t mockPmuReadTileCount = 0;
|
||||
|
||||
bool mockPmuReadCorrectable = false;
|
||||
bool mockPmuReadAfterClear = false;
|
||||
bool mockPmuReadResult = false;
|
||||
bool mockPerfEvent = false;
|
||||
bool mockPmuReadTile = false;
|
||||
|
||||
Mock<MockPmuInterfaceImpForRas>(LinuxSysmanImp *pLinuxSysmanImp) : MockPmuInterfaceImpForRas(pLinuxSysmanImp) {}
|
||||
|
||||
int64_t perfEventOpen(perf_event_attr *attr, pid_t pid, int cpu, int groupFd, uint64_t flags) override {
|
||||
|
||||
if (mockPerfEvent == true) {
|
||||
return mockedPerfEventOpenAndFailureReturn(attr, pid, cpu, groupFd, flags);
|
||||
}
|
||||
|
||||
return mockPmuFd;
|
||||
}
|
||||
|
||||
int64_t mockedPerfEventOpenAndFailureReturn(perf_event_attr *attr, pid_t pid, int cpu, int groupFd, uint64_t flags) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int mockedPmuReadForCorrectableAndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
data[1] = timeStamp;
|
||||
data[2] = socCorrectableFabricSs0_0Count;
|
||||
data[3] = 0;
|
||||
data[4] = correctableGrfErrorCount;
|
||||
data[5] = correctableEuErrorCount;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadForUncorrectableAndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
data[1] = timeStamp;
|
||||
data[2] = fatalEngineResetCount;
|
||||
data[3] = euAttention;
|
||||
data[4] = driverMigration;
|
||||
data[5] = driverGgtt;
|
||||
data[6] = driverRps;
|
||||
data[7] = 0;
|
||||
data[8] = 0;
|
||||
data[9] = socFatalMdfiEastCount;
|
||||
data[10] = socNonFatalPsfCsc0Count;
|
||||
data[11] = 0;
|
||||
data[12] = fatalEuErrorCount;
|
||||
data[13] = fatalTlb;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadForCorrectableTile0AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
data[1] = timeStamp;
|
||||
data[2] = 0;
|
||||
data[3] = socCorrectableHbmSs0_1CountTile0;
|
||||
data[4] = correctableGrfErrorCountTile0;
|
||||
data[5] = correctableEuErrorCountTile0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadForUncorrectableTile0AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
data[1] = timeStamp;
|
||||
data[2] = fatalEngineResetCountTile0;
|
||||
data[3] = euAttentionTile0;
|
||||
data[4] = driverMigration;
|
||||
data[5] = driverGgtt;
|
||||
data[6] = driverRps;
|
||||
data[7] = fatalFpuTile0;
|
||||
data[8] = FatalL3FabricTile0;
|
||||
data[9] = socFatalHbmSs1_15CountTile0;
|
||||
data[10] = 0;
|
||||
data[11] = socNonFatalPsfCsc0CountTile0;
|
||||
data[12] = fatalEuErrorCountTile0;
|
||||
data[13] = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadForCorrectableTile1AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
data[1] = timeStamp;
|
||||
data[2] = socCorrectableFabricSs1_0CountTile1;
|
||||
data[3] = correctableGucErrorCountTile1;
|
||||
data[4] = correctableSamplerErrorCountTile1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadForUncorrectableTile1AndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
data[1] = timeStamp;
|
||||
data[2] = fatalEngineResetCountTile1;
|
||||
data[3] = euAttentionTile1;
|
||||
data[4] = driverMigration;
|
||||
data[5] = driverEngineOther;
|
||||
data[6] = socFatalMdfiWestCountTile1;
|
||||
data[7] = socNonFatalPunitCountTile1;
|
||||
data[8] = fatalGucErrorCountTile1;
|
||||
data[9] = fatalIdiParityErrorCountTile1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadAfterClearAndSuccessReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
memset(data, 0, sizeOfdata);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mockedPmuReadAndFailureReturn(int fd, uint64_t *data, ssize_t sizeOfdata) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int pmuRead(int fd, uint64_t *data, ssize_t sizeOfdata) override {
|
||||
|
||||
if (mockPmuReadResult == true) {
|
||||
return mockedPmuReadAndFailureReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
if (mockPmuReadCorrectable == true) {
|
||||
if (mockPmuReadCount == 0) {
|
||||
mockPmuReadCount++;
|
||||
return mockedPmuReadForCorrectableAndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadCount == 1) {
|
||||
mockPmuReadCount++;
|
||||
return mockedPmuReadForUncorrectableAndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
}
|
||||
|
||||
if (mockPmuReadAfterClear == true) {
|
||||
if (mockPmuReadCountAfterClear == 0) {
|
||||
mockPmuReadCountAfterClear++;
|
||||
return mockedPmuReadForCorrectableAndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadCountAfterClear == 1) {
|
||||
mockPmuReadCountAfterClear++;
|
||||
return mockedPmuReadForUncorrectableAndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else {
|
||||
mockPmuReadCountAfterClear++;
|
||||
return mockedPmuReadAfterClearAndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
}
|
||||
|
||||
if (mockPmuReadTile == true) {
|
||||
if (mockPmuReadTileCount == 0) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForCorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadTileCount == 1) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForUncorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadTileCount == 2) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForCorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadTileCount == 3) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForUncorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Empty FsAccess subclass; it is the type the Mock<RasFsAccess> specialization derives from.
class RasFsAccess : public FsAccess {};
|
||||
// Empty SysfsAccess subclass; it is the type the Mock<RasSysfsAccess> specialization derives from.
class RasSysfsAccess : public SysfsAccess {};
|
||||
template <>
|
||||
struct Mock<RasSysfsAccess> : public RasSysfsAccess {
|
||||
|
||||
ze_result_t mockReadSymLinkStatus = ZE_RESULT_SUCCESS;
|
||||
bool mockReadSymLinkResult = false;
|
||||
|
||||
ze_result_t readSymLink(const std::string file, std::string &val) override {
|
||||
|
||||
if (mockReadSymLinkStatus != ZE_RESULT_SUCCESS) {
|
||||
return mockReadSymLinkStatus;
|
||||
}
|
||||
|
||||
if (mockReadSymLinkResult == true) {
|
||||
return getValStringSymLinkFailure(file, val);
|
||||
}
|
||||
|
||||
if (file.compare(deviceDir) == 0) {
|
||||
val = "/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0";
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t getValStringSymLinkFailure(const std::string file, std::string &val) {
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t read(const std::string file, uint64_t &val) override {
|
||||
if (file.compare("gt/gt0/error_counter/correctable_eu_grf") == 0) {
|
||||
val = 5u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/correctable_eu_ic") == 0) {
|
||||
val = 1u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/fatal_eu_ic") == 0) {
|
||||
val = 5u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/fatal_tlb") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/engine_reset") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/correctable_sampler") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/fatal_guc") == 0) {
|
||||
val = 6u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/fatal_idi_parity") == 0) {
|
||||
val = 1u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/correctable_guc") == 0) {
|
||||
val = 3u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/engine_reset") == 0) {
|
||||
val = 4u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/eu_attention") == 0) {
|
||||
val = 7u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/eu_attention") == 0) {
|
||||
val = 5u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_correctable_fabric_ss0_0") == 0) {
|
||||
val = 1u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_fatal_mdfi_east") == 0) {
|
||||
val = 5u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_nonfatal_psf_csc_0") == 0) {
|
||||
val = 3u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_correctable_hbm_ss0_1") == 0) {
|
||||
val = 5u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_fatal_hbm_ss1_15") == 0) {
|
||||
val = 5u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/soc_correctable_fabric_ss1_0") == 0) {
|
||||
val = 4u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/soc_nonfatal_punit") == 0) {
|
||||
val = 3u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/soc_fatal_mdfi_west") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/fatal_fpu") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/fatal_l3_fabric") == 0) {
|
||||
val = 3u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/driver_ggtt") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/driver_rps") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("error_counter/driver_object_migration") == 0) {
|
||||
val = 1u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt1/error_counter/driver_engine_other") == 0) {
|
||||
val = 3u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Mock<RasFsAccess> : public RasFsAccess {
|
||||
|
||||
ze_result_t mockListDirectoryStatus = ZE_RESULT_SUCCESS;
|
||||
bool mockReadDirectoryFailure = false;
|
||||
bool mockReadFileFailure = false;
|
||||
bool mockReadDirectoryWithoutRasEvents = false;
|
||||
bool mockRootUser = false;
|
||||
bool mockReadVal = false;
|
||||
bool mockReadDirectoryForMultiDevice = false;
|
||||
|
||||
bool directoryExists(const std::string path) override {
|
||||
// disables fabric errors
|
||||
return false;
|
||||
}
|
||||
|
||||
ze_result_t listDirectory(const std::string directory, std::vector<std::string> &events) override {
|
||||
|
||||
if (mockListDirectoryStatus != ZE_RESULT_SUCCESS) {
|
||||
return mockListDirectoryStatus;
|
||||
}
|
||||
|
||||
if (mockReadDirectoryFailure == true) {
|
||||
return readDirectoryFailure(directory, events);
|
||||
}
|
||||
|
||||
if (mockReadDirectoryWithoutRasEvents == true) {
|
||||
return readDirectoryWithoutRasEvents(directory, events);
|
||||
}
|
||||
|
||||
if (mockReadDirectoryForMultiDevice == true) {
|
||||
return readDirectorySuccessForMultiDevice(directory, events);
|
||||
}
|
||||
|
||||
if (directory.compare(eventsDir) == 0) {
|
||||
events.push_back("bcs0-busy");
|
||||
events.push_back("error--correctable-eu-grf");
|
||||
events.push_back("error--correctable-eu-ic");
|
||||
events.push_back("error--soc-correctable-fabric-ss0-0");
|
||||
events.push_back("error--soc-correctable-hbm-ss0-1");
|
||||
events.push_back("error--soc-fatal-hbm-ss1-15");
|
||||
events.push_back("error--soc-fatal-mdfi-east");
|
||||
events.push_back("error--soc-nonfatal-psf-csc-0");
|
||||
events.push_back("error--fatal-eu-ic");
|
||||
events.push_back("error--fatal-tlb");
|
||||
events.push_back("error--engine-reset");
|
||||
events.push_back("error--eu-attention");
|
||||
events.push_back("error--driver-object-migration");
|
||||
events.push_back("error--driver-ggtt");
|
||||
events.push_back("error--driver-rps");
|
||||
events.push_back("error--fatal-fpu");
|
||||
events.push_back("error--fatal-l3-fabric");
|
||||
events.push_back("ccs0-busy");
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t readDirectoryWithoutRasEvents(const std::string directory, std::vector<std::string> &events) {
|
||||
if (directory.compare(eventsDir) == 0) {
|
||||
events.push_back("bcs0-busy");
|
||||
events.push_back("ccs0-busy");
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t readDirectorySuccessForMultiDevice(const std::string directory, std::vector<std::string> &events) {
|
||||
if (directory.compare(eventsDir) == 0) {
|
||||
events.push_back("bcs0-busy");
|
||||
events.push_back("error-gt0--correctable-eu-grf");
|
||||
events.push_back("error-gt0--correctable-eu-ic");
|
||||
events.push_back("error-gt0--soc-correctable-hbm-ss0-1");
|
||||
events.push_back("error-gt0--soc-correctable-fabric-ss0-0");
|
||||
events.push_back("error-gt0--soc-nonfatal-psf-csc-0");
|
||||
events.push_back("error-gt0--soc-fatal-hbm-ss1-15");
|
||||
events.push_back("error-gt0--soc-fatal-mdfi-east");
|
||||
events.push_back("error-gt0--fatal-eu-ic");
|
||||
events.push_back("error-gt0--fatal-tlb");
|
||||
events.push_back("error-gt0--engine-reset");
|
||||
events.push_back("error-gt0--eu-attention");
|
||||
events.push_back("error-gt0--fatal-fpu");
|
||||
events.push_back("error-gt0--fatal-l3-fabric");
|
||||
events.push_back("error--driver-object-migration");
|
||||
events.push_back("error-gt0--driver-ggtt");
|
||||
events.push_back("error-gt0--driver-rps");
|
||||
events.push_back("error-gt1--correctable-sampler");
|
||||
events.push_back("error-gt1--soc-correctable-fabric-ss1-0");
|
||||
events.push_back("error-gt1--soc-nonfatal-punit");
|
||||
events.push_back("error-gt1--soc-fatal-mdfi-west");
|
||||
events.push_back("error-gt1--fatal-guc");
|
||||
events.push_back("error-gt1--fatal-idi-parity");
|
||||
events.push_back("error-gt1--correctable-guc");
|
||||
events.push_back("error-gt1--engine-reset");
|
||||
events.push_back("error-gt1--eu-attention");
|
||||
events.push_back("error-gt1--driver-engine-other");
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t readDirectoryFailure(const std::string directory, std::vector<std::string> &events) {
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t read(const std::string file, std::string &config) override {
|
||||
|
||||
if (mockReadFileFailure == true) {
|
||||
return readFileFailure(file, config);
|
||||
}
|
||||
|
||||
config = "config=0x0000000000000001";
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
ze_result_t readFileFailure(const std::string, std::string &config) {
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
ze_result_t read(const std::string file, uint32_t &val) override {
|
||||
|
||||
if (mockReadVal == true) {
|
||||
return readValFailure(file, val);
|
||||
}
|
||||
|
||||
val = pmuDriverType;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
ze_result_t readValFailure(const std::string file, uint32_t &val) {
|
||||
val = 0;
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
bool isRootUser() override {
|
||||
|
||||
if (mockRootUser == true) {
|
||||
return userIsNotRoot();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool userIsNotRoot() {
|
||||
return false;
|
||||
}
|
||||
|
||||
Mock<RasFsAccess>() = default;
|
||||
};
|
||||
|
||||
// Empty FirmwareUtil subclass; used as the tag type of the Mock<RasFwInterface> specialization.
class RasFwInterface : public FirmwareUtil {};
|
||||
|
||||
template <>
|
||||
struct Mock<RasFwInterface> : public FirmwareUtil {
|
||||
|
||||
bool mockMemorySuccess = false;
|
||||
|
||||
ze_result_t mockGetMemoryErrorSuccess(zes_ras_error_type_t category, uint64_t subDeviceCount, uint64_t subDeviceId, uint64_t &count) {
|
||||
if (category == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
|
||||
count = hbmCorrectableErrorCount;
|
||||
}
|
||||
if (category == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
|
||||
count = hbmUncorrectableErrorCount;
|
||||
}
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
ze_result_t fwGetMemoryErrorCount(zes_ras_error_type_t category, uint32_t subDeviceCount, uint32_t subDeviceId, uint64_t &count) override {
|
||||
|
||||
if (mockMemorySuccess == true) {
|
||||
return mockGetMemoryErrorSuccess(category, subDeviceCount, subDeviceId, count);
|
||||
}
|
||||
|
||||
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
}
|
||||
|
||||
Mock<RasFwInterface>() = default;
|
||||
|
||||
ADDMETHOD_NOBASE(fwDeviceInit, ze_result_t, ZE_RESULT_SUCCESS, ());
|
||||
ADDMETHOD_NOBASE(getFirstDevice, ze_result_t, ZE_RESULT_SUCCESS, (igsc_device_info * info));
|
||||
ADDMETHOD_NOBASE(getFwVersion, ze_result_t, ZE_RESULT_SUCCESS, (std::string fwType, std::string &firmwareVersion));
|
||||
ADDMETHOD_NOBASE(flashFirmware, ze_result_t, ZE_RESULT_SUCCESS, (std::string fwType, void *pImage, uint32_t size));
|
||||
ADDMETHOD_NOBASE(fwIfrApplied, ze_result_t, ZE_RESULT_SUCCESS, (bool &ifrStatus));
|
||||
ADDMETHOD_NOBASE(fwSupportedDiagTests, ze_result_t, ZE_RESULT_SUCCESS, (std::vector<std::string> & supportedDiagTests));
|
||||
ADDMETHOD_NOBASE(fwRunDiagTests, ze_result_t, ZE_RESULT_SUCCESS, (std::string & osDiagType, zes_diag_result_t *pResult));
|
||||
ADDMETHOD_NOBASE_VOIDRETURN(getDeviceSupportedFwTypes, (std::vector<std::string> & fwTypes));
|
||||
ADDMETHOD_NOBASE(fwGetEccConfig, ze_result_t, ZE_RESULT_SUCCESS, (uint8_t * currentState, uint8_t *pendingState));
|
||||
ADDMETHOD_NOBASE(fwSetEccConfig, ze_result_t, ZE_RESULT_SUCCESS, (uint8_t newState, uint8_t *currentState, uint8_t *pendingState));
|
||||
};
|
||||
|
||||
class PublicLinuxRasImp : public L0::LinuxRasImp {
|
||||
public:
|
||||
PublicLinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId) {}
|
||||
using LinuxRasImp::pFsAccess;
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
|
@ -13,7 +13,6 @@ extern bool sysmanUltsEnable;
|
|||
|
||||
using ::testing::_;
|
||||
using ::testing::Matcher;
|
||||
using ::testing::NiceMock;
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
@ -21,7 +20,7 @@ namespace ult {
|
|||
constexpr uint32_t mockHandleCount = 0;
|
||||
struct SysmanRasFixture : public SysmanDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<Mock<RasFsAccess>> pFsAccess;
|
||||
std::unique_ptr<MockRasFsAccess> pFsAccess;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
void SetUp() override {
|
||||
|
@ -29,11 +28,10 @@ struct SysmanRasFixture : public SysmanDeviceFixture {
|
|||
GTEST_SKIP();
|
||||
}
|
||||
SysmanDeviceFixture::SetUp();
|
||||
pFsAccess = std::make_unique<NiceMock<Mock<RasFsAccess>>>();
|
||||
pFsAccess = std::make_unique<MockRasFsAccess>();
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
ON_CALL(*pFsAccess.get(), isRootUser())
|
||||
.WillByDefault(::testing::Invoke(pFsAccess.get(), &Mock<RasFsAccess>::userIsRoot));
|
||||
pFsAccess->mockRootUser = true;
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
|
@ -154,8 +152,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRa
|
|||
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) {
|
||||
ON_CALL(*pFsAccess.get(), isRootUser())
|
||||
.WillByDefault(::testing::Invoke(pFsAccess.get(), &Mock<RasFsAccess>::userIsNotRoot));
|
||||
pFsAccess->mockRootUser = false;
|
||||
RasImp *pTestRasImp = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle());
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pTestRasImp);
|
||||
|
||||
|
|
|
@ -0,0 +1,529 @@
|
|||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h"
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_fabric_prelim.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
// NOTE(review): presumably the number of RAS handles the fabric tests expect to enumerate — confirm against the test bodies.
constexpr uint32_t mockHandleCount = 2u;
|
||||
class TestRasFabricFixture : public SysmanDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<MockRasFabricFsAccess> pFsAccess;
|
||||
std::unique_ptr<MockRasFabricSysFsAccess> pSysfsAccess;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pOriginalFwUtilInterface = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<::testing::NiceMock<MockMemoryManagerInRasSysman>>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pFsAccess = std::make_unique<MockRasFabricFsAccess>();
|
||||
pSysfsAccess = std::make_unique<MockRasFabricSysFsAccess>();
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pOriginalFwUtilInterface = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pPmuInterface = nullptr;
|
||||
pLinuxSysmanImp->pFwUtilInterface = nullptr;
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
if (subDeviceCount == 0) {
|
||||
deviceHandles.resize(1, device->toHandle());
|
||||
} else {
|
||||
deviceHandles.resize(subDeviceCount, nullptr);
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
|
||||
}
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pOriginalFwUtilInterface;
|
||||
SysmanDeviceFixture::TearDown();
|
||||
}
|
||||
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
return handles;
|
||||
}
|
||||
};
|
||||
|
||||
// Exposes a complete set of fabric error nodes (sd.0, ports 1-8) through the
// mocked file-system access and checks the RAS state reported per handle.
TEST_F(TestRasFabricFixture, GivenValidRasFabricNodesThenGetStateIsSuccessful) {
    std::vector<std::string> dirs = {"/mockRealPath/iaf.0",
                                     "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/iaf.0/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.0/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.0/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.0/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.0/sd.0/port.1/link_degrades", 501},
        {"/mockRealPath/iaf.0/sd.0/port.2/link_failures", 601},
        {"/mockRealPath/iaf.0/sd.0/port.2/link_degrades", 701},
        {"/mockRealPath/iaf.0/sd.0/port.3/link_failures", 801},
        {"/mockRealPath/iaf.0/sd.0/port.3/link_degrades", 901},
        {"/mockRealPath/iaf.0/sd.0/port.4/link_failures", 1001},
        {"/mockRealPath/iaf.0/sd.0/port.4/link_degrades", 1101},
        {"/mockRealPath/iaf.0/sd.0/port.5/link_failures", 2101},
        {"/mockRealPath/iaf.0/sd.0/port.5/link_degrades", 3101},
        {"/mockRealPath/iaf.0/sd.0/port.6/link_failures", 4101},
        {"/mockRealPath/iaf.0/sd.0/port.6/link_degrades", 5101},
        {"/mockRealPath/iaf.0/sd.0/port.7/link_failures", 6101},
        {"/mockRealPath/iaf.0/sd.0/port.7/link_degrades", 7101},
        {"/mockRealPath/iaf.0/sd.0/port.8/link_failures", 8101},
        {"/mockRealPath/iaf.0/sd.0/port.8/link_degrades", 9101},
    };
    auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(dirs);
    pMockFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking with a count larger than available must report the real count.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        // The handle is handed to the API below, so a null handle must stop the test here.
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));

        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            // 27709 == fw_comm_errors + every link_degrades value configured above.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 27709u);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            // 23710 == sd_failure + fw_error + every link_failures value configured above.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 23710u);
        }
    }
}
|
||||
|
||||
// Sets the sysfs mock's real-path status to an error and expects the RAS
// enumeration to succeed while reporting zero error sets.
TEST_F(TestRasFabricFixture, GivenInValidRasFabricNodesThenEnumerationDoesNotReturnAnyHandles) {
    pSysfsAccess->mockRealPathStatus = ZE_RESULT_ERROR_UNKNOWN;

    uint32_t handleCount = 0;
    const ze_result_t enumResult = zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr);
    EXPECT_EQ(enumResult, ZE_RESULT_SUCCESS);
    EXPECT_EQ(handleCount, 0u);
}
|
||||
|
||||
// Same scenario as the platform-driver case, but the fabric nodes are exposed
// under the auxiliary-bus path ("i915.iaf.0" / "auxiliary:iaf").
TEST_F(TestRasFabricFixture, GivenValidRasFabricAuxiliaryNodesThenGetStateIsSuccessful) {
    std::vector<std::string> dirs = {"/mockRealPath/i915.iaf.0",
                                     "/sys/module/iaf/drivers/auxiliary:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/i915.iaf.0/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/i915.iaf.0/sd.0/sd_failure", 201},
        {"/mockRealPath/i915.iaf.0/sd.0/fw_error", 301},
        {"/mockRealPath/i915.iaf.0/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/i915.iaf.0/sd.0/port.1/link_degrades", 501},
    };
    auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(dirs);
    pMockFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking with a count larger than available must report the real count.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        // The handle is handed to the API below, so a null handle must stop the test here.
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));

        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            // 602 == fw_comm_errors (101) + link_degrades (501).
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 602u);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            // 903 == sd_failure (201) + fw_error (301) + link_failures (401).
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 903u);
        }
    }
}
|
||||
|
||||
// Exposes an incomplete node set: port.1/link_degrades and port.8/link_failures
// are absent. The expected totals are the full-set totals minus those two values.
TEST_F(TestRasFabricFixture, GivenSomeRasFabricNodesThenGetStateIsSuccessful) {
    std::vector<std::string> dirs = {"/mockRealPath/iaf.31",
                                     "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/iaf.31/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.31/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.31/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.31/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.31/sd.0/port.2/link_failures", 601},
        {"/mockRealPath/iaf.31/sd.0/port.2/link_degrades", 701},
        {"/mockRealPath/iaf.31/sd.0/port.3/link_failures", 801},
        {"/mockRealPath/iaf.31/sd.0/port.3/link_degrades", 901},
        {"/mockRealPath/iaf.31/sd.0/port.4/link_failures", 1001},
        {"/mockRealPath/iaf.31/sd.0/port.4/link_degrades", 1101},
        {"/mockRealPath/iaf.31/sd.0/port.5/link_failures", 2101},
        {"/mockRealPath/iaf.31/sd.0/port.5/link_degrades", 3101},
        {"/mockRealPath/iaf.31/sd.0/port.6/link_failures", 4101},
        {"/mockRealPath/iaf.31/sd.0/port.6/link_degrades", 5101},
        {"/mockRealPath/iaf.31/sd.0/port.7/link_failures", 6101},
        {"/mockRealPath/iaf.31/sd.0/port.7/link_degrades", 7101},
        {"/mockRealPath/iaf.31/sd.0/port.8/link_degrades", 9101},
    };
    auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(dirs);
    pMockFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking with a count larger than available must report the real count.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);
    for (auto handle : handles) {
        // The handle is handed to the API below, so a null handle must stop the test here.
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));

        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            // Full-set total minus the missing port.1 link_degrades value.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 27709u - 501u);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            // Full-set total minus the missing port.8 link_failures value.
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 23710u - 8101u);
        }
    }
}
|
||||
|
||||
// Reads the RAS state once without clearing, doubles every node value in the
// mock and reads again: the second read is expected to report doubled totals.
TEST_F(TestRasFabricFixture, GivenValidRasFabricNodesWhenGetStateIsCalledTwiceThenRasErrorCountIsDoubled) {
    std::vector<std::string> dirs = {"/mockRealPath/iaf.27",
                                     "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501},
    };
    std::map<std::string, uint64_t> nodesSecondRead = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101 * 2},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201 * 2},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501 * 2},
    };
    auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(dirs);
    pMockFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking with a count larger than available must report the real count.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);

    // First read (clear == 0) with the initial node values.
    for (auto handle : handles) {
        // The handle is handed to the API below, so a null handle must stop the test here.
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
    }

    // Second read after every node value has been doubled.
    pMockFsAccess->setAccessibleNodes(nodesSecondRead);
    for (auto handle : handles) {
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 602u * 2);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 903u * 2);
        }
    }
}
|
||||
|
||||
// Reads the RAS state once with clear == 1, doubles every node value in the
// mock and reads again without clearing: the second read is expected to report
// only the increase since the clearing read (602 / 903).
TEST_F(TestRasFabricFixture, GivenValidRasFabricNodesWhenGetStateIsCalledTwiceWithClearThenNewRasErrorCountIsRetrieved) {
    std::vector<std::string> dirs = {"/mockRealPath/iaf.27",
                                     "/sys/module/iaf/drivers/platform:iaf/"};
    std::map<std::string, uint64_t> nodes = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501},
    };
    std::map<std::string, uint64_t> nodesSecondRead = {
        {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 101 * 2},
        {"/mockRealPath/iaf.27/sd.0/sd_failure", 201 * 2},
        {"/mockRealPath/iaf.27/sd.0/fw_error", 301 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 401 * 2},
        {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 501 * 2},
    };
    auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pFsAccess.get());
    pMockFsAccess->setAccessibleDirectories(dirs);
    pMockFsAccess->setAccessibleNodes(nodes);

    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Asking with a count larger than available must report the real count.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCount);

    auto handles = getRasHandles(mockHandleCount);

    // First read with clear == 1 against the initial node values.
    for (auto handle : handles) {
        // The handle is handed to the API below, so a null handle must stop the test here.
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 1, &state));
    }

    // Second read (clear == 0) after every node value has been doubled.
    pMockFsAccess->setAccessibleNodes(nodesSecondRead);
    for (auto handle : handles) {
        ASSERT_NE(handle, nullptr);
        zes_ras_state_t state = {};
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);

        if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 602u);
        }
        if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
            EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 903u);
        }
    }
}
|
||||
|
||||
class SysmanRasFabricMultiDeviceFixture : public MultiDeviceFixture, public ::testing::Test {
|
||||
public:
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
MultiDeviceFixture::setUp();
|
||||
for (auto &device : driverHandle->devices) {
|
||||
auto neoDevice = device->getNEODevice();
|
||||
neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]->osInterface = std::make_unique<NEO::OSInterface>();
|
||||
auto &osInterface = device->getOsInterface();
|
||||
osInterface.setDriverModel(std::make_unique<SysmanMockDrm>(const_cast<NEO::RootDeviceEnvironment &>(neoDevice->getRootDeviceEnvironment())));
|
||||
setenv("ZES_ENABLE_SYSMAN", "1", 1);
|
||||
delete device->getSysmanHandle();
|
||||
device->setSysmanHandle(new SysmanDeviceImp(device->toHandle()));
|
||||
auto pSysmanDevice = device->getSysmanHandle();
|
||||
for (auto &subDevice : static_cast<DeviceImp *>(device)->subDevices) {
|
||||
static_cast<DeviceImp *>(subDevice)->setSysmanHandle(pSysmanDevice);
|
||||
}
|
||||
|
||||
auto pSysmanDeviceImp = static_cast<SysmanDeviceImp *>(pSysmanDevice);
|
||||
auto pOsSysman = pSysmanDeviceImp->pOsSysman;
|
||||
auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
|
||||
|
||||
pSysmanDeviceImp->init();
|
||||
|
||||
delete pLinuxSysmanImp->pFwUtilInterface;
|
||||
delete pLinuxSysmanImp->pSysfsAccess;
|
||||
delete pLinuxSysmanImp->pProcfsAccess;
|
||||
delete pLinuxSysmanImp->pFsAccess;
|
||||
|
||||
auto pProcfsAccess = new NiceMock<Mock<LinuxProcfsAccess>>();
|
||||
auto pFsAccess = new MockRasFabricFsAccess();
|
||||
auto pSysfsAccess = new MockRasFabricSysFsAccess();
|
||||
|
||||
pLinuxSysmanImp->pFwUtilInterface = nullptr;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess;
|
||||
pLinuxSysmanImp->pProcfsAccess = pProcfsAccess;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess;
|
||||
}
|
||||
}
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
|
||||
for (auto &device : driverHandle->devices) {
|
||||
auto pSysmanDevice = device->getSysmanHandle();
|
||||
auto pSysmanDeviceImp = static_cast<SysmanDeviceImp *>(pSysmanDevice);
|
||||
auto pOsSysman = pSysmanDeviceImp->pOsSysman;
|
||||
auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
|
||||
|
||||
delete pLinuxSysmanImp->pSysfsAccess;
|
||||
delete pLinuxSysmanImp->pProcfsAccess;
|
||||
delete pLinuxSysmanImp->pFsAccess;
|
||||
|
||||
pLinuxSysmanImp->pFwUtilInterface = nullptr;
|
||||
pLinuxSysmanImp->pSysfsAccess = nullptr;
|
||||
pLinuxSysmanImp->pProcfsAccess = nullptr;
|
||||
pLinuxSysmanImp->pFsAccess = nullptr;
|
||||
|
||||
delete pSysmanDevice;
|
||||
device->setSysmanHandle(nullptr);
|
||||
}
|
||||
|
||||
unsetenv("ZES_ENABLE_SYSMAN");
|
||||
MultiDeviceFixture::tearDown();
|
||||
}
|
||||
};
|
||||
|
||||
// Configures different fabric error counts for two devices with two sub-devices
// each and checks that every RAS handle reports the totals of its own
// device / sub-device pair.
TEST_F(SysmanRasFabricMultiDeviceFixture, GivenValidRasFabricNodesForMultipleDevicesThenGetStateReturnsErrorCountSpecificToEachOfDevice) {
    // Device and sub-device counts are tracked separately even though both are 2 here.
    const uint32_t testUseDeviceCount = 2u;
    const uint32_t testUseSubDeviceCount = 2u;
    ASSERT_GE(numRootDevices, testUseDeviceCount);
    ASSERT_GE(numSubDevices, testUseSubDeviceCount);

    std::vector<std::string> dirs = {"/mockRealPath/iaf.27",
                                     "/sys/module/iaf/drivers/platform:iaf/"};

    // Installs the directory list and the given node values into the mocked
    // file-system access of the device at `deviceIndex`.
    const auto installMockNodes = [&](uint32_t deviceIndex, std::map<std::string, uint64_t> nodes) {
        auto pOsSysman = static_cast<SysmanDeviceImp *>(driverHandle->devices[deviceIndex]->getSysmanHandle())->pOsSysman;
        auto pLinuxSysmanImp = static_cast<PublicLinuxSysmanImp *>(pOsSysman);
        auto pMockFsAccess = static_cast<MockRasFabricFsAccess *>(pLinuxSysmanImp->pFsAccess);
        pMockFsAccess->setAccessibleDirectories(dirs);
        pMockFsAccess->setAccessibleNodes(nodes);
    };

    installMockNodes(0u, {
                             {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 1},
                             {"/mockRealPath/iaf.27/sd.0/sd_failure", 1},
                             {"/mockRealPath/iaf.27/sd.0/fw_error", 1},
                             {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 1},
                             {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 1},

                             {"/mockRealPath/iaf.27/sd.1/fw_comm_errors", 2},
                             {"/mockRealPath/iaf.27/sd.1/sd_failure", 2},
                             {"/mockRealPath/iaf.27/sd.1/fw_error", 2},
                             {"/mockRealPath/iaf.27/sd.1/port.1/link_failures", 2},
                             {"/mockRealPath/iaf.27/sd.1/port.1/link_degrades", 2},
                         });

    installMockNodes(1u, {
                             {"/mockRealPath/iaf.27/sd.0/fw_comm_errors", 3},
                             {"/mockRealPath/iaf.27/sd.0/sd_failure", 3},
                             {"/mockRealPath/iaf.27/sd.0/fw_error", 3},
                             {"/mockRealPath/iaf.27/sd.0/port.1/link_failures", 3},
                             {"/mockRealPath/iaf.27/sd.0/port.1/link_degrades", 3},

                             {"/mockRealPath/iaf.27/sd.1/fw_comm_errors", 4},
                             {"/mockRealPath/iaf.27/sd.1/sd_failure", 4},
                             {"/mockRealPath/iaf.27/sd.1/fw_error", 4},
                             {"/mockRealPath/iaf.27/sd.1/port.1/link_failures", 4},
                             {"/mockRealPath/iaf.27/sd.1/port.1/link_degrades", 4},
                         });

    // {correctable, uncorrectable} totals, indexed by
    // deviceIndex * testUseSubDeviceCount + subdeviceId.
    const std::vector<std::pair<uint32_t, uint32_t>> errorCounts{
        {2, 3},  // Device 0, subdevice 0
        {4, 6},  // Device 0, subdevice 1
        {6, 9},  // Device 1, subdevice 0
        {8, 12}, // Device 1, subdevice 1
    };

    for (uint32_t deviceIndex = 0; deviceIndex < testUseDeviceCount; deviceIndex++) {
        uint32_t count = 0;
        auto hDevice = driverHandle->devices[deviceIndex]->toHandle();
        EXPECT_EQ(zesDeviceEnumRasErrorSets(hDevice, &count, nullptr), ZE_RESULT_SUCCESS);
        EXPECT_GT(count, 0u);
        std::vector<zes_ras_handle_t> handles(count, nullptr);
        EXPECT_EQ(zesDeviceEnumRasErrorSets(hDevice, &count, handles.data()), ZE_RESULT_SUCCESS);
        for (auto handle : handles) {
            zes_ras_state_t state = {};
            zes_ras_properties_t properties = {};
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(handle, 0, &state));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(handle, &properties));

            // A sub-device id outside the configured range would select another
            // device's entry or index past the end of errorCounts.
            ASSERT_LT(properties.subdeviceId, testUseSubDeviceCount);
            const auto accessIndex = deviceIndex * testUseSubDeviceCount + properties.subdeviceId;
            if (properties.type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
                EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], errorCounts[accessIndex].first);
            }
            if (properties.type == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
                EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], errorCounts[accessIndex].second);
            }
        }
    }
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
|
@ -0,0 +1,758 @@
|
|||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h"
|
||||
#include "level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h"
|
||||
|
||||
extern bool sysmanUltsEnable;
|
||||
|
||||
using ::testing::_;
|
||||
using ::testing::DoDefault;
|
||||
using ::testing::Matcher;
|
||||
using ::testing::NiceMock;
|
||||
using ::testing::Return;
|
||||
class OsRas;
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
// Number of RAS error-set handles the tests in this file expect
// zesDeviceEnumRasErrorSets to report for the fixture device.
constexpr uint32_t mockHandleCount = 2u;
|
||||
// NOTE(review): presumably the expected RAS handle count when the device
// exposes sub-devices; confirm against the tests that use it.
constexpr uint32_t mockHandleCountForSubDevice = 4u;
|
||||
struct SysmanRasFixture : public SysmanDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<Mock<RasFsAccess>> pFsAccess;
|
||||
std::unique_ptr<Mock<RasSysfsAccess>> pSysfsAccess;
|
||||
std::unique_ptr<Mock<MockPmuInterfaceImpForRas>> pPmuInterface;
|
||||
std::unique_ptr<Mock<RasFwInterface>> pRasFwUtilInterface;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pFwUtilOriginal = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<::testing::NiceMock<MockMemoryManagerInRasSysman>>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pFsAccess = std::make_unique<NiceMock<Mock<RasFsAccess>>>();
|
||||
pSysfsAccess = std::make_unique<NiceMock<Mock<RasSysfsAccess>>>();
|
||||
pRasFwUtilInterface = std::make_unique<NiceMock<Mock<RasFwInterface>>>();
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
|
||||
pPmuInterface = std::make_unique<NiceMock<Mock<MockPmuInterfaceImpForRas>>>(pLinuxSysmanImp);
|
||||
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
|
||||
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
if (subDeviceCount == 0) {
|
||||
deviceHandles.resize(1, device->toHandle());
|
||||
} else {
|
||||
deviceHandles.resize(subDeviceCount, nullptr);
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
|
||||
}
|
||||
}
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
|
||||
SysmanDeviceFixture::TearDown();
|
||||
}
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
return handles;
|
||||
}
|
||||
};
|
||||
|
||||
// Enumeration must report mockHandleCount error sets, both for a zero count
// and for a count that is larger than what is available, and every returned
// handle must be non-null.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesInThenSuccessReturn) {
    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, mockHandleCount);

    uint32_t oversizedCount = handleCount + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr));
    EXPECT_EQ(oversizedCount, mockHandleCount);

    const auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        EXPECT_NE(rasHandle, nullptr);
    }
}
|
||||
|
||||
// Properties of every RAS handle: not on a sub-device, sub-device id 0, no
// extension chain. The first handle is expected to be the correctable set,
// every following one uncorrectable.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    bool expectCorrectable = true;

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_properties_t rasProperties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &rasProperties));
        EXPECT_EQ(rasProperties.pNext, nullptr);
        EXPECT_EQ(rasProperties.onSubdevice, false);
        EXPECT_EQ(rasProperties.subdeviceId, 0u);

        if (!expectCorrectable) {
            EXPECT_EQ(rasProperties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
            continue;
        }
        EXPECT_EQ(rasProperties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
        expectCorrectable = false;
    }
}
|
||||
|
||||
// With mockReadSymLinkResult set on the sysfs mock, the GT RAS source must not
// report any supported error type.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfReadSymLinkFailsThenNoSupportedErrorTypeIsReturned) {
    pSysfsAccess->mockReadSymLinkResult = true;

    std::set<zes_ras_error_type_t> supportedTypes = {};
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device->toHandle());

    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
// With mockReadDirectoryFailure set on the file-system mock, the GT RAS source
// must not report any supported error type.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForGtAndIfListDirectoryFailsThenNoSupportedErrorTypeIsReturned) {
    pFsAccess->mockReadDirectoryFailure = true;

    std::set<zes_ras_error_type_t> supportedTypes = {};
    LinuxRasSourceGt::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);

    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
// Without a firmware-util interface the HBM RAS source must not report any
// supported error type.
TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErrorsForHbmAndFwInterfaceIsAbsentThenNoSupportedErrorTypeIsReturned) {
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    std::set<zes_ras_error_type_t> supportedTypes = {};
    LinuxRasSourceHbm::getSupportedRasErrorTypes(supportedTypes, pOsSysman, device);

    EXPECT_EQ(supportedTypes.size(), 0u);
}
|
||||
|
||||
// Re-initialises the RAS handle context with mockReadDirectoryWithoutRasEvents
// set on the file-system mock and no firmware-util interface; enumeration must
// then report zero error sets.
TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) {
    pFsAccess->mockReadDirectoryWithoutRasEvents = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Rebuild the handle list under the conditions configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    uint32_t handleCount = 0;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &handleCount, nullptr));
    EXPECT_EQ(handleCount, 0u);

    uint32_t oversizedCount = handleCount + 1;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumRasErrorSets(device->toHandle(), &oversizedCount, nullptr));
    EXPECT_EQ(oversizedCount, 0u);
}
|
||||
|
||||
// Re-initialises the RAS handle context with mockPmuReadCorrectable set on the
// PMU mock and checks every category of the reported state. The first handle
// is checked against the correctable expectations, the rest against the
// uncorrectable ones.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadCorrectable = true;

    // Rebuild the handle list under the condition configured above.
    auto &rasHandleList = pSysmanDeviceImp->pRasHandleContext->handleList;
    for (const auto &pRasHandle : rasHandleList) {
        delete pRasHandle;
    }
    rasHandleList.clear();
    pSysmanDeviceImp->pRasHandleContext->init(deviceHandles);

    bool expectCorrectable = true;
    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));

        if (!expectCorrectable) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalEuErrorCount + fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], initialUncorrectableComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socNonFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            continue;
        }

        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableCacheErrors);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableFabricSs0_0Count + initialCorrectableNonComputeErrors);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        expectCorrectable = false;
    }
}
|
||||
|
||||
// With the PMU mock's mockPmuReadAfterClear flag set, every RAS error set is read twice:
// first with clear = 0, where the counters must match the mock totals, and then with
// clear = 1, where every checked category must read back as zero.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterClearThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadAfterClear = true;

    // Throw away the existing handles and run init() again after the mock flag was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    const ze_bool_t keepCounters = 0;
    const ze_bool_t clearCounters = 1;

    // Pass 1: the first handle is compared with the correctable totals,
    // every following handle with the uncorrectable totals.
    uint32_t handleIndex = 0u;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, keepCounters, &rasState));
        if (handleIndex == 0u) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableFabricSs0_0Count + initialCorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
        } else {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalEuErrorCount + fatalTlb + initialUncorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], initialUncorrectableComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socNonFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
        }
        handleIndex++;
    }

    // Pass 2: with clear = 1 the expectation is the same for every handle - all checked categories are zero.
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clearCounters, &rasState));
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
    }
}
|
||||
|
||||
// Sets mockPmuReadResult on the PMU mock and mockMemorySuccess on the firmware mock, rebuilds
// the RAS handles and checks that the NON_COMPUTE category reports the HBM counts.
// NOTE(review): other tests in this file pair mockPmuReadResult = true with "PmuReadFails",
// so presumably only the firmware (HBM) path contributes here - confirm against the mocks.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Throw away the existing handles and run init() again after the mock flags were changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);

    // First handle -> correctable count, every following handle -> uncorrectable count.
    uint32_t handleIndex = 0u;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));
        if (handleIndex == 0u) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        } else {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
        }
        handleIndex++;
    }
}
|
||||
|
||||
// HBM variant of the clear test: the NON_COMPUTE category must report the HBM counts when read
// with clear = 0 and must read back as zero when read with clear = 1.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithClearThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Throw away the existing handles and run init() again after the mock flags were changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCount);
    const ze_bool_t keepCounters = 0;
    const ze_bool_t clearCounters = 1;

    // Pass 1: first handle -> correctable count, every following handle -> uncorrectable count.
    uint32_t handleIndex = 0u;
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, keepCounters, &rasState));
        if (handleIndex == 0u) {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
        } else {
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
        }
        handleIndex++;
    }

    // Pass 2: with clear = 1 the expectation is zero for every handle.
    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, clearCounters, &rasState));
        EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u);
    }
}
|
||||
|
||||
// Requests a clearing read (clear = 1) on every handle and expects
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateWithClearOptionWithoutPermissionsThenFailureIsReturned) {
    // NOTE(review): the test is named "WithoutPermissions" yet sets mockRootUser = true;
    // presumably the mock interprets this flag as "simulate a non-root caller" - confirm
    // against the mock's isRootUser() implementation.
    pFsAccess->mockRootUser = true;

    const ze_bool_t clearCounters = 1;
    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, clearCounters, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, result);
    }
}
|
||||
|
||||
// Sets mockReadFileFailure on the filesystem mock, rebuilds the RAS handles and expects
// zesRasGetState to return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndUnableToRetrieveConfigValuesAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockReadFileFailure = true;

    // Throw away the existing handles and run init() again after the mock flag was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Sets mockPerfEvent on the PMU mock, rebuilds the RAS handles and expects zesRasGetState to
// return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPerfEventOpenFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPerfEvent = true;

    // Throw away the existing handles and run init() again after the mock flag was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Sets mockPmuReadResult on the PMU mock (handles are not rebuilt) and expects a non-clearing
// zesRasGetState to return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPmuReadResult = true;

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Same setup as the non-clearing PmuReadFails test, but reads with clear = 1;
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE is still expected for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceWithClearAndPmuReadFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pPmuInterface->mockPmuReadResult = true;

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 1, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Sets mockReadVal on the filesystem mock, rebuilds the RAS handles and expects zesRasGetState
// to return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterfaceAndPMUGetEventTypeFailsAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockReadVal = true;

    // Throw away the existing handles and run init() again after the mock flag was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Sets mockReadVal on the filesystem mock and removes the firmware utility interface
// (pFwUtilInterface = nullptr) before rebuilding the RAS handles; zesRasGetState must then
// return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for every handle.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenFailureIsReturned) {
    pFsAccess->mockReadVal = true;
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    // Throw away the existing handles and run init() again with the interfaces configured above.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Round-trips a RAS configuration: whatever is written with zesRasSetConfig (total threshold
// and the raw bytes of the per-category thresholds) must be returned by zesRasGetConfig.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetConfigAfterzesRasSetConfigThenSuccessIsReturned) {
    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_config_t setConfig = {};
        zes_ras_config_t getConfig = {};

        setConfig.totalThreshold = 50;
        // Every byte of the per-category threshold array is filled with the value 1.
        memset(setConfig.detailedThresholds.category, 1, sizeof(setConfig.detailedThresholds.category));

        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasSetConfig(rasHandle, &setConfig));
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetConfig(rasHandle, &getConfig));

        EXPECT_EQ(setConfig.totalThreshold, getConfig.totalThreshold);
        const int thresholdDiff = std::memcmp(setConfig.detailedThresholds.category, getConfig.detailedThresholds.category, sizeof(setConfig.detailedThresholds.category));
        EXPECT_EQ(0, thresholdDiff);
    }
}
|
||||
|
||||
// Attempts zesRasSetConfig on every handle and expects ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasSetConfigWithoutPermissionThenFailureIsReturned) {
    // NOTE(review): the test is named "WithoutPermission" yet sets mockRootUser = true;
    // presumably the mock interprets this flag as "simulate a non-root caller" - confirm
    // against the mock's isRootUser() implementation.
    pFsAccess->mockRootUser = true;

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_config_t setConfig = {};
        setConfig.totalThreshold = 50;
        // Every byte of the per-category threshold array is filled with the value 1.
        memset(setConfig.detailedThresholds.category, 1, sizeof(setConfig.detailedThresholds.category));

        const ze_result_t result = zesRasSetConfig(rasHandle, &setConfig);
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, result);
    }
}
|
||||
|
||||
// Sets mockReadSymLinkStatus on the sysfs mock to ZE_RESULT_ERROR_NOT_AVAILABLE, rebuilds the
// RAS handles and expects zesRasGetState to return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Throw away the existing handles and run init() again after the mock status was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Sets mockReadSymLinkStatus on the sysfs mock to ZE_RESULT_ERROR_NOT_AVAILABLE, rebuilds the
// RAS handles and expects zesRasGetState to return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
// NOTE(review): the statements here are the same as in the ...ReadSymLinkFailsDuringInit...
// test; verify that this really exercises a separate "inside getEventOpen" path.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndReadSymLinkFailsInsideGetEventOpenAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pSysfsAccess->mockReadSymLinkStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Throw away the existing handles and run init() again after the mock status was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Sets mockListDirectoryStatus on the filesystem mock to ZE_RESULT_ERROR_NOT_AVAILABLE, rebuilds
// the RAS handles and expects zesRasGetState to return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterfaceAndListDirectoryFailsDuringInitAndOtherInterfacesAreAbsentThenFailureIsReturned) {
    pFsAccess->mockListDirectoryStatus = ZE_RESULT_ERROR_NOT_AVAILABLE;

    // Throw away the existing handles and run init() again after the mock status was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    for (auto rasHandle : getRasHandles(mockHandleCount)) {
        zes_ras_state_t rasState = {};
        const ze_result_t result = zesRasGetState(rasHandle, 0, &rasState);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
    }
}
|
||||
|
||||
// Verifies that the RAS handle count reported by zesDeviceEnumRasErrorSets is unchanged after
// the handle list has been emptied and reInitSysmanDeviceResources() has been called.
TEST_F(SysmanRasFixture, GivenValidRasHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumRasErrorSetsSucceeds) {
    // Baseline: query only the count (null handle array).
    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    // Empty the handle list so that the context holds zero handles before re-initialisation.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();

    pLinuxSysmanImp->reInitSysmanDeviceResources();

    // After re-initialisation the same number of handles must be reported again.
    count = 0;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);
}
|
||||
|
||||
struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<Mock<RasFsAccess>> pFsAccess;
|
||||
std::unique_ptr<Mock<RasSysfsAccess>> pSysfsAccess;
|
||||
std::unique_ptr<Mock<MockPmuInterfaceImpForRas>> pPmuInterface;
|
||||
MemoryManager *pMemoryManagerOriginal = nullptr;
|
||||
std::unique_ptr<MockMemoryManagerInRasSysman> pMemoryManager;
|
||||
std::unique_ptr<Mock<RasFwInterface>> pRasFwUtilInterface;
|
||||
FsAccess *pFsAccessOriginal = nullptr;
|
||||
SysfsAccess *pSysfsAccessOriginal = nullptr;
|
||||
PmuInterface *pOriginalPmuInterface = nullptr;
|
||||
FirmwareUtil *pFwUtilOriginal = nullptr;
|
||||
std::vector<ze_device_handle_t> deviceHandles;
|
||||
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanMultiDeviceFixture::SetUp();
|
||||
pMemoryManagerOriginal = device->getDriverHandle()->getMemoryManager();
|
||||
pMemoryManager = std::make_unique<::testing::NiceMock<MockMemoryManagerInRasSysman>>(*neoDevice->getExecutionEnvironment());
|
||||
pMemoryManager->localMemorySupported[0] = true;
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManager.get());
|
||||
pFsAccess = std::make_unique<NiceMock<Mock<RasFsAccess>>>();
|
||||
pSysfsAccess = std::make_unique<NiceMock<Mock<RasSysfsAccess>>>();
|
||||
pRasFwUtilInterface = std::make_unique<NiceMock<Mock<RasFwInterface>>>();
|
||||
pFsAccessOriginal = pLinuxSysmanImp->pFsAccess;
|
||||
pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess;
|
||||
pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface;
|
||||
pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface;
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
|
||||
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
|
||||
pPmuInterface = std::make_unique<NiceMock<Mock<MockPmuInterfaceImpForRas>>>(pLinuxSysmanImp);
|
||||
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
|
||||
|
||||
pFsAccess->mockReadDirectoryForMultiDevice = true;
|
||||
|
||||
for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
pSysmanDeviceImp->pRasHandleContext->handleList.clear();
|
||||
uint32_t subDeviceCount = 0;
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, nullptr);
|
||||
if (subDeviceCount == 0) {
|
||||
deviceHandles.resize(1, device->toHandle());
|
||||
} else {
|
||||
deviceHandles.resize(subDeviceCount, nullptr);
|
||||
Device::fromHandle(device->toHandle())->getSubDevices(&subDeviceCount, deviceHandles.data());
|
||||
}
|
||||
}
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
device->getDriverHandle()->setMemoryManager(pMemoryManagerOriginal);
|
||||
pLinuxSysmanImp->pFsAccess = pFsAccessOriginal;
|
||||
pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal;
|
||||
pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface;
|
||||
pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal;
|
||||
SysmanMultiDeviceFixture::TearDown();
|
||||
}
|
||||
std::vector<zes_ras_handle_t> getRasHandles(uint32_t count) {
|
||||
std::vector<zes_ras_handle_t> handles(count, nullptr);
|
||||
EXPECT_EQ(zesDeviceEnumRasErrorSets(device->toHandle(), &count, handles.data()), ZE_RESULT_SUCCESS);
|
||||
return handles;
|
||||
}
|
||||
};
|
||||
// Builds a stand-alone RasHandleContext on top of the fixture's OS sysman object and checks
// that a count-only rasGet() query succeeds and reports at least one handle.
TEST_F(SysmanMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) {
    // Owned by a unique_ptr so the context is released on every exit path.
    auto pRasHandleContext = std::make_unique<RasHandleContext>(pSysmanDeviceImp->pOsSysman);

    uint32_t count = 0;
    const ze_result_t result = pRasHandleContext->rasGet(&count, nullptr);

    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_GT(count, 0u);
}
|
||||
// Checks zesDeviceEnumRasErrorSets on the multi-device fixture: a count-only query reports
// mockHandleCountForSubDevice, an over-sized count is reduced to that value, and every
// enumerated handle is non-null.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesThenSuccessIsReturned) {
    uint32_t count = 0;
    ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCountForSubDevice);

    // Ask for one handle more than is available; the returned count must be the real one.
    uint32_t testcount = count + 1;
    result = zesDeviceEnumRasErrorSets(device->toHandle(), &testcount, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(testcount, mockHandleCountForSubDevice);

    for (auto rasHandle : getRasHandles(mockHandleCountForSubDevice)) {
        EXPECT_NE(rasHandle, nullptr);
    }
}
|
||||
// For every device handle collected by the fixture, constructs a correctable-type
// PublicLinuxRasImp from that device's properties and checks that osRasGetProperties()
// returns the same sub-device id, sub-device flag and error type.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThenSuccessIsReturned) {
    for (auto deviceHandle : deviceHandles) {
        zes_ras_properties_t properties = {};
        ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
        Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
        const bool isSubDevice = (deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE) != 0;

        // Owned by a unique_ptr so the object is released on every exit path of the iteration.
        auto pLinuxRasImp = std::make_unique<PublicLinuxRasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, isSubDevice, deviceProperties.subdeviceId);

        EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxRasImp->osRasGetProperties(properties));
        EXPECT_EQ(properties.subdeviceId, deviceProperties.subdeviceId);
        EXPECT_EQ(properties.onSubdevice, isSubDevice);
        EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
    }
}
|
||||
|
||||
// With the PMU mock's mockPmuReadTile flag set, reads the RAS state of each handle on the
// multi-device fixture and compares every category with the per-tile mock totals.
// Handles 0/1 are checked against the tile 0 values and handles 2/3 against the tile 1 values.
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadTile = true;

    // Throw away the existing handles and run init() again after the mock flag was changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    uint32_t handleIndex = 0u;

    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));

        switch (handleIndex) {
        case 0u: // No. of correctable error type for subdevice 0
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableGrfErrorCountTile0 + correctableEuErrorCountTile0 + initialCorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableHbmSs0_1CountTile0 + initialCorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        case 1u: // No. of uncorrectable error type for subdevice 0
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalEuErrorCountTile0 + initialUncorrectableCacheErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile0 + initialEngineReset);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile0 + initialProgrammingErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalFpuTile0 + FatalL3FabricTile0 + initialUncorrectableComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socNonFatalPsfCsc0CountTile0 + socFatalHbmSs1_15CountTile0 + initialUncorrectableNonComputeErrors);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
            break;
        case 2u: // No. of correctable error type for subdevice 1
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], correctableSamplerErrorCountTile1 + correctableGucErrorCountTile1 + initialCorrectableCacheErrorsTile1);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socCorrectableFabricSs1_0CountTile1 + initialCorrectableNonComputeErrorsTile1);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u);
            break;
        case 3u: // No. of uncorrectable error type for subdevice 1
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalGucErrorCountTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCountTile1 + initialEngineResetTile1);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttentionTile1 + initialProgrammingErrorsTile1);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socNonFatalPunitCountTile1 + initialUncorrectableNonComputeErrorsTile1);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1);
            break;
        default:
            // Handles beyond the fourth are only required to return success.
            break;
        }
        handleIndex++;
    }
}
|
||||
|
||||
// Multi-device HBM test: with mockPmuReadResult set on the PMU mock and mockMemorySuccess set on
// the firmware mock, the NON_COMPUTE category of each handle must report the HBM counts
// (correctable for handles 0 and 2, uncorrectable for handles 1 and 3).
TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSuccessIsReturned) {
    pPmuInterface->mockPmuReadResult = true;
    pRasFwUtilInterface->mockMemorySuccess = true;

    // Throw away the existing handles and run init() again after the mock flags were changed.
    auto &rasContext = *pSysmanDeviceImp->pRasHandleContext;
    for (auto *pStaleHandle : rasContext.handleList) {
        delete pStaleHandle;
    }
    rasContext.handleList.clear();
    rasContext.init(deviceHandles);

    auto rasHandles = getRasHandles(mockHandleCountForSubDevice);
    uint32_t handleIndex = 0u;

    for (auto rasHandle : rasHandles) {
        zes_ras_state_t rasState = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetState(rasHandle, 0, &rasState));

        switch (handleIndex) {
        case 0u: // No. of correctable error type for subdevice 0
        case 2u: // No. of correctable error type for subdevice 1
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmCorrectableErrorCount);
            break;
        case 1u: // No. of uncorrectable error type for subdevice 0
        case 3u: // No. of uncorrectable error type for subdevice 1
            EXPECT_EQ(rasState.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], hbmUncorrectableErrorCount);
            break;
        default:
            // Handles beyond the fourth are only required to return success.
            break;
        }
        handleIndex++;
    }
}
|
||||
|
||||
class SysmanRasAffinityMaskFixture : public SysmanRasMultiDeviceFixture {
|
||||
void SetUp() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
NEO::DebugManager.flags.ZE_AFFINITY_MASK.set("0.1");
|
||||
SysmanRasMultiDeviceFixture::SetUp();
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (!sysmanUltsEnable) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
SysmanRasMultiDeviceFixture::TearDown();
|
||||
}
|
||||
DebugManagerStateRestore restorer;
|
||||
};
|
||||
|
||||
// With the affinity mask fixture, mockHandleCount RAS handles must be enumerated and every
// handle must report sub-device 1; the first handle is the correctable error set and the
// second one the uncorrectable error set.
TEST_F(SysmanRasAffinityMaskFixture, GivenAffinityMaskIsSetWhenCallingRasPropertiesThenPropertiesAreReturnedForTheSubDevicesAccordingToAffinityMask) {
    uint32_t count = 0;
    const ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    EXPECT_EQ(count, mockHandleCount);

    auto rasHandles = getRasHandles(mockHandleCount);
    uint32_t handleIndex = 0u;
    for (auto rasHandle : rasHandles) {
        zes_ras_properties_t properties = {};
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetProperties(rasHandle, &properties));
        EXPECT_EQ(properties.pNext, nullptr);
        EXPECT_EQ(properties.onSubdevice, true);
        EXPECT_EQ(properties.subdeviceId, 1u); // Affinity mask 0.1 is set which means only subdevice 1 is exposed

        if (handleIndex == 0u) {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_CORRECTABLE);
        } else if (handleIndex == 1u) {
            EXPECT_EQ(properties.type, ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
        }
        handleIndex++;
    }
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
Loading…
Reference in New Issue