From 3816b85fa03d11edf8a8737777a9690071a5caec Mon Sep 17 00:00:00 2001 From: Mayank Raghuwanshi Date: Tue, 14 Feb 2023 12:30:28 +0000 Subject: [PATCH] Add check for memory type before calculating ras hbm errors Related-To: LOCI-3500 Signed-off-by: Mayank Raghuwanshi --- .../source/sysman/linux/os_sysman_imp.cpp | 16 ++++ .../tools/source/sysman/linux/os_sysman_imp.h | 4 + .../sysman/ras/linux/os_ras_imp_prelim.cpp | 20 ++++- level_zero/tools/source/sysman/sysman_const.h | 1 + .../sources/sysman/events/linux/mock_events.h | 41 ++++++++++ .../sysman/events/linux/test_zes_events.cpp | 7 ++ .../sysman/linux/mock_sysman_fixture.h | 1 + .../sources/sysman/linux/test_sysman.cpp | 5 +- .../sysman/ras/linux/mock_fs_ras_prelim.h | 39 ++++++++++ .../sysman/ras/linux/test_zes_ras_prelim.cpp | 77 +++++++++++++++---- 10 files changed, 192 insertions(+), 19 deletions(-) diff --git a/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp b/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp index 66783b437b..6ffe6bba5a 100644 --- a/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp +++ b/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp @@ -12,6 +12,7 @@ #include "shared/source/helpers/sleep.h" #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/os_interface/device_factory.h" +#include "shared/source/os_interface/linux/system_info.h" #include "level_zero/core/source/device/device_imp.h" #include "level_zero/core/source/driver/driver_handle_imp.h" @@ -51,6 +52,7 @@ ze_result_t LinuxSysmanImp::init() { DEBUG_BREAK_IF(nullptr == pPmuInterface); + getMemoryType(); return createPmtHandles(); } @@ -472,6 +474,20 @@ ze_result_t LinuxSysmanImp::osColdReset() { return ZE_RESULT_ERROR_DEVICE_LOST; // incase the reset fails inform upper layers. } +uint32_t LinuxSysmanImp::getMemoryType() { + if (isMemTypeRetrieved == false) { + auto pDrm = &getDrm(); + if (pDrm->querySystemInfo()) { + auto memSystemInfo = pDrm->getSystemInfo(); + if (memSystemInfo != nullptr) { + memType = memSystemInfo->getMemoryType(); + isMemTypeRetrieved = true; + } + } + } + return memType; +} + OsSysman *OsSysman::create(SysmanDeviceImp *pParentSysmanDeviceImp) { LinuxSysmanImp *pLinuxSysmanImp = new LinuxSysmanImp(pParentSysmanDeviceImp); return static_cast(pLinuxSysmanImp); diff --git a/level_zero/tools/source/sysman/linux/os_sysman_imp.h b/level_zero/tools/source/sysman/linux/os_sysman_imp.h index b37f82dd6c..0d03e19a25 100644 --- a/level_zero/tools/source/sysman/linux/os_sysman_imp.h +++ b/level_zero/tools/source/sysman/linux/os_sysman_imp.h @@ -16,6 +16,7 @@ #include "level_zero/tools/source/sysman/linux/pmt/pmt.h" #include "level_zero/tools/source/sysman/linux/pmu/pmu_imp.h" #include "level_zero/tools/source/sysman/linux/udev/udev_lib.h" +#include "level_zero/tools/source/sysman/sysman_const.h" #include "level_zero/tools/source/sysman/sysman_imp.h" #include @@ -58,6 +59,7 @@ class LinuxSysmanImp : public OsSysman, NEO::NonCopyableOrMovableClass { ze_device_handle_t getCoreDeviceHandle() override; SysmanDeviceImp *getSysmanDeviceImp(); std::string getPciCardBusDirectoryPath(std::string realPciPath); + uint32_t getMemoryType(); static std::string getPciRootPortDirectoryPath(std::string realPciPath); void releasePmtObject(); ze_result_t createPmtHandles(); @@ -94,8 +96,10 @@ class LinuxSysmanImp : public OsSysman, NEO::NonCopyableOrMovableClass { L0::UdevLib *pUdevLib = nullptr; std::map mapOfSubDeviceIdToPmtObject; ze_result_t initLocalDeviceAndDrmHandles(); + uint32_t memType = unknownMemoryType; private: + bool isMemTypeRetrieved = false; LinuxSysmanImp() = delete; SysmanDeviceImp *pParentSysmanDeviceImp = nullptr; static const std::string deviceDir; diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp index 728881d134..c2646d419e 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp @@ -8,17 +8,31 @@ #include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" #include "shared/source/helpers/string.h" +#include "shared/source/os_interface/linux/system_info.h" #include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" +#include "drm/intel_hwconfig_types.h" + namespace L0 { +static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) { + uint32_t memType = pLinuxSysmanImp->getMemoryType(); + if (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2) { + return true; + } + return false; +} + void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) { constexpr auto maxErrorTypes = 2; LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); if (errorType.size() < maxErrorTypes) { - LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + auto pLinuxSysmanImp = static_cast(pOsSysman); + if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { + LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + } } } @@ -69,7 +83,9 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) void LinuxRasImp::initSources() { rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId)); - rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); + if (isMemoryTypeHbm(pLinuxSysmanImp) == true) { + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); + } } LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { diff --git a/level_zero/tools/source/sysman/sysman_const.h b/level_zero/tools/source/sysman/sysman_const.h index ea7b6a066a..6a860d49ff 100644 --- a/level_zero/tools/source/sysman/sysman_const.h +++ b/level_zero/tools/source/sysman/sysman_const.h @@ -59,4 +59,5 @@ constexpr uint64_t gigaUnitTransferToUnitTransfer = 1000 * 1000 * 1000; constexpr int32_t memoryBusWidth = 128; // bus width in bytes constexpr int32_t numMemoryChannels = 8; +constexpr uint32_t unknownMemoryType = UINT32_MAX; #define BITS(x, at, width) (((x) >> (at)) & ((1 << (width)) - 1)) diff --git a/level_zero/tools/test/unit_tests/sources/sysman/events/linux/mock_events.h b/level_zero/tools/test/unit_tests/sources/sysman/events/linux/mock_events.h index f808b995fc..ee793f2844 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/events/linux/mock_events.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/events/linux/mock_events.h @@ -6,6 +6,10 @@ */ #pragma once +#include "shared/source/os_interface/linux/drm_neo.h" +#include "shared/source/os_interface/linux/ioctl_helper.h" +#include "shared/source/os_interface/linux/system_info.h" +#include "shared/source/os_interface/os_interface.h" #include "shared/test/common/test_macros/mock_method_macros.h" #include "level_zero/tools/source/sysman/events/events_imp.h" @@ -13,6 +17,9 @@ #include "level_zero/tools/source/sysman/firmware_util/firmware_util.h" #include "level_zero/tools/source/sysman/linux/os_sysman_driver_imp.h" +#include "drm/intel_hwconfig_types.h" + +using namespace NEO; namespace L0 { namespace ult { @@ -227,6 +234,40 @@ struct MockEventsFwInterface : public FirmwareUtil { ADDMETHOD_NOBASE_VOIDRETURN(fwGetMemoryHealthIndicator, (zes_mem_health_t * health)); }; +struct MockEventNeoDrm : public Drm { + using Drm::ioctlHelper; + uint32_t mockMemoryType = INTEL_HWCONFIG_MEMORY_TYPE_HBM2e; + const int mockFd = 33; + std::vector mockQuerySystemInfoReturnValue{}; + bool isRepeated = false; + bool mockReturnEmptyRegions = false; + MockEventNeoDrm(RootDeviceEnvironment &rootDeviceEnvironment) : Drm(std::make_unique(mockFd, ""), rootDeviceEnvironment) {} + + void setMemoryType(uint32_t memory) { + mockMemoryType = memory; + } + + std::vector getMemoryRegionsReturnsEmpty() { + return {}; + } + + bool querySystemInfo() override { + bool returnValue = true; + if (!mockQuerySystemInfoReturnValue.empty()) { + returnValue = mockQuerySystemInfoReturnValue.front(); + if (isRepeated != true) { + mockQuerySystemInfoReturnValue.erase(mockQuerySystemInfoReturnValue.begin()); + } + return returnValue; + } + + uint32_t hwBlob[] = {INTEL_HWCONFIG_MAX_MEMORY_CHANNELS, 1, 8, INTEL_HWCONFIG_MEMORY_TYPE, 0, mockMemoryType}; + std::vector inputBlobData(reinterpret_cast(hwBlob), reinterpret_cast(hwBlob) + sizeof(hwBlob)); + this->systemInfo.reset(new SystemInfo(inputBlobData)); + return returnValue; + } +}; + class PublicLinuxEventsImp : public L0::LinuxEventsImp { public: PublicLinuxEventsImp(OsSysman *pOsSysman) : LinuxEventsImp(pOsSysman) {} diff --git a/level_zero/tools/test/unit_tests/sources/sysman/events/linux/test_zes_events.cpp b/level_zero/tools/test/unit_tests/sources/sysman/events/linux/test_zes_events.cpp index 9fb6f21dda..cdc1f60d6f 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/events/linux/test_zes_events.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/events/linux/test_zes_events.cpp @@ -20,6 +20,8 @@ constexpr int drmDeviceFd = 0; class SysmanEventsFixture : public SysmanDeviceFixture { protected: std::unique_ptr pFsAccess; + std::unique_ptr pDrm; + Drm *pOriginalDrm = nullptr; FsAccess *pFsAccessOriginal = nullptr; OsEvents *pOsEventsPrev = nullptr; L0::EventsImp *pEventsImp; @@ -38,6 +40,10 @@ class SysmanEventsFixture : public SysmanDeviceFixture { pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; pFsAccess = std::make_unique(); pLinuxSysmanImp->pFsAccess = pFsAccess.get(); + pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); + pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + pLinuxSysmanImp->pDrm = pDrm.get(); pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; pSysfsAccess = std::make_unique(); @@ -81,6 +87,7 @@ class SysmanEventsFixture : public SysmanDeviceFixture { pEventsImp = nullptr; pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; pLinuxSysmanImp->pFsAccess = pFsAccessOriginal; + pLinuxSysmanImp->pDrm = pOriginalDrm; pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; SysmanDeviceFixture::TearDown(); diff --git a/level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h b/level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h index 58be59c499..f34000c59b 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h @@ -38,6 +38,7 @@ class SysmanMockDrm : public Drm { class PublicLinuxSysmanImp : public L0::LinuxSysmanImp { public: using LinuxSysmanImp::mapOfSubDeviceIdToPmtObject; + using LinuxSysmanImp::memType; using LinuxSysmanImp::pDrm; using LinuxSysmanImp::pFsAccess; using LinuxSysmanImp::pFwUtilInterface; diff --git a/level_zero/tools/test/unit_tests/sources/sysman/linux/test_sysman.cpp b/level_zero/tools/test/unit_tests/sources/sysman/linux/test_sysman.cpp index b0c1f90dcb..16decf8527 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/linux/test_sysman.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/linux/test_sysman.cpp @@ -15,6 +15,8 @@ #include "level_zero/tools/source/sysman/ras/ras_imp.h" #include "level_zero/tools/test/unit_tests/sources/sysman/linux/mock_sysman_fixture.h" +#include "drm/intel_hwconfig_types.h" + namespace NEO { namespace SysCalls { extern bool allowFakeDevicePath; @@ -633,10 +635,11 @@ TEST_F(SysmanDeviceFixture, GivenValidEnumeratedHandlesWhenReleaseIsCalledThenHa count = 0; RasImp *pRas = new RasImp(pSysmanDeviceImp->pRasHandleContext->pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle()); + pLinuxSysmanImp->memType = INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4; pSysmanDeviceImp->pRasHandleContext->handleList.push_back(pRas); result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); EXPECT_EQ(ZE_RESULT_SUCCESS, result); - EXPECT_EQ(count, 3u); + EXPECT_EQ(count, 1u); pLinuxSysmanImp->releaseSysmanDeviceResources(); diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h index d0253a6517..78f1cc1281 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h @@ -6,6 +6,9 @@ */ #pragma once +#include "shared/source/os_interface/linux/ioctl_helper.h" +#include "shared/source/os_interface/linux/system_info.h" + #include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h" #include "level_zero/tools/source/sysman/linux/fs_access.h" #include "level_zero/tools/source/sysman/linux/os_sysman_imp.h" @@ -14,6 +17,8 @@ #include "level_zero/tools/source/sysman/ras/ras.h" #include "level_zero/tools/source/sysman/ras/ras_imp.h" +#include "drm/intel_hwconfig_types.h" + using namespace NEO; namespace L0 { namespace ult { @@ -651,6 +656,40 @@ struct MockRasFwInterface : public FirmwareUtil { ADDMETHOD_NOBASE_VOIDRETURN(fwGetMemoryHealthIndicator, (zes_mem_health_t * health)); }; +struct MockRasNeoDrm : public Drm { + using Drm::ioctlHelper; + uint32_t mockMemoryType = INTEL_HWCONFIG_MEMORY_TYPE_HBM2e; + const int mockFd = 33; + std::vector mockQuerySystemInfoReturnValue{}; + bool isRepeated = false; + bool mockReturnEmptyRegions = false; + MockRasNeoDrm(RootDeviceEnvironment &rootDeviceEnvironment) : Drm(std::make_unique(mockFd, ""), rootDeviceEnvironment) {} + + void setMemoryType(uint32_t memory) { + mockMemoryType = memory; + } + + std::vector getMemoryRegionsReturnsEmpty() { + return {}; + } + + bool querySystemInfo() override { + bool returnValue = true; + if (!mockQuerySystemInfoReturnValue.empty()) { + returnValue = mockQuerySystemInfoReturnValue.front(); + if (isRepeated != true) { + mockQuerySystemInfoReturnValue.erase(mockQuerySystemInfoReturnValue.begin()); + } + return returnValue; + } + + uint32_t hwBlob[] = {INTEL_HWCONFIG_MAX_MEMORY_CHANNELS, 1, 8, INTEL_HWCONFIG_MEMORY_TYPE, 0, mockMemoryType}; + std::vector inputBlobData(reinterpret_cast(hwBlob), reinterpret_cast(hwBlob) + sizeof(hwBlob)); + this->systemInfo.reset(new SystemInfo(inputBlobData)); + return returnValue; + } +}; + class PublicLinuxRasImp : public L0::LinuxRasImp { public: PublicLinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId) {} diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp index ac80ef920b..96644043e6 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp @@ -21,9 +21,11 @@ struct SysmanRasFixture : public SysmanDeviceFixture { std::unique_ptr pSysfsAccess; std::unique_ptr pPmuInterface; std::unique_ptr pRasFwUtilInterface; + std::unique_ptr pDrm; MemoryManager *pMemoryManagerOriginal = nullptr; std::unique_ptr pMemoryManager; FsAccess *pFsAccessOriginal = nullptr; + Drm *pOriginalDrm = nullptr; SysfsAccess *pSysfsAccessOriginal = nullptr; PmuInterface *pOriginalPmuInterface = nullptr; FirmwareUtil *pFwUtilOriginal = nullptr; @@ -41,15 +43,20 @@ struct SysmanRasFixture : public SysmanDeviceFixture { pFsAccess = std::make_unique(); pSysfsAccess = std::make_unique(); pRasFwUtilInterface = std::make_unique(); + pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); + pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); pFsAccessOriginal = pLinuxSysmanImp->pFsAccess; pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; + pOriginalDrm = pLinuxSysmanImp->pDrm; pLinuxSysmanImp->pFsAccess = pFsAccess.get(); pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); pPmuInterface = std::make_unique(pLinuxSysmanImp); pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + pLinuxSysmanImp->pDrm = pDrm.get(); for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { delete handle; @@ -74,6 +81,7 @@ struct SysmanRasFixture : public SysmanDeviceFixture { pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; + pLinuxSysmanImp->pDrm = pOriginalDrm; SysmanDeviceFixture::TearDown(); } std::vector getRasHandles(uint32_t count) { @@ -145,7 +153,6 @@ TEST_F(SysmanRasFixture, GivenValidOsSysmanPointerWhenRetrievingSupportedRasErro } TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentThenZeroHandlesAreCreated) { - pFsAccess->mockReadDirectoryWithoutRasEvents = true; pLinuxSysmanImp->pFwUtilInterface = nullptr; @@ -153,7 +160,6 @@ TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEven delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); uint32_t count = 0; ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); EXPECT_EQ(ZE_RESULT_SUCCESS, result); @@ -164,6 +170,50 @@ TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEven EXPECT_EQ(testcount, 0u); } +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAndHbmAreAbsentThenZeroHandlesAreCreated) { + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_LPDDR4); + pRasFwUtilInterface->mockMemorySuccess = true; + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfHbmAndFwInterfaceArePresentThenSuccessIsReturned) { + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2); + pRasFwUtilInterface->mockMemorySuccess = true; + + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, mockHandleCount); +} + +TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEventsAreAbsentAndQuerySystemInfoSucceedsButMemSysInfoIsNullThenZeroHandlesAreCreated) { + pFsAccess->mockReadDirectoryWithoutRasEvents = true; + pDrm->mockQuerySystemInfoReturnValue.push_back(true); + + pLinuxSysmanImp->pFwUtilInterface = nullptr; + for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { + delete handle; + } + pSysmanDeviceImp->pRasHandleContext->handleList.clear(); + uint32_t count = 0; + ze_result_t result = zesDeviceEnumRasErrorSets(device->toHandle(), &count, NULL); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(count, 0u); +} + TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) { pPmuInterface->mockPmuReadCorrectable = true; @@ -172,7 +222,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuc delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); bool correctable = true; for (auto handle : handles) { @@ -209,7 +258,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterCl } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); bool correctable = true; ze_bool_t clear = 0; @@ -272,7 +320,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmThenSu } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); bool correctable = true; @@ -297,7 +344,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForHbmWithCl delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); bool correctable = true; ze_bool_t clear = 0; @@ -346,7 +392,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterf delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -362,7 +407,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterf delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -400,7 +444,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateForGtInterf delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -417,7 +460,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesGetRasStateAndFirmware delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -463,7 +505,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterf delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -479,7 +520,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterf delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -495,7 +535,6 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtInterf delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCount); for (auto handle : handles) { zes_ras_state_t state = {}; @@ -530,10 +569,12 @@ struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { MemoryManager *pMemoryManagerOriginal = nullptr; std::unique_ptr pMemoryManager; std::unique_ptr pRasFwUtilInterface; + std::unique_ptr pDrm; FsAccess *pFsAccessOriginal = nullptr; SysfsAccess *pSysfsAccessOriginal = nullptr; PmuInterface *pOriginalPmuInterface = nullptr; FirmwareUtil *pFwUtilOriginal = nullptr; + Drm *pOriginalDrm = nullptr; std::vector deviceHandles; void SetUp() override { @@ -545,6 +586,8 @@ struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { pMemoryManager = std::make_unique(*neoDevice->getExecutionEnvironment()); pMemoryManager->localMemorySupported[0] = true; device->getDriverHandle()->setMemoryManager(pMemoryManager.get()); + pDrm = std::make_unique(const_cast(neoDevice->getRootDeviceEnvironment())); + pDrm->ioctlHelper = static_cast>(std::make_unique(*pDrm)); pFsAccess = std::make_unique(); pSysfsAccess = std::make_unique(); pRasFwUtilInterface = std::make_unique(); @@ -552,11 +595,14 @@ struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { pSysfsAccessOriginal = pLinuxSysmanImp->pSysfsAccess; pOriginalPmuInterface = pLinuxSysmanImp->pPmuInterface; pFwUtilOriginal = pLinuxSysmanImp->pFwUtilInterface; + pOriginalDrm = pLinuxSysmanImp->pDrm; pLinuxSysmanImp->pFsAccess = pFsAccess.get(); pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get(); pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get(); pPmuInterface = std::make_unique(pLinuxSysmanImp); pLinuxSysmanImp->pPmuInterface = pPmuInterface.get(); + pDrm->setMemoryType(INTEL_HWCONFIG_MEMORY_TYPE_HBM2e); + pLinuxSysmanImp->pDrm = pDrm.get(); pFsAccess->mockReadDirectoryForMultiDevice = true; @@ -583,6 +629,7 @@ struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { pLinuxSysmanImp->pSysfsAccess = pSysfsAccessOriginal; pLinuxSysmanImp->pPmuInterface = pOriginalPmuInterface; pLinuxSysmanImp->pFwUtilInterface = pFwUtilOriginal; + pLinuxSysmanImp->pDrm = pOriginalDrm; SysmanMultiDeviceFixture::TearDown(); } std::vector getRasHandles(uint32_t count) { @@ -591,7 +638,7 @@ struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture { return handles; } }; -TEST_F(SysmanMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) { +TEST_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned) { RasHandleContext *pRasHandleContext = new RasHandleContext(pSysmanDeviceImp->pOsSysman); uint32_t count = 0; ze_result_t result = pRasHandleContext->rasGet(&count, nullptr); @@ -638,7 +685,6 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCountForSubDevice); uint32_t handleIndex = 0u; for (auto handle : handles) { @@ -692,7 +738,6 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF delete handle; } pSysmanDeviceImp->pRasHandleContext->handleList.clear(); - pSysmanDeviceImp->pRasHandleContext->init(deviceHandles); auto handles = getRasHandles(mockHandleCountForSubDevice); uint32_t handleIndex = 0u;