diff --git a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp
index 0941938cf0..e6d5f8ce88 100644
--- a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp
+++ b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp
@@ -345,6 +345,21 @@ ze_result_t LinuxGlobalOperationsImp::resetImpl(ze_bool_t force, zes_reset_type_
     for (auto &&pid : deviceUsingPids) {
         while (pProcfsAccess->isAlive(pid)) {
             if (std::chrono::duration_cast(end - start).count() > resetTimeout) {
+
+                if (resetType == ZES_RESET_TYPE_FLR || resetType == ZES_RESET_TYPE_COLD) { // timeout path: only FLR and cold resets re-bind here
+                    result = pSysfsAccess->bindDevice(resetName);
+                    if (ZE_RESULT_SUCCESS != result) {
+                        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to bind the device to the kernel driver and returning error:0x%x \n", __FUNCTION__, result);
+                        return result; // the bind failure is reported instead of the in-use timeout
+                    }
+                }
+
+                result = pLinuxSysmanImp->reInitSysmanDeviceResources(); // re-initialise sysman resources before reporting the timeout
+                if (ZE_RESULT_SUCCESS != result) {
+                    NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to init the device and returning error:0x%x \n", __FUNCTION__, result);
+                    return result; // the re-init failure is reported instead of the in-use timeout
+                }
+
+                NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Timeout reached, device still in use and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
                 return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
             }
diff --git a/level_zero/sysman/source/shared/linux/zes_os_sysman_imp.cpp b/level_zero/sysman/source/shared/linux/zes_os_sysman_imp.cpp
index 5fab52c72d..33ac109a26 100644
--- a/level_zero/sysman/source/shared/linux/zes_os_sysman_imp.cpp
+++ b/level_zero/sysman/source/shared/linux/zes_os_sysman_imp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2023 Intel Corporation
+ * Copyright (C) 2023-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -466,6 +466,7 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
                               "Card Bus remove after resizing VF bar failed\n");
             return result;
         }
+        NEO::sleep(std::chrono::seconds(10)); // Sleep for 10 seconds to make sure that the config spaces of all devices are saved correctly.
 
         result = pFsAccess->write(rootPortPath + '/' + "rescan", "1");
         if (ZE_RESULT_SUCCESS != result) {
@@ -473,6 +474,7 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
                               "Rescanning root port failed after resizing VF bar failed\n");
             return result;
         }
+        NEO::sleep(std::chrono::seconds(10)); // Sleep for 10 seconds, allows the rescan to complete on all devices attached to the root port.
     }
     return result;
 }
diff --git a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h
index 4366779ae2..259963df0b 100644
--- a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h
+++ b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h
@@ -635,6 +635,12 @@ struct MockGlobalOpsLinuxSysmanImp : public L0::Sysman::LinuxSysmanImp {
     void setMockInitDeviceError(ze_result_t result) {
         mockInitDeviceError = result;
     }
+    ze_result_t reInitSysmanDeviceResources() override { // test double: returns the error injected via setMockInitDeviceError, if any
+        if (mockInitDeviceError != ZE_RESULT_SUCCESS) {
+            return mockInitDeviceError;
+        }
+        return ZE_RESULT_SUCCESS;
+    }
 };
 
 constexpr int mockFdGlobalOperations = 33;
diff --git a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp
index 7791e5558b..c2ebd8c54e 100644
--- a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp
+++ b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp
@@ -54,6 +54,7 @@ class SysmanGlobalOperationsFixture : public SysmanDeviceFixture {
     std::unique_ptr pSysfsAccess;
     std::unique_ptr pProcfsAccess;
     std::unique_ptr pFsAccess;
+    std::unique_ptr pMockGlobalOpsLinuxSysmanImp;
     L0::Sysman::EngineHandleContext *pEngineHandleContextOld = nullptr;
     L0::Sysman::DiagnosticsHandleContext *pDiagnosticsHandleContextOld = nullptr;
     L0::Sysman::FirmwareHandleContext *pFirmwareHandleContextOld = nullptr;
@@ -85,6 +86,7 @@ class SysmanGlobalOperationsFixture : public SysmanDeviceFixture {
         pDiagnosticsHandleContext = std::make_unique(pOsSysman);
         pFirmwareHandleContext = std::make_unique(pOsSysman);
         pRasHandleContext = std::make_unique(pOsSysman);
+        pMockGlobalOpsLinuxSysmanImp = std::make_unique(pLinuxSysmanImp->getSysmanDeviceImp());
 
         auto pDrmLocal = new DrmGlobalOpsMock(const_cast(pSysmanDeviceImp->getRootDeviceEnvironment()));
         pDrmLocal->setupIoctlHelper(pSysmanDeviceImp->getRootDeviceEnvironment().getHardwareInfo()->platform.eProductFamily);
@@ -772,6 +774,22 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetE
     EXPECT_EQ(pFsAccess->mockFlrValue, "1");
 }
 
+TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingZesDeviceResetExtForColdResetThenErrorIsReturned) {
+    initGlobalOps();
+    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
+    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->resetTimeout = 0; // timeout immediate
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
+    pProcfsAccess->isRepeated.push_back(false);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
+    pProcfsAccess->isRepeated.push_back(true);
+    pProcfsAccess->mockNoKill = true;
+    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE; // the injected bind error is what the reset call must return
+    zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_COLD};
+    ze_result_t result = zesDeviceResetExt(device, &pProperties);
+    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, result);
+}
+
 TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) {
     init(true);
     DebugManagerStateRestore dbgRestore;
@@ -836,6 +854,51 @@ TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseWhenCallingReset
     EXPECT_EQ(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, result);
 }
 
+TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseAndBindingFailsDuringResetWhenCallingResetThenErrorIsReturned) {
+
+    initGlobalOps();
+
+    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
+    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;
+
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->resetTimeout = 0; // timeout immediate
+
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
+    pProcfsAccess->isRepeated.push_back(false);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
+    pProcfsAccess->isRepeated.push_back(true);
+    pProcfsAccess->mockNoKill = true;
+    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE; // the injected bind error is what the reset call must return
+    ze_result_t result = zesDeviceReset(device, true);
+    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, result);
+}
+
+TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseAndReInitFailsDuringResetWhenCallingResetThenErrorIsReturned) {
+
+    initGlobalOps();
+    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
+    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;
+
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->resetTimeout = 0; // timeout immediate
+
+    pMockGlobalOpsLinuxSysmanImp->pProcfsAccess = pProcfsAccess.get();
+    pMockGlobalOpsLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
+    pMockGlobalOpsLinuxSysmanImp->pFsAccess = pFsAccess.get();
+
+    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
+    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDeviceFd; // fd field takes the procfs mock's fd, not its pid
+    pMockGlobalOpsLinuxSysmanImp->setMockInitDeviceError(ZE_RESULT_ERROR_UNKNOWN);
+
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
+    pProcfsAccess->isRepeated.push_back(false);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
+    pProcfsAccess->isRepeated.push_back(true);
+    pProcfsAccess->mockNoKill = true;
+    ze_result_t result = zesDeviceReset(device, true);
+    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, result);
+}
+
 TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceNotInUseWhenCallingResetThenSuccessIsReturned) {
 
     // Pretend we have the device open
diff --git a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp
index 69a6825b92..c9648b611d 100644
--- a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp
+++ b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp
@@ -265,9 +265,25 @@ ze_result_t LinuxGlobalOperationsImp::resetImpl(ze_bool_t force, zes_reset_type_
     for (auto &&pid : deviceUsingPids) {
         while (pProcfsAccess->isAlive(pid)) {
             if (std::chrono::duration_cast(end - start).count() > resetTimeout) {
+
+                if (resetType == ZES_RESET_TYPE_FLR || resetType == ZES_RESET_TYPE_COLD) { // timeout path: only FLR and cold resets re-bind here
+                    result = pSysfsAccess->bindDevice(resetName);
+                    if (ZE_RESULT_SUCCESS != result) {
+                        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to bind the device to the kernel driver and returning error:0x%x \n", __FUNCTION__, result);
+                        return result; // the bind failure is reported instead of the in-use timeout
+                    }
+                }
+
+                result = pLinuxSysmanImp->initDevice(); // re-initialise the device before reporting the timeout
+                if (ZE_RESULT_SUCCESS != result) {
+                    NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to init the device and returning error:0x%x \n", __FUNCTION__, result);
+                    return result; // the init failure is reported instead of the in-use timeout
+                }
+
+                NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Timeout reached, device still in use and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
                 return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
             }
+
             struct ::timespec timeout = {.tv_sec = 0, .tv_nsec = 1000};
             ::nanosleep(&timeout, NULL);
             end = std::chrono::steady_clock::now();
diff --git a/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp b/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp
index 23b9a30365..583bb4f258 100644
--- a/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp
+++ b/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2023 Intel Corporation
+ * Copyright (C) 2020-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -516,6 +516,7 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
                               "Card Bus remove after resizing VF bar failed\n");
             return result;
         }
+        NEO::sleep(std::chrono::seconds(10)); // Sleep for 10 seconds to make sure that the config spaces of all devices are saved correctly.
 
         result = pFsAccess->write(rootPortPath + '/' + "rescan", "1");
         if (ZE_RESULT_SUCCESS != result) {
@@ -523,6 +524,7 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
                               "Rescanning root port failed after resizing VF bar failed\n");
             return result;
         }
+        NEO::sleep(std::chrono::seconds(10)); // Sleep for 10 seconds, allows the rescan to complete on all devices attached to the root port.
     }
     return result;
 }
diff --git a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp
index 98c77d5a6e..7c9ee045a7 100644
--- a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp
+++ b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp
@@ -168,6 +168,26 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetE
     EXPECT_EQ(ZE_RESULT_SUCCESS, result);
 }
 
+TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingZesDeviceResetExtForColdResetThenErrorIsReturned) {
+    initGlobalOps();
+    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
+    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->resetTimeout = 0; // timeout immediate
+    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
+    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDeviceFd; // fd field takes the procfs mock's fd, not its pid
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
+    pProcfsAccess->isRepeated.push_back(false);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
+    pProcfsAccess->isRepeated.push_back(true);
+    pProcfsAccess->mockNoKill = true;
+    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE; // the injected bind error is what the reset call must return
+    zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_COLD};
+    ze_result_t result = zesDeviceResetExt(device, &pProperties);
+    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, result);
+}
+
 TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) {
     DebugManagerStateRestore dbgRestore;
     debugManager.flags.VfBarResourceAllocationWa.set(false);
@@ -837,6 +857,51 @@ TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseWhenCallingReset
     EXPECT_EQ(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, result);
 }
 
+TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseAndBindingFailsDuringResetWhenCallingResetThenErrorIsReturned) {
+
+    initGlobalOps();
+    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0]; // make sure it isn't our process id
+    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;
+
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->resetTimeout = 0; // timeout immediate
+
+    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
+    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDeviceFd; // fd field takes the procfs mock's fd, not its pid
+
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
+    pProcfsAccess->isRepeated.push_back(false);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
+    pProcfsAccess->isRepeated.push_back(true);
+    pProcfsAccess->mockNoKill = true;
+    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE; // the injected bind error is what the reset call must return
+    ze_result_t result = zesDeviceReset(device, true);
+    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, result);
+}
+
+TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseAndReInitFailsDuringResetWhenCallingResetThenErrorIsReturned) {
+
+    initGlobalOps();
+    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0]; // make sure it isn't our process id
+    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;
+
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle();
+    static_cast(pGlobalOperationsImp->pOsGlobalOperations)->resetTimeout = 0; // timeout immediate
+
+    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
+    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDeviceFd; // fd field takes the procfs mock's fd, not its pid
+    pMockGlobalOpsLinuxSysmanImp->setMockInitDeviceError(ZE_RESULT_ERROR_UNKNOWN);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
+    pProcfsAccess->isRepeated.push_back(false);
+    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
+    pProcfsAccess->isRepeated.push_back(true);
+    pProcfsAccess->mockNoKill = true;
+    ze_result_t result = zesDeviceReset(device, true);
+    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, result);
+}
+
 TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceNotInUseWhenCallingResetThenSuccessIsReturned) {
 
     // Pretend we have the device open