diff --git a/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.cpp b/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.cpp index 51cd6de4fb..22855c5406 100644 --- a/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.cpp +++ b/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.cpp @@ -54,7 +54,7 @@ ze_result_t LinuxDiagnosticsImp::waitForQuiescentCompletion() { // Sleep for 1second every loop, gives enough time for KMD to clear all allocations and wedge the system NEO::sleep(std::chrono::seconds(1)); - auto processResult = pLinuxSysmanImp->gpuProcessCleanup(); + auto processResult = pLinuxSysmanImp->gpuProcessCleanup(true); if (ZE_RESULT_SUCCESS != processResult) { return processResult; } @@ -75,7 +75,7 @@ ze_result_t LinuxDiagnosticsImp::waitForQuiescentCompletion() { ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW(zes_diag_result_t *pResult) { pLinuxSysmanImp->diagnosticsReset = true; pLinuxSysmanImp->releaseSysmanDeviceResources(); - ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(); + ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(true); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): gpuProcessCleanup() failed and returning error:0x%x \n", __FUNCTION__, result); return result; diff --git a/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.h b/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.h index 00fb83966e..42125858f2 100644 --- a/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.h +++ b/level_zero/sysman/source/api/diagnostics/linux/sysman_os_diagnostics_imp.h @@ -33,7 +33,6 @@ class LinuxDiagnosticsImp : public OsDiagnostics, NEO::NonCopyableOrMovableClass FirmwareUtil *pFwInterface = nullptr; SysfsAccess *pSysfsAccess = nullptr; FsAccess *pFsAccess = nullptr; - ze_result_t gpuProcessCleanup(); ze_result_t waitForQuiescentCompletion(); private: diff --git a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp index 21bc9f56ab..299bb4be89 100644 --- a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp +++ b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp @@ -277,56 +277,33 @@ bool LinuxGlobalOperationsImp::getUuid(std::arraygetParentSysmanDeviceImp()->getHardwareInfo(); + auto resetType = hwInfo.capabilityTable.isIntegratedDevice ? ZES_RESET_TYPE_FLR : ZES_RESET_TYPE_WARM; + + return resetImpl(force, resetType); +} + +ze_result_t LinuxGlobalOperationsImp::resetExt(zes_reset_properties_t *pProperties) { + return resetImpl(pProperties->force, pProperties->resetType); +} + +ze_result_t LinuxGlobalOperationsImp::resetImpl(ze_bool_t force, zes_reset_type_t resetType) { if (!pSysfsAccess->isRootUser()) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Not running as root user and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; } + pLinuxSysmanImp->releaseSysmanDeviceResources(); - std::string resetPath; - std::string resetName; - ze_result_t result = ZE_RESULT_SUCCESS; - - ::pid_t myPid = pProcfsAccess->myProcessId(); - std::vector myPidFds; - std::vector<::pid_t> processes; - - result = pProcfsAccess->listProcesses(processes); + ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(force); if (ZE_RESULT_SUCCESS != result) { - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Unable to list processes and returning error:0x%x \n", __FUNCTION__, result); + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): gpuProcessCleanup() failed and returning error:0x%x \n", __FUNCTION__, result); return result; } - for (auto &&pid : processes) { - std::vector fds; - pLinuxSysmanImp->getPidFdsForOpenDevice(pid, fds); - if (pid == myPid) { - // L0 is expected to have this file open. - // Keep list of fds. Close before unbind. - myPidFds = fds; - } else if (!fds.empty()) { - if (force) { - pProcfsAccess->kill(pid); - } else { - // Device is in use by another process. - // Don't reset while in use. - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Device in use by another process, not resetting and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); - return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE; - } - } - } + std::string resetName; pSysfsAccess->getRealPath(deviceDir, resetName); resetName = pFsAccess->getBaseName(resetName); - for (auto &&fd : myPidFds) { - // Close open filedescriptors to the device - // before unbinding device. - // From this point forward, there is no - // graceful way to fail the reset call. - // All future ze calls by this process for this - // device will fail. - ::close(fd); - } - // Unbind the device from the kernel driver. result = pSysfsAccess->unbindDevice(resetName); if (ZE_RESULT_SUCCESS != result) { @@ -334,8 +311,7 @@ ze_result_t LinuxGlobalOperationsImp::reset(ze_bool_t force) { return result; } - // If someone opened the device - // after we check, kill them here. + std::vector<::pid_t> processes; result = pProcfsAccess->listProcesses(processes); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to list processes and returning error:0x%x \n", __FUNCTION__, result); @@ -370,18 +346,22 @@ ze_result_t LinuxGlobalOperationsImp::reset(ze_bool_t force) { } } - if (!pLinuxSysmanImp->getParentSysmanDeviceImp()->getHardwareInfo().capabilityTable.isIntegratedDevice) { + std::string resetPath = {}; + switch (resetType) { + case ZES_RESET_TYPE_WARM: result = pLinuxSysmanImp->osWarmReset(); - if (ZE_RESULT_SUCCESS == result) { - return pLinuxSysmanImp->reInitSysmanDeviceResources(); - } - return result; + break; + case ZES_RESET_TYPE_COLD: + result = pLinuxSysmanImp->osColdReset(); + break; + case ZES_RESET_TYPE_FLR: + pSysfsAccess->getRealPath(functionLevelReset, resetPath); + result = pFsAccess->write(resetPath, "1"); + break; + default: + return ZE_RESULT_ERROR_INVALID_ARGUMENT; } - pSysfsAccess->getRealPath(functionLevelReset, resetPath); - - // Reset the device. - result = pFsAccess->write(resetPath, "1"); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to reset the device and returning error:0x%x \n", __FUNCTION__, result); return result; diff --git a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.h b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.h index a45733e1fa..3519d79aeb 100644 --- a/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.h +++ b/level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.h @@ -33,6 +33,7 @@ class LinuxGlobalOperationsImp : public OsGlobalOperations, NEO::NonCopyableOrMo ze_result_t reset(ze_bool_t force) override; ze_result_t scanProcessesState(std::vector &pProcessList) override; ze_result_t deviceGetState(zes_device_state_t *pState) override; + ze_result_t resetExt(zes_reset_properties_t *pProperties) override; bool getUuid(std::array &uuid) override; bool generateUuidFromPciBusInfo(const NEO::PhysicalDevicePciBusInfo &pciBusInfo, std::array &uuid) override; LinuxGlobalOperationsImp() = default; @@ -82,6 +83,7 @@ class LinuxGlobalOperationsImp : public OsGlobalOperations, NEO::NonCopyableOrMo uint32_t rootDeviceIndex = 0u; ze_result_t getListOfEnginesUsedByProcess(std::vector &fdFileContents, uint32_t &activeEngines); ze_result_t getMemoryStatsUsedByProcess(std::vector &fdFileContents, uint64_t &memSize, uint64_t &sharedSize); + ze_result_t resetImpl(ze_bool_t force, zes_reset_type_t resetType); }; } // namespace Sysman diff --git a/level_zero/sysman/source/api/global_operations/sysman_global_operations.h b/level_zero/sysman/source/api/global_operations/sysman_global_operations.h index f09be617dc..d6424975fc 100644 --- a/level_zero/sysman/source/api/global_operations/sysman_global_operations.h +++ b/level_zero/sysman/source/api/global_operations/sysman_global_operations.h @@ -18,6 +18,7 @@ class GlobalOperations { virtual ze_result_t deviceGetProperties(zes_device_properties_t *pProperties) = 0; virtual ze_result_t processesGetState(uint32_t *pCount, zes_process_state_t *pProcesses) = 0; virtual ze_result_t deviceGetState(zes_device_state_t *pState) = 0; + virtual ze_result_t resetExt(zes_reset_properties_t *pProperties) = 0; virtual void init() = 0; }; diff --git a/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.cpp b/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.cpp index 5cb7aa8af8..1db63be08b 100644 --- a/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.cpp +++ b/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.cpp @@ -98,6 +98,11 @@ ze_result_t GlobalOperationsImp::reset(ze_bool_t force) { return pOsGlobalOperations->reset(force); } +ze_result_t GlobalOperationsImp::resetExt(zes_reset_properties_t *pProperties) { + initGlobalOperations(); + return pOsGlobalOperations->resetExt(pProperties); +} + ze_result_t GlobalOperationsImp::deviceGetState(zes_device_state_t *pState) { initGlobalOperations(); return pOsGlobalOperations->deviceGetState(pState); diff --git a/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.h b/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.h index 9489025756..644b56add7 100644 --- a/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.h +++ b/level_zero/sysman/source/api/global_operations/sysman_global_operations_imp.h @@ -23,6 +23,7 @@ class GlobalOperationsImp : public GlobalOperations, NEO::NonCopyableOrMovableCl ze_result_t deviceGetProperties(zes_device_properties_t *pProperties) override; ze_result_t processesGetState(uint32_t *pCount, zes_process_state_t *pProcesses) override; ze_result_t deviceGetState(zes_device_state_t *pState) override; + ze_result_t resetExt(zes_reset_properties_t *pProperties) override; OsGlobalOperations *pOsGlobalOperations = nullptr; GlobalOperationsImp() = default; diff --git a/level_zero/sysman/source/api/global_operations/sysman_os_global_operations.h b/level_zero/sysman/source/api/global_operations/sysman_os_global_operations.h index 12583af891..749602ad52 100644 --- a/level_zero/sysman/source/api/global_operations/sysman_os_global_operations.h +++ b/level_zero/sysman/source/api/global_operations/sysman_os_global_operations.h @@ -35,6 +35,7 @@ class OsGlobalOperations { virtual ze_result_t reset(ze_bool_t force) = 0; virtual ze_result_t scanProcessesState(std::vector &pProcessList) = 0; virtual ze_result_t deviceGetState(zes_device_state_t *pState) = 0; + virtual ze_result_t resetExt(zes_reset_properties_t *pProperties) = 0; static OsGlobalOperations *create(OsSysman *pOsSysman); virtual ~OsGlobalOperations() {} }; diff --git a/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.cpp b/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.cpp index ec569c8d94..f71aed1ef0 100644 --- a/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.cpp +++ b/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.cpp @@ -53,6 +53,10 @@ ze_result_t WddmGlobalOperationsImp::reset(ze_bool_t force) { return pKmdSysManager->requestSingle(request, response); } +ze_result_t WddmGlobalOperationsImp::resetExt(zes_reset_properties_t *pProperties) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + ze_result_t WddmGlobalOperationsImp::scanProcessesState(std::vector &pProcessList) { return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.h b/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.h index cef7b59fc9..a6ba0b7990 100644 --- a/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.h +++ b/level_zero/sysman/source/api/global_operations/windows/sysman_os_global_operations_imp.h @@ -28,6 +28,7 @@ class WddmGlobalOperationsImp : public OsGlobalOperations, NEO::NonCopyableOrMov ze_result_t reset(ze_bool_t force) override; ze_result_t scanProcessesState(std::vector &pProcessList) override; ze_result_t deviceGetState(zes_device_state_t *pState) override; + ze_result_t resetExt(zes_reset_properties_t *pProperties) override; bool getUuid(std::array &uuid) override; bool generateUuidFromPciBusInfo(const NEO::PhysicalDevicePciBusInfo &pciBusInfo, std::array &uuid) override; diff --git a/level_zero/sysman/source/api/scheduler/linux/sysman_os_scheduler_imp_prelim.cpp b/level_zero/sysman/source/api/scheduler/linux/sysman_os_scheduler_imp_prelim.cpp index 3b4fd997ec..e4eaecd4f4 100644 --- a/level_zero/sysman/source/api/scheduler/linux/sysman_os_scheduler_imp_prelim.cpp +++ b/level_zero/sysman/source/api/scheduler/linux/sysman_os_scheduler_imp_prelim.cpp @@ -428,7 +428,7 @@ ze_result_t LinuxSchedulerImp::updateComputeUnitDebugNode(uint64_t val) { // I915 will be reloaded if we toggle value of enableEuDebug // Hence for gracefull handling close all i915 clients before toggling enableEuDebug pLinuxSysmanImp->releaseSysmanDeviceResources(); - ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(); + ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(true); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "updateComputeUnitDebugNode: gpuProcessCleanup() failed with error code: %ld\n", result); diff --git a/level_zero/sysman/source/device/sysman_device.cpp b/level_zero/sysman/source/device/sysman_device.cpp index 277f19c6a9..1ae428e243 100644 --- a/level_zero/sysman/source/device/sysman_device.cpp +++ b/level_zero/sysman/source/device/sysman_device.cpp @@ -165,7 +165,8 @@ uint64_t SysmanDevice::getSysmanTimestamp() { } ze_result_t SysmanDevice::deviceResetExt(zes_device_handle_t hDevice, zes_reset_properties_t *pProperties) { - return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto pSysmanDevice = L0::Sysman::SysmanDevice::fromHandle(hDevice); + return pSysmanDevice->deviceResetExt(pProperties); } ze_result_t SysmanDevice::fabricPortGetMultiPortThroughput(zes_device_handle_t hDevice, uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput) { diff --git a/level_zero/sysman/source/device/sysman_device.h b/level_zero/sysman/source/device/sysman_device.h index a07027fc96..a68c0e722c 100644 --- a/level_zero/sysman/source/device/sysman_device.h +++ b/level_zero/sysman/source/device/sysman_device.h @@ -123,9 +123,11 @@ struct SysmanDevice : _ze_device_handle_t { virtual bool deviceEventListen(zes_event_type_flags_t &pEvent, uint64_t timeout) = 0; static uint64_t getSysmanTimestamp(); + virtual ze_result_t deviceResetExt(zes_reset_properties_t *pProperties) = 0; static ze_result_t deviceResetExt(zes_device_handle_t hDevice, zes_reset_properties_t *pProperties); - static ze_result_t fabricPortGetMultiPortThroughput(zes_device_handle_t hDevice, uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput); + virtual ze_result_t fabricPortGetMultiPortThroughput(uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput) = 0; + static ze_result_t fabricPortGetMultiPortThroughput(zes_device_handle_t hDevice, uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput); virtual OsSysman *deviceGetOsInterface() = 0; }; diff --git a/level_zero/sysman/source/device/sysman_device_imp.cpp b/level_zero/sysman/source/device/sysman_device_imp.cpp index b8e5a2f76d..ce10219313 100644 --- a/level_zero/sysman/source/device/sysman_device_imp.cpp +++ b/level_zero/sysman/source/device/sysman_device_imp.cpp @@ -84,6 +84,10 @@ ze_result_t SysmanDeviceImp::deviceReset(ze_bool_t force) { return pGlobalOperations->reset(force); } +ze_result_t SysmanDeviceImp::deviceResetExt(zes_reset_properties_t *pProperties) { + return pGlobalOperations->resetExt(pProperties); +} + ze_result_t SysmanDeviceImp::deviceGetState(zes_device_state_t *pState) { return pGlobalOperations->deviceGetState(pState); } diff --git a/level_zero/sysman/source/device/sysman_device_imp.h b/level_zero/sysman/source/device/sysman_device_imp.h index 68bcafe03b..767e162b24 100644 --- a/level_zero/sysman/source/device/sysman_device_imp.h +++ b/level_zero/sysman/source/device/sysman_device_imp.h @@ -85,6 +85,7 @@ struct SysmanDeviceImp : SysmanDevice, NEO::NonCopyableOrMovableClass { ze_result_t pciGetStats(zes_pci_stats_t *pStats) override; ze_result_t fanGet(uint32_t *pCount, zes_fan_handle_t *phFan) override; ze_result_t deviceEventRegister(zes_event_type_flags_t events) override; + ze_result_t deviceResetExt(zes_reset_properties_t *pProperties) override; bool deviceEventListen(zes_event_type_flags_t &pEvent, uint64_t timeout) override; ze_result_t fabricPortGetMultiPortThroughput(uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput) override; diff --git a/level_zero/sysman/source/linux/zes_os_sysman_imp.cpp b/level_zero/sysman/source/linux/zes_os_sysman_imp.cpp index 24094ca15e..f343e0a88b 100644 --- a/level_zero/sysman/source/linux/zes_os_sysman_imp.cpp +++ b/level_zero/sysman/source/linux/zes_os_sysman_imp.cpp @@ -261,7 +261,7 @@ void LinuxSysmanImp::getPidFdsForOpenDevice(const ::pid_t pid, std::vector } } -ze_result_t LinuxSysmanImp::gpuProcessCleanup() { +ze_result_t LinuxSysmanImp::gpuProcessCleanup(ze_bool_t force) { ::pid_t myPid = pProcfsAccess->myProcessId(); std::vector<::pid_t> processes; std::vector myPidFds; @@ -282,7 +282,12 @@ ze_result_t LinuxSysmanImp::gpuProcessCleanup() { continue; } if (!fds.empty()) { - pProcfsAccess->kill(pid); + if (force) { + pProcfsAccess->kill(pid); + } else { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Device in use by another process, returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); + return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE; + } } } diff --git a/level_zero/sysman/source/linux/zes_os_sysman_imp.h b/level_zero/sysman/source/linux/zes_os_sysman_imp.h index 5b5b651fe0..d165b730a0 100644 --- a/level_zero/sysman/source/linux/zes_os_sysman_imp.h +++ b/level_zero/sysman/source/linux/zes_os_sysman_imp.h @@ -64,7 +64,7 @@ class LinuxSysmanImp : public OsSysman, NEO::NonCopyableOrMovableClass { MOCKABLE_VIRTUAL void getPidFdsForOpenDevice(const ::pid_t, std::vector &); MOCKABLE_VIRTUAL ze_result_t osWarmReset(); MOCKABLE_VIRTUAL ze_result_t osColdReset(); - ze_result_t gpuProcessCleanup(); + ze_result_t gpuProcessCleanup(ze_bool_t force); std::string getAddressFromPath(std::string &rootPortPath); decltype(&NEO::SysCalls::pread) preadFunction = NEO::SysCalls::pread; decltype(&NEO::SysCalls::pwrite) pwriteFunction = NEO::SysCalls::pwrite; diff --git a/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/mock_zes_sysman_diagnostics.h b/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/mock_zes_sysman_diagnostics.h index b83c1b3763..637a9a9bbc 100644 --- a/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/mock_zes_sysman_diagnostics.h +++ b/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/mock_zes_sysman_diagnostics.h @@ -253,7 +253,6 @@ struct MockDiagLinuxSysmanImp : public L0::Sysman::LinuxSysmanImp { }; class PublicLinuxDiagnosticsImp : public L0::Sysman::LinuxDiagnosticsImp { public: - using L0::Sysman::LinuxDiagnosticsImp::gpuProcessCleanup; using L0::Sysman::LinuxDiagnosticsImp::pFwInterface; using L0::Sysman::LinuxDiagnosticsImp::pLinuxSysmanImp; using L0::Sysman::LinuxDiagnosticsImp::pSysfsAccess; diff --git a/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/test_zes_sysman_diagnostics.cpp b/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/test_zes_sysman_diagnostics.cpp index 219b4f7a9b..f404a27317 100644 --- a/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/test_zes_sysman_diagnostics.cpp +++ b/level_zero/sysman/test/unit_tests/sources/diagnostics/linux/test_zes_sysman_diagnostics.cpp @@ -487,7 +487,7 @@ TEST_F(ZesDiagnosticsFixture, GivenValidDiagnosticsHandleWhenGPUProcessCleanupSu pMockDiagProcfsAccess->ourDevicePid = getpid(); pMockDiagLinuxSysmanImp->ourDevicePid = getpid(); pMockDiagLinuxSysmanImp->ourDeviceFd = NEO::SysCalls::open("/dev/null", 0); - EXPECT_EQ(ZE_RESULT_SUCCESS, pPublicLinuxDiagnosticsImp->pLinuxSysmanImp->gpuProcessCleanup()); + EXPECT_EQ(ZE_RESULT_SUCCESS, pPublicLinuxDiagnosticsImp->pLinuxSysmanImp->gpuProcessCleanup(true)); } TEST_F(ZesDiagnosticsFixture, GivenValidDiagnosticsHandleWhenGPUProcessCleanupFailsThenWaitForQuiescentCompletionsFails) { diff --git a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h index ca5babb70a..de0bb3b3d5 100644 --- a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h +++ b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h @@ -61,6 +61,10 @@ const std::string ueventWedgedFile("/var/lib/libze_intel_gpu/wedged_file"); const std::string mockFunctionResetPath("/MOCK_FUNCTION_LEVEL_RESET_PATH"); const std::string mockDeviceDir("devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0"); const std::string mockDeviceName("/MOCK_DEVICE_NAME"); +const std::string mockSlotPath("/sys/bus/pci/slots/1"); +const std::string mockSlotPathAddress("/sys/bus/pci/slots/1/address"); +const std::string mockRootAddress("devices"); +const std::string mockCardBusPath("/sys/devices"); enum mockEnumListProcessCall { DEVICE_IN_USE = 0, @@ -464,11 +468,19 @@ struct MockGlobalOperationsFsAccess : public L0::Sysman::FsAccess { ze_result_t mockReadError = ZE_RESULT_SUCCESS; ze_result_t readResult = ZE_RESULT_ERROR_NOT_AVAILABLE; std::string mockReadVal = ""; + std::string mockFlrValue = ""; + std::string mockColdResetValue = "unknown"; + std::string mockWarmResetValue = "unknown"; ze_result_t read(const std::string file, std::string &val) override { if (mockReadError != ZE_RESULT_SUCCESS) { return mockReadError; } + if (file.compare(mockSlotPathAddress) == 0) { + val = mockRootAddress; + return ZE_RESULT_SUCCESS; + } + if (mockReadVal == srcVersion) { if (file.compare(srcVersionFile) == 0) { val = mockReadVal; @@ -538,9 +550,25 @@ struct MockGlobalOperationsFsAccess : public L0::Sysman::FsAccess { return mockWriteError; } + if (file.compare(mockFunctionResetPath) == 0) { + mockFlrValue = val; + } else if (file.compare(mockSlotPath + "/power") == 0) { + if (val.compare("0") == 0) { + mockColdResetValue = val; + } else if ((val.compare("1") == 0) && mockColdResetValue.compare("0") == 0) { + mockColdResetValue = val; + } + } else if (file.compare(mockCardBusPath + "/remove") == 0) { + mockWarmResetValue = val; + } return writeResult; } + ze_result_t listDirectory(const std::string directory, std::vector &listOfslots) override { + listOfslots.push_back("1"); + return ZE_RESULT_SUCCESS; + } + ADDMETHOD_NOBASE(canWrite, ze_result_t, ZE_RESULT_SUCCESS, (const std::string file)); MockGlobalOperationsFsAccess() = default; }; diff --git a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp index 5bc01aa153..98cff87a38 100644 --- a/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp +++ b/level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp @@ -752,9 +752,33 @@ TEST_F(SysmanGlobalOperationsFixture, GivenGemCreateIoctlFailsWithEINVALWhenCall EXPECT_EQ(0u, deviceState.reset); } -TEST_F(SysmanGlobalOperationsFixture, GivenValidDeviceHandleWhenCallingzesDeviceResetExtThenUnsupportedFeatureErrorIsReturned) { - ze_result_t result = zesDeviceResetExt(device, nullptr); - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result); +TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetExtThenResetExtCallReturnSuccess) { + init(true); + DebugManagerStateRestore dbgRestore; + DebugManager.flags.VfBarResourceAllocationWa.set(false); + zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_WARM}; + ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(pFsAccess->mockWarmResetValue, "1"); + + pProperties.resetType = ZES_RESET_TYPE_COLD; + result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(pFsAccess->mockColdResetValue, "1"); + + pProperties.resetType = ZES_RESET_TYPE_FLR; + result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(pFsAccess->mockFlrValue, "1"); +} + +TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) { + init(true); + DebugManagerStateRestore dbgRestore; + DebugManager.flags.VfBarResourceAllocationWa.set(false); + zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_FORCE_UINT32}; + ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties); + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result); } TEST_F(SysmanGlobalOperationsFixture, GivenForceTrueWhenCallingResetThenSuccessIsReturned) { diff --git a/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.cpp b/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.cpp index 62e3631b7c..713ea7b2a1 100644 --- a/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.cpp +++ b/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.cpp @@ -50,7 +50,7 @@ ze_result_t LinuxDiagnosticsImp::waitForQuiescentCompletion() { if (ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE == result) { count++; NEO::sleep(std::chrono::seconds(1)); // Sleep for 1second every loop, gives enough time for KMD to clear all allocations and wedge the system - auto processResult = pLinuxSysmanImp->gpuProcessCleanup(); + auto processResult = pLinuxSysmanImp->gpuProcessCleanup(true); if (ZE_RESULT_SUCCESS != processResult) { return processResult; } @@ -75,7 +75,7 @@ ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW(zes_diag_result_t *pResult) NEO::ExecutionEnvironment *executionEnvironment = devicePtr->getNEODevice()->getExecutionEnvironment(); auto restorer = std::make_unique(executionEnvironment); pLinuxSysmanImp->releaseDeviceResources(); - ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(); + ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(true); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): gpuProcessCleanup() failed and returning error:0x%x \n", __FUNCTION__, result); return result; diff --git a/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h b/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h index 499ac0f467..7e54185a2c 100644 --- a/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h +++ b/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h @@ -31,7 +31,6 @@ class LinuxDiagnosticsImp : public OsDiagnostics, NEO::NonCopyableOrMovableClass FirmwareUtil *pFwInterface = nullptr; SysfsAccess *pSysfsAccess = nullptr; FsAccess *pFsAccess = nullptr; - ze_result_t gpuProcessCleanup(); ze_result_t waitForQuiescentCompletion(); private: diff --git a/level_zero/tools/source/sysman/global_operations/global_operations.h b/level_zero/tools/source/sysman/global_operations/global_operations.h index e686c34fa1..c02487771d 100644 --- a/level_zero/tools/source/sysman/global_operations/global_operations.h +++ b/level_zero/tools/source/sysman/global_operations/global_operations.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,6 +17,7 @@ class GlobalOperations { virtual ze_result_t deviceGetProperties(zes_device_properties_t *pProperties) = 0; virtual ze_result_t processesGetState(uint32_t *pCount, zes_process_state_t *pProcesses) = 0; virtual ze_result_t deviceGetState(zes_device_state_t *pState) = 0; + virtual ze_result_t resetExt(zes_reset_properties_t *pProperties) = 0; virtual void init() = 0; }; diff --git a/level_zero/tools/source/sysman/global_operations/global_operations_imp.cpp b/level_zero/tools/source/sysman/global_operations/global_operations_imp.cpp index 285feb4b10..25fafc5490 100644 --- a/level_zero/tools/source/sysman/global_operations/global_operations_imp.cpp +++ b/level_zero/tools/source/sysman/global_operations/global_operations_imp.cpp @@ -60,6 +60,11 @@ ze_result_t GlobalOperationsImp::reset(ze_bool_t force) { return pOsGlobalOperations->reset(force); } +ze_result_t GlobalOperationsImp::resetExt(zes_reset_properties_t *pProperties) { + initGlobalOperations(); + return pOsGlobalOperations->resetExt(pProperties); +} + ze_result_t GlobalOperationsImp::deviceGetState(zes_device_state_t *pState) { initGlobalOperations(); return pOsGlobalOperations->deviceGetState(pState); diff --git a/level_zero/tools/source/sysman/global_operations/global_operations_imp.h b/level_zero/tools/source/sysman/global_operations/global_operations_imp.h index 479617b340..f3ac8e73e7 100644 --- a/level_zero/tools/source/sysman/global_operations/global_operations_imp.h +++ b/level_zero/tools/source/sysman/global_operations/global_operations_imp.h @@ -22,6 +22,7 @@ class GlobalOperationsImp : public GlobalOperations, NEO::NonCopyableOrMovableCl ze_result_t deviceGetProperties(zes_device_properties_t *pProperties) override; ze_result_t processesGetState(uint32_t *pCount, zes_process_state_t *pProcesses) override; ze_result_t deviceGetState(zes_device_state_t *pState) override; + ze_result_t resetExt(zes_reset_properties_t *pProperties) override; OsGlobalOperations *pOsGlobalOperations = nullptr; GlobalOperationsImp() = default; diff --git a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp index dee4aa76b7..6fdab74126 100644 --- a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp +++ b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp @@ -194,86 +194,46 @@ void LinuxGlobalOperationsImp::getDriverVersion(char (&driverVersion)[ZES_STRING } ze_result_t LinuxGlobalOperationsImp::reset(ze_bool_t force) { + ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + pDevice->getProperties(&deviceProperties); + auto resetType = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED ? ZES_RESET_TYPE_FLR : ZES_RESET_TYPE_WARM; + + return resetImpl(force, resetType); +} + +ze_result_t LinuxGlobalOperationsImp::resetExt(zes_reset_properties_t *pProperties) { + return resetImpl(pProperties->force, pProperties->resetType); +} + +ze_result_t LinuxGlobalOperationsImp::resetImpl(ze_bool_t force, zes_reset_type_t resetType) { if (!pSysfsAccess->isRootUser()) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Not running as root user and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; } - ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; - pDevice->getProperties(&deviceProperties); + auto devicePtr = static_cast(pDevice); NEO::ExecutionEnvironment *executionEnvironment = devicePtr->getNEODevice()->getExecutionEnvironment(); auto restorer = std::make_unique(executionEnvironment); pLinuxSysmanImp->releaseDeviceResources(); - std::string resetPath; - std::string resetName; - ze_result_t result = ZE_RESULT_SUCCESS; - ::pid_t myPid = pProcfsAccess->myProcessId(); - std::vector myPidFds; - std::vector<::pid_t> processes; - - result = pProcfsAccess->listProcesses(processes); + ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(force); if (ZE_RESULT_SUCCESS != result) { - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Unable to list processes and returning error:0x%x \n", __FUNCTION__, result); + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): gpuProcessCleanup() failed and returning error:0x%x \n", __FUNCTION__, result); return result; } - for (auto &&pid : processes) { - std::vector fds; - pLinuxSysmanImp->getPidFdsForOpenDevice(pProcfsAccess, pSysfsAccess, pid, fds); - if (pid == myPid) { - // L0 is expected to have this file open. - // Keep list of fds. Close before unbind. - myPidFds = fds; - } else if (!fds.empty()) { - if (force) { - pProcfsAccess->kill(pid); - } else { - // Device is in use by another process. - // Don't reset while in use. - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Device in use by another process, not resetting and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); - return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE; - } - } - } + std::string resetName; pSysfsAccess->getRealPath(deviceDir, resetName); resetName = pFsAccess->getBaseName(resetName); - if (!(deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)) { - result = pSysfsAccess->unbindDevice(resetName); - if (ZE_RESULT_SUCCESS != result) { - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to unbind device and returning error:0x%x \n", __FUNCTION__, result); - return result; - } - result = pLinuxSysmanImp->osWarmReset(); - if (ZE_RESULT_SUCCESS == result) { - return pLinuxSysmanImp->initDevice(); - } - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Error during warm reset \n", __FUNCTION__); - return result; - } - - pSysfsAccess->getRealPath(functionLevelReset, resetPath); - - for (auto &&fd : myPidFds) { - // Close open filedescriptors to the device - // before unbinding device. - // From this point forward, there is no - // graceful way to fail the reset call. - // All future ze calls by this process for this - // device will fail. - ::close(fd); - } - // Unbind the device from the kernel driver. result = pSysfsAccess->unbindDevice(resetName); if (ZE_RESULT_SUCCESS != result) { - NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to unbind device and returning error:0x%x \n", __FUNCTION__, result); + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to unbind device:%s and returning error:0x%x \n", __FUNCTION__, resetName.c_str(), result); return result; } - // If someone opened the device - // after we check, kill them here. + std::vector<::pid_t> processes; result = pProcfsAccess->listProcesses(processes); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to list processes and returning error:0x%x \n", __FUNCTION__, result); @@ -308,14 +268,27 @@ ze_result_t LinuxGlobalOperationsImp::reset(ze_bool_t force) { } } - // Reset the device. - result = pFsAccess->write(resetPath, "1"); + std::string resetPath = {}; + switch (resetType) { + case ZES_RESET_TYPE_WARM: + result = pLinuxSysmanImp->osWarmReset(); + break; + case ZES_RESET_TYPE_COLD: + result = pLinuxSysmanImp->osColdReset(); + break; + case ZES_RESET_TYPE_FLR: + pSysfsAccess->getRealPath(functionLevelReset, resetPath); + result = pFsAccess->write(resetPath, "1"); + break; + default: + return ZE_RESULT_ERROR_INVALID_ARGUMENT; + } + if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to reset the device and returning error:0x%x \n", __FUNCTION__, result); return result; } - // Rebind the device to the kernel driver. result = pSysfsAccess->bindDevice(resetName); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to bind the device to the kernel driver and returning error:0x%x \n", __FUNCTION__, result); diff --git a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h index b1249384a1..5e4a1298ce 100644 --- a/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h +++ b/level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ class LinuxGlobalOperationsImp : public OsGlobalOperations, NEO::NonCopyableOrMo ze_result_t reset(ze_bool_t force) override; ze_result_t scanProcessesState(std::vector &pProcessList) override; ze_result_t deviceGetState(zes_device_state_t *pState) override; + ze_result_t resetExt(zes_reset_properties_t *pProperties) override; LinuxGlobalOperationsImp() = default; LinuxGlobalOperationsImp(OsSysman *pOsSysman); ~LinuxGlobalOperationsImp() override = default; @@ -58,6 +59,7 @@ class LinuxGlobalOperationsImp : public OsGlobalOperations, NEO::NonCopyableOrMo std::string devicePciBdf = ""; NEO::ExecutionEnvironment *executionEnvironment = nullptr; uint32_t rootDeviceIndex = 0u; + ze_result_t resetImpl(ze_bool_t force, zes_reset_type_t resetType); }; } // namespace L0 diff --git a/level_zero/tools/source/sysman/global_operations/os_global_operations.h b/level_zero/tools/source/sysman/global_operations/os_global_operations.h index d1fdf5cd2a..d4be9588cb 100644 --- a/level_zero/tools/source/sysman/global_operations/os_global_operations.h +++ b/level_zero/tools/source/sysman/global_operations/os_global_operations.h @@ -29,6 +29,7 @@ class OsGlobalOperations { virtual ze_result_t reset(ze_bool_t force) = 0; virtual ze_result_t scanProcessesState(std::vector &pProcessList) = 0; virtual ze_result_t deviceGetState(zes_device_state_t *pState) = 0; + virtual ze_result_t resetExt(zes_reset_properties_t *pProperties) = 0; static OsGlobalOperations *create(OsSysman *pOsSysman); virtual ~OsGlobalOperations() {} }; diff --git a/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.cpp b/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.cpp index 35f7aa98d3..a69685cad1 100644 --- a/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.cpp +++ b/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.cpp @@ -60,6 +60,10 @@ ze_result_t WddmGlobalOperationsImp::deviceGetState(zes_device_state_t *pState) return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ze_result_t WddmGlobalOperationsImp::resetExt(zes_reset_properties_t *pProperties) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + WddmGlobalOperationsImp::WddmGlobalOperationsImp(OsSysman *pOsSysman) { WddmSysmanImp *pWddmSysmanImp = static_cast(pOsSysman); pDevice = pWddmSysmanImp->getDeviceHandle(); diff --git a/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.h b/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.h index 8d11cf69e5..d7c1a81c47 100644 --- a/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.h +++ b/level_zero/tools/source/sysman/global_operations/windows/os_global_operations_imp.h @@ -28,6 +28,7 @@ class WddmGlobalOperationsImp : public OsGlobalOperations, NEO::NonCopyableOrMov ze_result_t reset(ze_bool_t force) override; ze_result_t scanProcessesState(std::vector &pProcessList) override; ze_result_t deviceGetState(zes_device_state_t *pState) override; + ze_result_t resetExt(zes_reset_properties_t *pProperties) override; WddmGlobalOperationsImp(OsSysman *pOsSysman); WddmGlobalOperationsImp(const WddmGlobalOperationsImp &obj) = delete; diff --git a/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp b/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp index 8484f30d79..c39ffb2cf0 100644 --- a/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp +++ b/level_zero/tools/source/sysman/linux/os_sysman_imp.cpp @@ -254,7 +254,7 @@ void LinuxSysmanImp::getPidFdsForOpenDevice(ProcfsAccess *pProcfsAccess, SysfsAc } } -ze_result_t LinuxSysmanImp::gpuProcessCleanup() { +ze_result_t LinuxSysmanImp::gpuProcessCleanup(ze_bool_t force) { ::pid_t myPid = pProcfsAccess->myProcessId(); std::vector<::pid_t> processes; std::vector myPidFds; @@ -275,7 +275,12 @@ ze_result_t LinuxSysmanImp::gpuProcessCleanup() { continue; } if (!fds.empty()) { - pProcfsAccess->kill(pid); + if (force) { + pProcfsAccess->kill(pid); + } else { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Device in use by another process, returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); + return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE; + } } } diff --git a/level_zero/tools/source/sysman/linux/os_sysman_imp.h b/level_zero/tools/source/sysman/linux/os_sysman_imp.h index 4c0fa6bc43..2d028fd789 100644 --- a/level_zero/tools/source/sysman/linux/os_sysman_imp.h +++ b/level_zero/tools/source/sysman/linux/os_sysman_imp.h @@ -73,7 +73,7 @@ class LinuxSysmanImp : public OsSysman, NEO::NonCopyableOrMovableClass { MOCKABLE_VIRTUAL void getPidFdsForOpenDevice(ProcfsAccess *, SysfsAccess *, const ::pid_t, std::vector &); MOCKABLE_VIRTUAL ze_result_t osWarmReset(); MOCKABLE_VIRTUAL ze_result_t osColdReset(); - ze_result_t gpuProcessCleanup(); + MOCKABLE_VIRTUAL ze_result_t gpuProcessCleanup(ze_bool_t force); std::string getAddressFromPath(std::string &rootPortPath); decltype(&NEO::SysCalls::pread) preadFunction = NEO::SysCalls::pread; decltype(&NEO::SysCalls::pwrite) pwriteFunction = NEO::SysCalls::pwrite; diff --git a/level_zero/tools/source/sysman/scheduler/linux/os_scheduler_imp_prelim.cpp b/level_zero/tools/source/sysman/scheduler/linux/os_scheduler_imp_prelim.cpp index 9b441d428d..b6e6f3c430 100644 --- a/level_zero/tools/source/sysman/scheduler/linux/os_scheduler_imp_prelim.cpp +++ b/level_zero/tools/source/sysman/scheduler/linux/os_scheduler_imp_prelim.cpp @@ -430,7 +430,7 @@ ze_result_t LinuxSchedulerImp::updateComputeUnitDebugNode(uint64_t val) { NEO::ExecutionEnvironment *executionEnvironment = devicePtr->getNEODevice()->getExecutionEnvironment(); auto restorer = std::make_unique(executionEnvironment); pLinuxSysmanImp->releaseDeviceResources(); - ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(); + ze_result_t result = pLinuxSysmanImp->gpuProcessCleanup(true); if (ZE_RESULT_SUCCESS != result) { NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "updateComputeUnitDebugNode: gpuProcessCleanup() failed with error code: %ld\n", result); diff --git a/level_zero/tools/source/sysman/sysman.cpp b/level_zero/tools/source/sysman/sysman.cpp index 0469cd74cb..ce2a32b32b 100644 --- a/level_zero/tools/source/sysman/sysman.cpp +++ b/level_zero/tools/source/sysman/sysman.cpp @@ -313,7 +313,8 @@ uint64_t SysmanDevice::getSysmanTimestamp() { } ze_result_t SysmanDevice::deviceResetExt(zes_device_handle_t hDevice, zes_reset_properties_t *pProperties) { - return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto pSysmanDevice = L0::SysmanDevice::fromHandle(hDevice); + return pSysmanDevice->deviceResetExt(pProperties); } ze_result_t SysmanDevice::fabricPortGetMultiPortThroughput(zes_device_handle_t hDevice, uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput) { diff --git a/level_zero/tools/source/sysman/sysman.h b/level_zero/tools/source/sysman/sysman.h index b7fd06acbd..137b3f497e 100644 --- a/level_zero/tools/source/sysman/sysman.h +++ b/level_zero/tools/source/sysman/sysman.h @@ -93,6 +93,7 @@ struct SysmanDevice : _ze_device_handle_t { virtual ze_result_t deviceSetEccState(const zes_device_ecc_desc_t *newState, zes_device_ecc_properties_t *pState) = 0; virtual ze_result_t fabricPortGetMultiPortThroughput(uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput) = 0; virtual bool deviceEventListen(zes_event_type_flags_t &pEvent, uint64_t timeout) = 0; + virtual ze_result_t deviceResetExt(zes_reset_properties_t *pProperties) = 0; virtual OsSysman *deviceGetOsInterface() = 0; virtual ~SysmanDevice() = default; }; diff --git a/level_zero/tools/source/sysman/sysman_imp.cpp b/level_zero/tools/source/sysman/sysman_imp.cpp index 3a26e0bd1f..7bb318b0b5 100644 --- a/level_zero/tools/source/sysman/sysman_imp.cpp +++ b/level_zero/tools/source/sysman/sysman_imp.cpp @@ -137,6 +137,10 @@ bool SysmanDeviceImp::deviceEventListen(zes_event_type_flags_t &pEvent, uint64_t return pEvents->eventListen(pEvent, timeout); } +ze_result_t SysmanDeviceImp::deviceResetExt(zes_reset_properties_t *pProperties) { + return pGlobalOperations->resetExt(pProperties); +} + ze_result_t SysmanDeviceImp::deviceGetState(zes_device_state_t *pState) { return pGlobalOperations->deviceGetState(pState); } diff --git a/level_zero/tools/source/sysman/sysman_imp.h b/level_zero/tools/source/sysman/sysman_imp.h index 3ab7625a72..77d4e759e0 100644 --- a/level_zero/tools/source/sysman/sysman_imp.h +++ b/level_zero/tools/source/sysman/sysman_imp.h @@ -72,6 +72,7 @@ struct SysmanDeviceImp : SysmanDevice, NEO::NonCopyableOrMovableClass { ze_result_t deviceEccConfigurable(ze_bool_t *pConfigurable) override; ze_result_t deviceGetEccState(zes_device_ecc_properties_t *pState) override; ze_result_t deviceSetEccState(const zes_device_ecc_desc_t *newState, zes_device_ecc_properties_t *pState) override; + ze_result_t deviceResetExt(zes_reset_properties_t *pProperties) override; bool deviceEventListen(zes_event_type_flags_t &pEvent, uint64_t timeout) override; ze_result_t fabricPortGetMultiPortThroughput(uint32_t numPorts, zes_fabric_port_handle_t *phPort, zes_fabric_port_throughput_t **pThroughput) override; diff --git a/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/mock_zes_sysman_diagnostics.h b/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/mock_zes_sysman_diagnostics.h index 45c7d3dcc6..e554831b45 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/mock_zes_sysman_diagnostics.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/mock_zes_sysman_diagnostics.h @@ -253,7 +253,6 @@ struct MockDiagLinuxSysmanImp : public LinuxSysmanImp { }; class PublicLinuxDiagnosticsImp : public L0::LinuxDiagnosticsImp { public: - using LinuxDiagnosticsImp::gpuProcessCleanup; using LinuxDiagnosticsImp::pFwInterface; using LinuxDiagnosticsImp::pLinuxSysmanImp; using LinuxDiagnosticsImp::pSysfsAccess; diff --git a/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/test_zes_sysman_diagnostics.cpp b/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/test_zes_sysman_diagnostics.cpp index 5cc576f1e9..c9fd96553c 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/test_zes_sysman_diagnostics.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/test_zes_sysman_diagnostics.cpp @@ -518,7 +518,7 @@ TEST_F(ZesDiagnosticsFixture, GivenValidDiagnosticsHandleWhenGPUProcessCleanupSu pMockDiagProcfsAccess->ourDevicePid = getpid(); pMockDiagLinuxSysmanImp->ourDevicePid = getpid(); pMockDiagLinuxSysmanImp->ourDeviceFd = ::open("/dev/null", 0); - EXPECT_EQ(ZE_RESULT_SUCCESS, pPublicLinuxDiagnosticsImp->pLinuxSysmanImp->gpuProcessCleanup()); + EXPECT_EQ(ZE_RESULT_SUCCESS, pPublicLinuxDiagnosticsImp->pLinuxSysmanImp->gpuProcessCleanup(true)); } TEST_F(ZesDiagnosticsFixture, GivenValidDiagnosticsHandleWhenGPUProcessCleanupFailsThenWaitForQuiescentCompletionsFails) { diff --git a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/mock_global_operations.h b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/mock_global_operations.h index 61e4bf129f..a05d0bfeba 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/mock_global_operations.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/mock_global_operations.h @@ -541,6 +541,7 @@ struct MockGlobalOpsLinuxSysmanImp : public LinuxSysmanImp { void setMockInitDeviceError(ze_result_t result) { mockInitDeviceError = result; } + ADDMETHOD_NOBASE(gpuProcessCleanup, ze_result_t, ZE_RESULT_SUCCESS, (ze_bool_t force)); }; class DrmGlobalOpsMock : public DrmMock { diff --git a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp index aadcd2aff3..9baa9cceb2 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp @@ -149,12 +149,34 @@ class SysmanGlobalOperationsIntegratedFixture : public SysmanGlobalOperationsFix } }; -TEST_F(SysmanGlobalOperationsFixture, GivenValidDeviceHandleWhenCallingzesDeviceResetExtThenUnsupportedFeatureErrorIsReturned) { +TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetExtThenResetExtCallReturnSuccess) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.VfBarResourceAllocationWa.set(false); initGlobalOps(); static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get(); static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle(); - ze_result_t result = zesDeviceResetExt(device, nullptr); - EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result); + zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_WARM}; + ze_result_t result = zesDeviceResetExt(device, &pProperties); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + pProperties.resetType = ZES_RESET_TYPE_COLD; + result = zesDeviceResetExt(device, &pProperties); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + pProperties.resetType = ZES_RESET_TYPE_FLR; + result = zesDeviceResetExt(device, &pProperties); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); +} + +TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.VfBarResourceAllocationWa.set(false); + initGlobalOps(); + static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get(); + static_cast(pGlobalOperationsImp->pOsGlobalOperations)->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle(); + zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_FORCE_UINT32}; + ze_result_t result = zesDeviceResetExt(device, &pProperties); + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result); } TEST_F(SysmanGlobalOperationsFixture, GivenValidDeviceHandleWhenCallingzesGlobalOperationsGetPropertiesThenVerifyValidPropertiesAreReturned) {