feature: abort when unexpected GPU page fault detected

If the reset reported by i915 ResetStats was caused by a GPU page fault,
abort the entire process instead of disabling engines.
Also add a fallback mechanism for when prelim_drm_i915_reset_stats
fails.

Related-To: GSD-5673
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
Young Jin Yoon
2024-02-26 07:57:36 +00:00
committed by Compute-Runtime-Automation
parent 448ad59d28
commit 7b81c4e08f
12 changed files with 260 additions and 8 deletions

View File

@@ -72,6 +72,14 @@ struct UuidRegisterResult {
uint32_t handle;
};
// Fault record accompanying reset statistics; it is passed by pointer to
// IoctlHelper::getResetStats as an output argument.
//
// Every member is zero-initialized by default so that a default-constructed
// instance never carries indeterminate values when the callee leaves the
// record untouched. The type remains an aggregate with unchanged member
// order and sizes, so existing brace-initialization keeps working.
//
// NOTE(review): field meanings below are inferred from their names and the
// commit description (GPU page fault reported via reset stats) — confirm
// against the kernel uAPI definition.
struct ResetStatsFault {
    uint64_t addr{0};   // presumably the faulting GPU address
    uint16_t type{0};   // presumably the fault type code
    uint16_t level{0};  // presumably the page-table level of the fault
    uint16_t access{0}; // presumably the access kind that faulted
    uint16_t flags{0};  // flag bits; a uint16_t `flags` is what validPageFault() takes
};
// StackVec of MemoryClassInstance entries; the template argument 5 is
// presumably the inline (on-stack) capacity — confirm against StackVec.
using MemRegionsVec = StackVec<MemoryClassInstance, 5>;
// Raw 40-byte buffer type. Presumably opaque storage for a VM-bind
// "set PAT" extension structure — verify against its users.
using VmBindExtSetPatT = uint8_t[40];
// Raw 56-byte buffer type. Presumably opaque storage for a VM-bind
// user-fence extension structure — verify against its users.
using VmBindExtUserFenceT = uint8_t[56];
@@ -123,6 +131,7 @@ class IoctlHelper {
virtual uint32_t getVmAdviseAtomicAttribute() = 0;
virtual int vmBind(const VmBindParams &vmBindParams) = 0;
virtual int vmUnbind(const VmBindParams &vmBindParams) = 0;
virtual int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) = 0;
virtual bool getEuStallProperties(std::array<uint64_t, 12u> &properties, uint64_t dssBufferSize,
uint64_t samplingRate, uint64_t pollPeriod, uint64_t engineInstance, uint64_t notifyNReports) = 0;
virtual uint32_t getEuStallFdParameter() = 0;
@@ -177,6 +186,9 @@ class IoctlHelper {
// Base-class default for the EU-debug sysfs enable query: always returns 0.
// The function returns int, so the integer literal is used instead of the
// bool literal `false` (identical value, no implicit bool-to-int conversion).
virtual int getEuDebugSysFsEnable() { return 0; }
// Reports whether the VM-bind PAT-index extension is supported.
// This base default returns false; derived helpers may override it.
virtual bool isVmBindPatIndexExtSupported() { return false; }
// Base default for the page-fault validity check: returns false for every
// value of `flags` (the argument is not inspected here). Derived helpers may
// override this with a real check.
virtual bool validPageFault(uint16_t flags) { return false; }
// Base default for the reset-stats status value: returns 0 regardless of
// `banned` (the argument is not inspected here). Derived helpers may
// override this.
virtual uint32_t getStatusForResetStats(bool banned) { return 0u; }
protected:
Drm &drm;
};
@@ -256,6 +268,7 @@ class IoctlHelperUpstream : public IoctlHelperI915 {
uint32_t getVmAdviseAtomicAttribute() override;
int vmBind(const VmBindParams &vmBindParams) override;
int vmUnbind(const VmBindParams &vmBindParams) override;
int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) override;
bool getEuStallProperties(std::array<uint64_t, 12u> &properties, uint64_t dssBufferSize, uint64_t samplingRate,
uint64_t pollPeriod, uint64_t engineInstance, uint64_t notifyNReports) override;
uint32_t getEuStallFdParameter() override;
@@ -331,6 +344,7 @@ class IoctlHelperPrelim20 : public IoctlHelperI915 {
uint32_t getVmAdviseAtomicAttribute() override;
int vmBind(const VmBindParams &vmBindParams) override;
int vmUnbind(const VmBindParams &vmBindParams) override;
int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) override;
bool getEuStallProperties(std::array<uint64_t, 12u> &properties, uint64_t dssBufferSize, uint64_t samplingRate,
uint64_t pollPeriod, uint64_t engineInstance, uint64_t notifyNReports) override;
uint32_t getEuStallFdParameter() override;
@@ -362,6 +376,9 @@ class IoctlHelperPrelim20 : public IoctlHelperI915 {
int getEuDebugSysFsEnable() override;
// Override for this helper: the VM-bind PAT-index extension is always
// reported as supported (returns true).
bool isVmBindPatIndexExtSupported() override { return true; }
bool validPageFault(uint16_t flags) override;
uint32_t getStatusForResetStats(bool banned) override;
protected:
bool queryHwIpVersion(EngineClassInstance &engineInfo, HardwareIpVersion &ipVersion, int &ret);
StackVec<uint32_t, size_t(DrmResourceClass::maxSize)> classHandles;