fix: handle GPU error in xe path
Query the DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN exec queue property and signal a GPU hang based on its value.

Related-To: HSD-18038050680
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
parent
39ca00fb08
commit
22bedda081
|
@ -1091,11 +1091,14 @@ int IoctlHelperXe::ioctl(DrmIoctl request, void *arg) {
|
|||
d->handle, d->offset, d->flags, ret);
|
||||
} break;
|
||||
case DrmIoctl::getResetStats: {
|
||||
ResetStats *d = static_cast<ResetStats *>(arg);
|
||||
// d->batchActive = 1; // fake gpu hang
|
||||
ret = 0;
|
||||
xeLog(" -> IoctlHelperXe::ioctl GetResetStats ctx=0x%x r=%d\n",
|
||||
d->contextId, ret);
|
||||
ResetStats *resetStats = static_cast<ResetStats *>(arg);
|
||||
drm_xe_exec_queue_get_property getProperty{};
|
||||
getProperty.exec_queue_id = resetStats->contextId;
|
||||
getProperty.property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN;
|
||||
ret = IoctlHelper::ioctl(request, &getProperty);
|
||||
resetStats->batchPending = static_cast<uint32_t>(getProperty.value);
|
||||
xeLog(" -> IoctlHelperXe::ioctl GetResetStats ctx=0x%x r=%d value=%llu\n",
|
||||
resetStats->contextId, ret, getProperty.value);
|
||||
} break;
|
||||
case DrmIoctl::primeFdToHandle: {
|
||||
PrimeHandle *prime = static_cast<PrimeHandle *>(arg);
|
||||
|
@ -1526,6 +1529,8 @@ unsigned int IoctlHelperXe::getIoctlRequestValue(DrmIoctl ioctlRequest) const {
|
|||
RETURN_ME(DRM_IOCTL_PRIME_FD_TO_HANDLE);
|
||||
case DrmIoctl::primeHandleToFd:
|
||||
RETURN_ME(DRM_IOCTL_PRIME_HANDLE_TO_FD);
|
||||
case DrmIoctl::getResetStats:
|
||||
RETURN_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
|
||||
case DrmIoctl::debuggerOpen:
|
||||
case DrmIoctl::metadataCreate:
|
||||
case DrmIoctl::metadataDestroy:
|
||||
|
@ -1570,6 +1575,8 @@ std::string IoctlHelperXe::getIoctlString(DrmIoctl ioctlRequest) const {
|
|||
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_CREATE);
|
||||
case DrmIoctl::metadataDestroy:
|
||||
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_DESTROY);
|
||||
case DrmIoctl::getResetStats:
|
||||
STRINGIFY_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
|
||||
default:
|
||||
return "???";
|
||||
}
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include "shared/source/os_interface/linux/os_context_linux.h"
|
||||
#include "shared/test/common/helpers/engine_descriptor_helper.h"
|
||||
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
|
||||
#include "shared/test/common/mocks/linux/mock_os_context_linux.h"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
|
@ -2131,3 +2132,20 @@ TEST(IoctlHelperXeTest, whenGetFdFromVmExportIsCalledThenFalseIsReturned) {
|
|||
int32_t fd = 0;
|
||||
EXPECT_FALSE(xeIoctlHelper->getFdFromVmExport(vmId, flags, &fd));
|
||||
}
|
||||
|
||||
TEST(IoctlHelperXeTest, whenCheckingGpuHangThenBanPropertyIsQueried) {
|
||||
auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
|
||||
DrmMockXe drm{*executionEnvironment->rootDeviceEnvironments[0]};
|
||||
auto xeHelper = std::make_unique<MockIoctlHelperXe>(drm);
|
||||
drm.ioctlHelper = std::move(xeHelper);
|
||||
|
||||
MockOsContextLinux osContext(drm, 0, 5u, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}));
|
||||
osContext.drmContextIds.push_back(0);
|
||||
drm.execQueueBanPropertyReturn = 0;
|
||||
EXPECT_FALSE(drm.checkResetStatus(osContext));
|
||||
EXPECT_FALSE(osContext.isHangDetected());
|
||||
|
||||
drm.execQueueBanPropertyReturn = 1;
|
||||
EXPECT_TRUE(drm.checkResetStatus(osContext));
|
||||
EXPECT_TRUE(osContext.isHangDetected());
|
||||
}
|
||||
|
|
|
@ -205,9 +205,14 @@ class DrmMockXe : public DrmMockCustom {
|
|||
ret = 0;
|
||||
} break;
|
||||
case DrmIoctl::getparam:
|
||||
case DrmIoctl::getResetStats:
|
||||
ret = -2;
|
||||
break;
|
||||
case DrmIoctl::getResetStats: {
|
||||
auto execQueueProperty = static_cast<drm_xe_exec_queue_get_property *>(arg);
|
||||
EXPECT_EQ(execQueueProperty->property, static_cast<uint32_t>(DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN));
|
||||
execQueueProperty->value = execQueueBanPropertyReturn;
|
||||
ret = 0;
|
||||
} break;
|
||||
case DrmIoctl::query: {
|
||||
struct drm_xe_device_query *deviceQuery = static_cast<struct drm_xe_device_query *>(arg);
|
||||
switch (deviceQuery->query) {
|
||||
|
@ -344,6 +349,7 @@ class DrmMockXe : public DrmMockCustom {
|
|||
StackVec<drm_xe_ext_set_property, 1> execQueueProperties;
|
||||
|
||||
int waitUserFenceReturn = 0;
|
||||
int execQueueBanPropertyReturn = 0;
|
||||
uint32_t createParamsFlags = 0u;
|
||||
uint16_t createParamsCpuCaching = 0u;
|
||||
uint32_t createParamsPlacement = 0u;
|
||||
|
|
Loading…
Reference in New Issue