fix: handle GPU error in xe path
Query the DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN property of the exec queue and signal a GPU hang (gpuHang) based on its value.
Related-To: HSD-18038050680
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
parent
39ca00fb08
commit
22bedda081
|
@ -1091,11 +1091,14 @@ int IoctlHelperXe::ioctl(DrmIoctl request, void *arg) {
|
||||||
d->handle, d->offset, d->flags, ret);
|
d->handle, d->offset, d->flags, ret);
|
||||||
} break;
|
} break;
|
||||||
case DrmIoctl::getResetStats: {
|
case DrmIoctl::getResetStats: {
|
||||||
ResetStats *d = static_cast<ResetStats *>(arg);
|
ResetStats *resetStats = static_cast<ResetStats *>(arg);
|
||||||
// d->batchActive = 1; // fake gpu hang
|
drm_xe_exec_queue_get_property getProperty{};
|
||||||
ret = 0;
|
getProperty.exec_queue_id = resetStats->contextId;
|
||||||
xeLog(" -> IoctlHelperXe::ioctl GetResetStats ctx=0x%x r=%d\n",
|
getProperty.property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN;
|
||||||
d->contextId, ret);
|
ret = IoctlHelper::ioctl(request, &getProperty);
|
||||||
|
resetStats->batchPending = static_cast<uint32_t>(getProperty.value);
|
||||||
|
xeLog(" -> IoctlHelperXe::ioctl GetResetStats ctx=0x%x r=%d value=%llu\n",
|
||||||
|
resetStats->contextId, ret, getProperty.value);
|
||||||
} break;
|
} break;
|
||||||
case DrmIoctl::primeFdToHandle: {
|
case DrmIoctl::primeFdToHandle: {
|
||||||
PrimeHandle *prime = static_cast<PrimeHandle *>(arg);
|
PrimeHandle *prime = static_cast<PrimeHandle *>(arg);
|
||||||
|
@ -1526,6 +1529,8 @@ unsigned int IoctlHelperXe::getIoctlRequestValue(DrmIoctl ioctlRequest) const {
|
||||||
RETURN_ME(DRM_IOCTL_PRIME_FD_TO_HANDLE);
|
RETURN_ME(DRM_IOCTL_PRIME_FD_TO_HANDLE);
|
||||||
case DrmIoctl::primeHandleToFd:
|
case DrmIoctl::primeHandleToFd:
|
||||||
RETURN_ME(DRM_IOCTL_PRIME_HANDLE_TO_FD);
|
RETURN_ME(DRM_IOCTL_PRIME_HANDLE_TO_FD);
|
||||||
|
case DrmIoctl::getResetStats:
|
||||||
|
RETURN_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
|
||||||
case DrmIoctl::debuggerOpen:
|
case DrmIoctl::debuggerOpen:
|
||||||
case DrmIoctl::metadataCreate:
|
case DrmIoctl::metadataCreate:
|
||||||
case DrmIoctl::metadataDestroy:
|
case DrmIoctl::metadataDestroy:
|
||||||
|
@ -1570,6 +1575,8 @@ std::string IoctlHelperXe::getIoctlString(DrmIoctl ioctlRequest) const {
|
||||||
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_CREATE);
|
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_CREATE);
|
||||||
case DrmIoctl::metadataDestroy:
|
case DrmIoctl::metadataDestroy:
|
||||||
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_DESTROY);
|
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_DESTROY);
|
||||||
|
case DrmIoctl::getResetStats:
|
||||||
|
STRINGIFY_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
|
||||||
default:
|
default:
|
||||||
return "???";
|
return "???";
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include "shared/source/os_interface/linux/os_context_linux.h"
|
#include "shared/source/os_interface/linux/os_context_linux.h"
|
||||||
#include "shared/test/common/helpers/engine_descriptor_helper.h"
|
#include "shared/test/common/helpers/engine_descriptor_helper.h"
|
||||||
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
|
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
|
||||||
|
#include "shared/test/common/mocks/linux/mock_os_context_linux.h"
|
||||||
|
|
||||||
using namespace NEO;
|
using namespace NEO;
|
||||||
|
|
||||||
|
@ -2131,3 +2132,20 @@ TEST(IoctlHelperXeTest, whenGetFdFromVmExportIsCalledThenFalseIsReturned) {
|
||||||
int32_t fd = 0;
|
int32_t fd = 0;
|
||||||
EXPECT_FALSE(xeIoctlHelper->getFdFromVmExport(vmId, flags, &fd));
|
EXPECT_FALSE(xeIoctlHelper->getFdFromVmExport(vmId, flags, &fd));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that Drm::checkResetStatus reports a GPU hang according to the value the
// Xe mock returns through execQueueBanPropertyReturn, and that the same result is
// observable via osContext.isHangDetected().
// NOTE(review): presumably checkResetStatus reaches the getResetStats ioctl that
// DrmMockXe answers with execQueueBanPropertyReturn - the call path is not visible here.
TEST(IoctlHelperXeTest, whenCheckingGpuHangThenBanPropertyIsQueried) {
|
||||||
|
auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
|
||||||
|
DrmMockXe drm{*executionEnvironment->rootDeviceEnvironments[0]};
|
||||||
|
auto xeHelper = std::make_unique<MockIoctlHelperXe>(drm);
|
||||||
|
// Install the Xe ioctl helper on the mock DRM so the Xe path is exercised.
drm.ioctlHelper = std::move(xeHelper);
|
||||||
|
|
||||||
|
MockOsContextLinux osContext(drm, 0, 5u, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}));
|
||||||
|
// Register a single context id (0) on the OS context.
osContext.drmContextIds.push_back(0);
|
||||||
|
// Ban value 0: no hang is expected to be reported or recorded.
drm.execQueueBanPropertyReturn = 0;
|
||||||
|
EXPECT_FALSE(drm.checkResetStatus(osContext));
|
||||||
|
EXPECT_FALSE(osContext.isHangDetected());
|
||||||
|
|
||||||
|
// Ban value 1: a hang is expected to be reported and recorded on the context.
drm.execQueueBanPropertyReturn = 1;
|
||||||
|
EXPECT_TRUE(drm.checkResetStatus(osContext));
|
||||||
|
EXPECT_TRUE(osContext.isHangDetected());
|
||||||
|
}
|
||||||
|
|
|
@ -205,9 +205,14 @@ class DrmMockXe : public DrmMockCustom {
|
||||||
ret = 0;
|
ret = 0;
|
||||||
} break;
|
} break;
|
||||||
case DrmIoctl::getparam:
|
case DrmIoctl::getparam:
|
||||||
case DrmIoctl::getResetStats:
|
|
||||||
ret = -2;
|
ret = -2;
|
||||||
break;
|
break;
|
||||||
|
case DrmIoctl::getResetStats: {
|
||||||
|
auto execQueueProperty = static_cast<drm_xe_exec_queue_get_property *>(arg);
|
||||||
|
EXPECT_EQ(execQueueProperty->property, static_cast<uint32_t>(DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN));
|
||||||
|
execQueueProperty->value = execQueueBanPropertyReturn;
|
||||||
|
ret = 0;
|
||||||
|
} break;
|
||||||
case DrmIoctl::query: {
|
case DrmIoctl::query: {
|
||||||
struct drm_xe_device_query *deviceQuery = static_cast<struct drm_xe_device_query *>(arg);
|
struct drm_xe_device_query *deviceQuery = static_cast<struct drm_xe_device_query *>(arg);
|
||||||
switch (deviceQuery->query) {
|
switch (deviceQuery->query) {
|
||||||
|
@ -344,6 +349,7 @@ class DrmMockXe : public DrmMockCustom {
|
||||||
StackVec<drm_xe_ext_set_property, 1> execQueueProperties;
|
StackVec<drm_xe_ext_set_property, 1> execQueueProperties;
|
||||||
|
|
||||||
int waitUserFenceReturn = 0;
|
int waitUserFenceReturn = 0;
|
||||||
|
int execQueueBanPropertyReturn = 0;
|
||||||
uint32_t createParamsFlags = 0u;
|
uint32_t createParamsFlags = 0u;
|
||||||
uint16_t createParamsCpuCaching = 0u;
|
uint16_t createParamsCpuCaching = 0u;
|
||||||
uint32_t createParamsPlacement = 0u;
|
uint32_t createParamsPlacement = 0u;
|
||||||
|
|
Loading…
Reference in New Issue