fix: handle GPU error in xe path

Query the DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN property of the exec queue and signal gpuHang based on the returned value.

Related-To: HSD-18038050680
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski 2024-04-24 08:23:11 +00:00 committed by Compute-Runtime-Automation
parent 39ca00fb08
commit 22bedda081
3 changed files with 37 additions and 6 deletions

View File

@ -1091,11 +1091,14 @@ int IoctlHelperXe::ioctl(DrmIoctl request, void *arg) {
d->handle, d->offset, d->flags, ret);
} break;
case DrmIoctl::getResetStats: {
ResetStats *d = static_cast<ResetStats *>(arg);
// d->batchActive = 1; // fake gpu hang
ret = 0;
xeLog(" -> IoctlHelperXe::ioctl GetResetStats ctx=0x%x r=%d\n",
d->contextId, ret);
ResetStats *resetStats = static_cast<ResetStats *>(arg);
drm_xe_exec_queue_get_property getProperty{};
getProperty.exec_queue_id = resetStats->contextId;
getProperty.property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN;
ret = IoctlHelper::ioctl(request, &getProperty);
resetStats->batchPending = static_cast<uint32_t>(getProperty.value);
xeLog(" -> IoctlHelperXe::ioctl GetResetStats ctx=0x%x r=%d value=%llu\n",
resetStats->contextId, ret, getProperty.value);
} break;
case DrmIoctl::primeFdToHandle: {
PrimeHandle *prime = static_cast<PrimeHandle *>(arg);
@ -1526,6 +1529,8 @@ unsigned int IoctlHelperXe::getIoctlRequestValue(DrmIoctl ioctlRequest) const {
RETURN_ME(DRM_IOCTL_PRIME_FD_TO_HANDLE);
case DrmIoctl::primeHandleToFd:
RETURN_ME(DRM_IOCTL_PRIME_HANDLE_TO_FD);
case DrmIoctl::getResetStats:
RETURN_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
case DrmIoctl::debuggerOpen:
case DrmIoctl::metadataCreate:
case DrmIoctl::metadataDestroy:
@ -1570,6 +1575,8 @@ std::string IoctlHelperXe::getIoctlString(DrmIoctl ioctlRequest) const {
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_CREATE);
case DrmIoctl::metadataDestroy:
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_DESTROY);
case DrmIoctl::getResetStats:
STRINGIFY_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
default:
return "???";
}

View File

@ -10,6 +10,7 @@
#include "shared/source/os_interface/linux/os_context_linux.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
#include "shared/test/common/mocks/linux/mock_os_context_linux.h"
using namespace NEO;
@ -2131,3 +2132,20 @@ TEST(IoctlHelperXeTest, whenGetFdFromVmExportIsCalledThenFalseIsReturned) {
int32_t fd = 0;
EXPECT_FALSE(xeIoctlHelper->getFdFromVmExport(vmId, flags, &fd));
}
// Checks that Drm::checkResetStatus on the Xe path follows the value reported by the
// mocked exec-queue ban property: 0 reports no hang, 1 reports a hang and marks the
// OS context as hung.
TEST(IoctlHelperXeTest, whenCheckingGpuHangThenBanPropertyIsQueried) {
    auto mockExecutionEnvironment = std::make_unique<MockExecutionEnvironment>();
    DrmMockXe drm{*mockExecutionEnvironment->rootDeviceEnvironments[0]};
    // Install the Xe ioctl helper so that reset-status queries go through it.
    drm.ioctlHelper = std::make_unique<MockIoctlHelperXe>(drm);

    const auto engineDescriptor = EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular});
    MockOsContextLinux context(drm, 0, 5u, engineDescriptor);
    // checkResetStatus needs at least one registered context id to query.
    context.drmContextIds.push_back(0);

    // Ban property value 0: no hang is reported and the context stays clean.
    drm.execQueueBanPropertyReturn = 0;
    EXPECT_FALSE(drm.checkResetStatus(context));
    EXPECT_FALSE(context.isHangDetected());

    // Ban property value 1: the hang is reported and recorded on the context.
    drm.execQueueBanPropertyReturn = 1;
    EXPECT_TRUE(drm.checkResetStatus(context));
    EXPECT_TRUE(context.isHangDetected());
}

View File

@ -205,9 +205,14 @@ class DrmMockXe : public DrmMockCustom {
ret = 0;
} break;
case DrmIoctl::getparam:
case DrmIoctl::getResetStats:
ret = -2;
break;
case DrmIoctl::getResetStats: {
auto execQueueProperty = static_cast<drm_xe_exec_queue_get_property *>(arg);
EXPECT_EQ(execQueueProperty->property, static_cast<uint32_t>(DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN));
execQueueProperty->value = execQueueBanPropertyReturn;
ret = 0;
} break;
case DrmIoctl::query: {
struct drm_xe_device_query *deviceQuery = static_cast<struct drm_xe_device_query *>(arg);
switch (deviceQuery->query) {
@ -344,6 +349,7 @@ class DrmMockXe : public DrmMockCustom {
StackVec<drm_xe_ext_set_property, 1> execQueueProperties;
int waitUserFenceReturn = 0;
int execQueueBanPropertyReturn = 0;
uint32_t createParamsFlags = 0u;
uint16_t createParamsCpuCaching = 0u;
uint32_t createParamsPlacement = 0u;