Implement GPU hang detection

This change uses DRM_IOCTL_I915_GET_RESET_STATS to detect
GPU hangs. When a hang is detected,
zeCommandQueueSynchronize returns ZE_RESULT_ERROR_DEVICE_LOST.

Related-To: NEO-5313
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-01-20 16:56:19 +00:00
committed by Compute-Runtime-Automation
parent 543c854a3b
commit 498cf5e871
37 changed files with 556 additions and 101 deletions

View File

@@ -13,6 +13,7 @@
#include "shared/source/memory_manager/surface.h"
#include "shared/source/os_interface/device_factory.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/tag_allocator.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
@@ -20,6 +21,7 @@
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/mocks/mock_csr.h"
#include "shared/test/common/mocks/mock_driver_model.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
#include "shared/test/common/mocks/ult_device_factory.h"
@@ -30,10 +32,15 @@
#include "gmock/gmock.h"
#include <chrono>
#include <functional>
#include <limits>
namespace NEO {
extern ApiSpecificConfig::ApiType apiTypeForUlts;
} // namespace NEO
using namespace NEO;
using namespace std::chrono_literals;
struct CommandStreamReceiverTest : public DeviceFixture,
public ::testing::Test {
@@ -165,6 +172,99 @@ HWTEST_F(CommandStreamReceiverTest, whenStoreAllocationThenStoredAllocationHasTa
EXPECT_EQ(csr.peekTaskCount(), allocation->getTaskCount(csr.getOsContext().getContextId()));
}
// Checks that the base waitForCompletionWithTimeout() yields WaitStatus::GpuHang
// when the installed driver model mock is configured with isGpuHangDetectedToReturn = true.
HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) {
    // Driver model mock configured to report a hang, wrapped in a fresh OS interface.
    auto hangingDriverModel = std::make_unique<MockDriverModel>();
    hangingDriverModel->isGpuHangDetectedToReturn = true;

    auto mockedOsInterface = std::make_unique<OSInterface>();
    mockedOsInterface->setDriverModel(std::move(hangingDriverModel));

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(mockedOsInterface);

    // Zero-initialized tag memory: the tag never reaches the awaited task count (1) in this test.
    volatile std::uint32_t tagMemory[16] = {};
    csr.tagAddress = tagMemory;
    csr.activePartitions = 1;
    // NOTE(review): a zero period presumably makes the hang check run on every poll - confirm.
    csr.gpuHangCheckPeriod = 0us;
    csr.callBaseWaitForCompletionWithTimeout = true;

    constexpr bool useTimeout = false;
    constexpr std::int64_t timeoutInMicroseconds = std::numeric_limits<std::int64_t>::max();
    constexpr int awaitedTaskCount = 1;

    const auto result = csr.waitForCompletionWithTimeout(useTimeout, timeoutInMicroseconds, awaitedTaskCount);
    EXPECT_EQ(WaitStatus::GpuHang, result);
}
// Checks that the base waitForCompletionWithTimeout() yields WaitStatus::Ready when the
// driver model mock reports no hang and its hang-check side effect advances the tag value.
HWTEST_F(CommandStreamReceiverTest, givenNoGpuHangWhenWaititingForCompletionWithTimeoutThenReadyIsReturned) {
    // Tag memory starts at zero; it is later published to the CSR as its tag address.
    volatile std::uint32_t tagMemory[16] = {};

    auto healthyDriverModel = std::make_unique<MockDriverModel>();
    healthyDriverModel->isGpuHangDetectedToReturn = false;
    // The side effect increments the first tag slot from 0 to 1, i.e. to the awaited task count.
    // NOTE(review): presumably this is what lets the wait finish - confirm against the mock.
    healthyDriverModel->isGpuHangDetectedSideEffect = [&tagMemory] {
        tagMemory[0]++;
    };

    auto mockedOsInterface = std::make_unique<OSInterface>();
    mockedOsInterface->setDriverModel(std::move(healthyDriverModel));

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(mockedOsInterface);

    csr.tagAddress = tagMemory;
    csr.activePartitions = 1;
    csr.gpuHangCheckPeriod = 0us;
    csr.callBaseWaitForCompletionWithTimeout = true;

    constexpr bool useTimeout = false;
    constexpr std::int64_t timeoutInMicroseconds = std::numeric_limits<std::int64_t>::max();
    constexpr int awaitedTaskCount = 1;

    const auto result = csr.waitForCompletionWithTimeout(useTimeout, timeoutInMicroseconds, awaitedTaskCount);
    EXPECT_EQ(WaitStatus::Ready, result);
}
// Checks that waitForCompletionWithTimeout() yields WaitStatus::GpuHang when the CSR is set
// to fail flushing batched submissions and the driver model mock reports a hang.
HWTEST_F(CommandStreamReceiverTest, givenFailingFlushSubmissionsAndGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) {
    auto hangingDriverModel = std::make_unique<MockDriverModel>();
    hangingDriverModel->isGpuHangDetectedToReturn = true;

    auto mockedOsInterface = std::make_unique<OSInterface>();
    mockedOsInterface->setDriverModel(std::move(hangingDriverModel));

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

    // The awaited task count (1) is above latestFlushedTaskCount (0).
    // NOTE(review): presumably this routes the wait through the failing flush path - confirm.
    csr.shouldFailFlushBatchedSubmissions = true;
    csr.latestFlushedTaskCount = 0;

    csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(mockedOsInterface);
    csr.callBaseWaitForCompletionWithTimeout = true;

    constexpr bool useTimeout = false;
    constexpr std::int64_t timeoutInMicroseconds = std::numeric_limits<std::int64_t>::max();
    constexpr int awaitedTaskCount = 1;

    const auto result = csr.waitForCompletionWithTimeout(useTimeout, timeoutInMicroseconds, awaitedTaskCount);
    EXPECT_EQ(WaitStatus::GpuHang, result);
}
// Checks that waitForCompletionWithTimeout() yields WaitStatus::NotReady when the CSR is set
// to fail flushing batched submissions and the driver model mock reports no hang.
HWTEST_F(CommandStreamReceiverTest, givenFailingFlushSubmissionsAndNoGpuHangWhenWaititingForCompletionWithTimeoutThenNotReadyIsReturned) {
    auto healthyDriverModel = std::make_unique<MockDriverModel>();
    healthyDriverModel->isGpuHangDetectedToReturn = false;

    auto mockedOsInterface = std::make_unique<OSInterface>();
    mockedOsInterface->setDriverModel(std::move(healthyDriverModel));

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

    // The awaited task count (1) is above latestFlushedTaskCount (0).
    // NOTE(review): presumably this routes the wait through the failing flush path - confirm.
    csr.shouldFailFlushBatchedSubmissions = true;
    csr.latestFlushedTaskCount = 0;

    csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(mockedOsInterface);
    csr.callBaseWaitForCompletionWithTimeout = true;

    constexpr bool useTimeout = false;
    constexpr std::int64_t timeoutInMicroseconds = std::numeric_limits<std::int64_t>::max();
    constexpr int awaitedTaskCount = 1;

    const auto result = csr.waitForCompletionWithTimeout(useTimeout, timeoutInMicroseconds, awaitedTaskCount);
    EXPECT_EQ(WaitStatus::NotReady, result);
}
HWTEST_F(CommandStreamReceiverTest, givenCommandStreamReceiverWhenCheckedForInitialStatusOfStatelessMocsIndexThenUnknownMocsIsReturend) {
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
EXPECT_EQ(CacheSettings::unknownMocs, csr.latestSentStatelessMocsConfig);

View File

@@ -9,6 +9,7 @@
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/ult_hw_config.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_driver_model.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
#include "shared/test/common/mocks/ult_device_factory.h"
@@ -28,17 +29,6 @@ class MockMemoryManagerOsAgnosticContext : public MockMemoryManager {
}
};
struct MockDriverModel : NEO::DriverModel {
PhysicalDevicePciBusInfo pciBusInfo{};
MockDriverModel() : NEO::DriverModel(NEO::DriverModelType::UNKNOWN) {}
void setGmmInputArgs(void *args) override {}
uint32_t getDeviceHandle() const override { return {}; }
PhysicalDevicePciBusInfo getPciBusInfo() const override { return pciBusInfo; }
size_t getMaxMemAllocSize() const override {
return 0;
}
};
template <PRODUCT_FAMILY gfxProduct>
class MockHwInfoConfigHw : public HwInfoConfigHw<gfxProduct> {
public: