Implement GPU hang detection

This change uses DRM_IOCTL_I915_GET_RESET_STATS to detect
GPU hangs. When such a situation is encountered,
zeCommandQueueSynchronize returns ZE_RESULT_ERROR_DEVICE_LOST.

Related-To: NEO-5313
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-01-20 16:56:19 +00:00
committed by Compute-Runtime-Automation
parent 543c854a3b
commit 498cf5e871
37 changed files with 556 additions and 101 deletions

View File

@ -1886,5 +1886,5 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenWaitForCompletionWithTimeoutI
mockCsr.latestSentTaskCount = 1;
auto cmdBuffer = std::make_unique<CommandBuffer>(*pDevice);
mockCsr.submissionAggregator->recordCommandBuffer(cmdBuffer.release());
EXPECT_FALSE(mockCsr.waitForCompletionWithTimeout(false, 0, 1));
EXPECT_EQ(NEO::WaitStatus::NotReady, mockCsr.waitForCompletionWithTimeout(false, 0, 1));
}

View File

@ -738,6 +738,6 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenTagValueNotMeetingTaskCountTo
CpuIntrinsicsTests::pauseAddress = mockCsr->tagAddress;
CpuIntrinsicsTests::pauseValue = taskCountToWait;
bool ret = mockCsr->waitForCompletionWithTimeout(false, 1, taskCountToWait);
EXPECT_TRUE(ret);
const auto ret = mockCsr->waitForCompletionWithTimeout(false, 1, taskCountToWait);
EXPECT_EQ(NEO::WaitStatus::Ready, ret);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -468,7 +468,7 @@ HWTEST_F(UltCommandStreamReceiverTest, givenComputeOverrideDisableWhenComputeSup
HWTEST_F(UltCommandStreamReceiverTest, givenSinglePartitionWhenCallingWaitKmdNotifyThenExpectImplicitBusyLoopWaitCalled) {
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
commandStreamReceiver.callBaseWaitForCompletionWithTimeout = false;
commandStreamReceiver.returnWaitForCompletionWithTimeout = false;
commandStreamReceiver.returnWaitForCompletionWithTimeout = NEO::WaitStatus::NotReady;
commandStreamReceiver.waitForTaskCountWithKmdNotifyFallback(0, 0, false, false);
EXPECT_EQ(2u, commandStreamReceiver.waitForCompletionWithTimeoutTaskCountCalled);
@ -477,7 +477,7 @@ HWTEST_F(UltCommandStreamReceiverTest, givenSinglePartitionWhenCallingWaitKmdNot
HWTEST_F(UltCommandStreamReceiverTest, givenMultiplePartitionsWhenCallingWaitKmdNotifyThenExpectExplicitBusyLoopWaitCalled) {
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
commandStreamReceiver.callBaseWaitForCompletionWithTimeout = false;
commandStreamReceiver.returnWaitForCompletionWithTimeout = false;
commandStreamReceiver.returnWaitForCompletionWithTimeout = NEO::WaitStatus::NotReady;
commandStreamReceiver.waitForTaskCountWithKmdNotifyFallback(0, 0, false, false);
EXPECT_EQ(2u, commandStreamReceiver.waitForCompletionWithTimeoutTaskCountCalled);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -610,13 +610,14 @@ HWTEST_F(BcsTests, whenBlitFromHostPtrCalledThenCallWaitWithKmdFallback) {
public:
using UltCommandStreamReceiver<FamilyType>::UltCommandStreamReceiver;
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait,
bool useQuickKmdSleep, bool forcePowerSavingMode) override {
WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait,
bool useQuickKmdSleep, bool forcePowerSavingMode) override {
waitForTaskCountWithKmdNotifyFallbackCalled++;
taskCountToWaitPassed = taskCountToWait;
flushStampToWaitPassed = flushStampToWait;
useQuickKmdSleepPassed = useQuickKmdSleep;
forcePowerSavingModePassed = forcePowerSavingMode;
return WaitStatus::Ready;
}
FlushStamp flushStampToWaitPassed = 0;