From 498cf5e871ca0e06d996e04ad5bd260d9044488b Mon Sep 17 00:00:00 2001 From: Patryk Wrobel Date: Thu, 20 Jan 2022 16:56:19 +0000 Subject: [PATCH] Implement GPU hang detection This change uses DRM_IOCTL_I915_GET_RESET_STATS to detect GPU hangs. When such situation is encountered, then zeCommandQueueSynchronize returns ZE_RESULT_ERROR_DEVICE_LOST. Related-To: NEO-5313 Signed-off-by: Patryk Wrobel --- level_zero/core/source/cmdqueue/cmdqueue.cpp | 15 +- .../sources/cmdqueue/test_cmdqueue_2.cpp | 68 +++++++- .../command_queue/command_queue_tests.cpp | 4 +- ...and_stream_receiver_flush_task_3_tests.cpp | 2 +- ...and_stream_receiver_flush_task_4_tests.cpp | 4 +- .../command_stream_receiver_hw_1_tests.cpp | 6 +- .../command_stream_receiver_hw_2_tests.cpp | 7 +- opencl/test/unit_test/event/event_tests.cpp | 4 +- .../unit_test/helpers/kmd_notify_tests.cpp | 12 +- opencl/test/unit_test/kernel/kernel_tests.cpp | 3 +- .../unit_test/mem_obj/buffer_bcs_tests.cpp | 8 +- .../mem_obj/mem_obj_destruction_tests.cpp | 34 ++-- .../os_interface/linux/drm_tests.cpp | 152 ++++++++++++++++++ .../aub_command_stream_receiver_hw.h | 2 +- .../aub_command_stream_receiver_hw_base.inl | 6 +- .../command_stream_receiver.cpp | 32 +++- .../command_stream/command_stream_receiver.h | 15 +- .../command_stream_receiver_hw.h | 2 +- .../command_stream_receiver_hw_base.inl | 14 +- .../command_stream_receiver_with_aub_dump.h | 4 +- .../command_stream_receiver_with_aub_dump.inl | 6 +- .../tbx_command_stream_receiver_hw.h | 4 +- .../tbx_command_stream_receiver_hw.inl | 6 +- shared/source/os_interface/linux/drm_neo.cpp | 22 +++ shared/source/os_interface/linux/drm_neo.h | 1 + shared/source/os_interface/os_interface.h | 4 + .../os_interface/windows/os_interface_win.cpp | 3 +- shared/test/common/libult/linux/drm_mock.cpp | 14 +- shared/test/common/libult/linux/drm_mock.h | 3 +- .../libult/ult_command_stream_receiver.h | 15 +- shared/test/common/mocks/CMakeLists.txt | 2 + .../mocks/linux/mock_os_context_linux.h | 
16 ++ shared/test/common/mocks/mock_aub_csr.h | 4 +- .../mocks/mock_command_stream_receiver.h | 10 +- shared/test/common/mocks/mock_driver_model.h | 41 +++++ .../command_stream_receiver_tests.cpp | 100 ++++++++++++ .../os_interface/device_uuid_tests.cpp | 12 +- 37 files changed, 556 insertions(+), 101 deletions(-) create mode 100644 shared/test/common/mocks/linux/mock_os_context_linux.h create mode 100644 shared/test/common/mocks/mock_driver_model.h diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index 036a1e30ae..0a05b740da 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -97,7 +97,11 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res ze_result_t CommandQueueImp::synchronize(uint64_t timeout) { if ((timeout == std::numeric_limits::max()) && useKmdWaitFunction) { auto &waitPair = buffers.getCurrentFlushStamp(); - csr->waitForTaskCountWithKmdNotifyFallback(waitPair.first, waitPair.second, false, false); + const auto waitStatus = csr->waitForTaskCountWithKmdNotifyFallback(waitPair.first, waitPair.second, false, false); + if (waitStatus == NEO::WaitStatus::GpuHang) { + return ZE_RESULT_ERROR_DEVICE_LOST; + } + postSyncOperations(); return ZE_RESULT_SUCCESS; } else { @@ -116,12 +120,15 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout) timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; } - bool ready = csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); - if (!ready) { + const auto waitStatus = csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); + if (waitStatus == NEO::WaitStatus::NotReady) { return ZE_RESULT_NOT_READY; } - postSyncOperations(); + if (waitStatus == NEO::WaitStatus::GpuHang) { + return ZE_RESULT_ERROR_DEVICE_LOST; + } + postSyncOperations(); return ZE_RESULT_SUCCESS; } diff --git 
a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp index 3a3d40bc1a..139360c5bc 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp @@ -137,23 +137,22 @@ using MultiTileCommandQueueSynchronizeTest = Test struct SynchronizeCsr : public NEO::UltCommandStreamReceiver { - SynchronizeCsr(const NEO::ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) : NEO::UltCommandStreamReceiver(const_cast(executionEnvironment), 0, deviceBitfield) { CommandStreamReceiver::tagAddress = &tagAddressData[0]; memset(const_cast(CommandStreamReceiver::tagAddress), 0xFFFFFFFF, tagSize * sizeof(uint32_t)); } - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { + WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { enableTimeoutSet = enableTimeout; waitForComplitionCalledTimes++; partitionCountSet = this->activePartitions; - return true; + return waitForCompletionWithTimeoutResult; } - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { waitForTaskCountWithKmdNotifyFallbackCalled++; - NEO::UltCommandStreamReceiver::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode); + return NEO::UltCommandStreamReceiver::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode); } static constexpr size_t tagSize = 128; @@ -162,6 +161,7 @@ struct SynchronizeCsr : public NEO::UltCommandStreamReceiver 
{ uint32_t waitForTaskCountWithKmdNotifyFallbackCalled = 0; uint32_t partitionCountSet = 0; bool enableTimeoutSet = false; + WaitStatus waitForCompletionWithTimeoutResult = WaitStatus::Ready; }; template @@ -201,6 +201,61 @@ HWTEST_F(CommandQueueSynchronizeTest, givenCallToSynchronizeThenCorrectEnableTim L0::CommandQueue::fromHandle(commandQueue)->destroy(); } +HWTEST_F(CommandQueueSynchronizeTest, givenGpuHangWhenCallingSynchronizeThenErrorIsPropagated) { + auto csr = std::unique_ptr>(new SynchronizeCsr(*device->getNEODevice()->getExecutionEnvironment(), + device->getNEODevice()->getDeviceBitfield())); + csr->waitForCompletionWithTimeoutResult = NEO::WaitStatus::GpuHang; + + ze_command_queue_desc_t desc{}; + ze_command_queue_handle_t commandQueue{}; + ze_result_t res = context->createCommandQueue(device, &desc, &commandQueue); + + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + ASSERT_NE(nullptr, commandQueue); + + auto queue = whitebox_cast(L0::CommandQueue::fromHandle(commandQueue)); + queue->csr = csr.get(); + + constexpr auto timeout{std::numeric_limits::max()}; + const auto synchronizationResult{queue->synchronize(timeout)}; + + EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, synchronizationResult); + EXPECT_EQ(1u, csr->waitForComplitionCalledTimes); + EXPECT_EQ(0u, csr->waitForTaskCountWithKmdNotifyFallbackCalled); + EXPECT_FALSE(csr->enableTimeoutSet); + + L0::CommandQueue::fromHandle(commandQueue)->destroy(); +} + +HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledAndGpuHangWhenCallingSynchronizeThenErrorIsPropagated) { + DebugManagerStateRestore restore; + NEO::DebugManager.flags.OverrideUseKmdWaitFunction.set(1); + + auto csr = std::unique_ptr>(new SynchronizeCsr(*device->getNEODevice()->getExecutionEnvironment(), + device->getNEODevice()->getDeviceBitfield())); + csr->waitForCompletionWithTimeoutResult = NEO::WaitStatus::GpuHang; + + ze_command_queue_desc_t desc{}; + ze_command_queue_handle_t commandQueue{}; + ze_result_t res = 
context->createCommandQueue(device, &desc, &commandQueue); + + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + ASSERT_NE(nullptr, commandQueue); + + auto queue = whitebox_cast(L0::CommandQueue::fromHandle(commandQueue)); + queue->csr = csr.get(); + + constexpr auto timeout{std::numeric_limits::max()}; + const auto synchronizationResult{queue->synchronize(timeout)}; + + EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, synchronizationResult); + EXPECT_EQ(1u, csr->waitForComplitionCalledTimes); + EXPECT_EQ(1u, csr->waitForTaskCountWithKmdNotifyFallbackCalled); + EXPECT_FALSE(csr->enableTimeoutSet); + + L0::CommandQueue::fromHandle(commandQueue)->destroy(); +} + HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchronizeThenCorrectEnableTimeoutAndTimeoutValuesAreUsed) { DebugManagerStateRestore restore; NEO::DebugManager.flags.OverrideUseKmdWaitFunction.set(1); @@ -349,7 +404,8 @@ struct TestCmdQueueCsr : public NEO::UltCommandStreamReceiver { TestCmdQueueCsr(const NEO::ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) : NEO::UltCommandStreamReceiver(const_cast(executionEnvironment), 0, deviceBitfield) { } - ADDMETHOD_NOBASE(waitForCompletionWithTimeout, bool, false, (bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait)); + + ADDMETHOD_NOBASE(waitForCompletionWithTimeout, NEO::WaitStatus, NEO::WaitStatus::NotReady, (bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait)); }; HWTEST_F(CommandQueueSynchronizeTest, givenSinglePartitionCountWhenWaitFunctionFailsThenReturnNotReady) { diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index c88dcc3e52..73d6310137 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -942,8 +942,8 @@ class CommandStreamReceiverHwMock : public CommandStreamReceiverHw { : CommandStreamReceiverHw(executionEnvironment, 
rootDeviceIndex, deviceBitfield) {} bool wiatForTaskCountCalled = false; - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override { - return; + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override { + return WaitStatus::Ready; } void waitForTaskCount(uint32_t requiredTaskCount) override { diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 89f93f66f5..490f475abd 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -1886,5 +1886,5 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenWaitForCompletionWithTimeoutI mockCsr.latestSentTaskCount = 1; auto cmdBuffer = std::make_unique(*pDevice); mockCsr.submissionAggregator->recordCommandBuffer(cmdBuffer.release()); - EXPECT_FALSE(mockCsr.waitForCompletionWithTimeout(false, 0, 1)); + EXPECT_EQ(NEO::WaitStatus::NotReady, mockCsr.waitForCompletionWithTimeout(false, 0, 1)); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp index cbf618c14e..1273d0bb70 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp @@ -738,6 +738,6 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenTagValueNotMeetingTaskCountTo CpuIntrinsicsTests::pauseAddress = mockCsr->tagAddress; CpuIntrinsicsTests::pauseValue = taskCountToWait; - bool ret = mockCsr->waitForCompletionWithTimeout(false, 1, taskCountToWait); 
- EXPECT_TRUE(ret); + const auto ret = mockCsr->waitForCompletionWithTimeout(false, 1, taskCountToWait); + EXPECT_EQ(NEO::WaitStatus::Ready, ret); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 99c73bcd50..302e79764d 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -468,7 +468,7 @@ HWTEST_F(UltCommandStreamReceiverTest, givenComputeOverrideDisableWhenComputeSup HWTEST_F(UltCommandStreamReceiverTest, givenSinglePartitionWhenCallingWaitKmdNotifyThenExpectImplicitBusyLoopWaitCalled) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); commandStreamReceiver.callBaseWaitForCompletionWithTimeout = false; - commandStreamReceiver.returnWaitForCompletionWithTimeout = false; + commandStreamReceiver.returnWaitForCompletionWithTimeout = NEO::WaitStatus::NotReady; commandStreamReceiver.waitForTaskCountWithKmdNotifyFallback(0, 0, false, false); EXPECT_EQ(2u, commandStreamReceiver.waitForCompletionWithTimeoutTaskCountCalled); @@ -477,7 +477,7 @@ HWTEST_F(UltCommandStreamReceiverTest, givenSinglePartitionWhenCallingWaitKmdNot HWTEST_F(UltCommandStreamReceiverTest, givenMultiplePartitionsWhenCallingWaitKmdNotifyThenExpectExplicitBusyLoopWaitCalled) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); commandStreamReceiver.callBaseWaitForCompletionWithTimeout = false; - commandStreamReceiver.returnWaitForCompletionWithTimeout = false; + commandStreamReceiver.returnWaitForCompletionWithTimeout = NEO::WaitStatus::NotReady; commandStreamReceiver.waitForTaskCountWithKmdNotifyFallback(0, 0, false, false); EXPECT_EQ(2u, 
commandStreamReceiver.waitForCompletionWithTimeoutTaskCountCalled); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index a70e964595..640a64b958 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -610,13 +610,14 @@ HWTEST_F(BcsTests, whenBlitFromHostPtrCalledThenCallWaitWithKmdFallback) { public: using UltCommandStreamReceiver::UltCommandStreamReceiver; - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, - bool useQuickKmdSleep, bool forcePowerSavingMode) override { + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, + bool useQuickKmdSleep, bool forcePowerSavingMode) override { waitForTaskCountWithKmdNotifyFallbackCalled++; taskCountToWaitPassed = taskCountToWait; flushStampToWaitPassed = flushStampToWait; useQuickKmdSleepPassed = useQuickKmdSleep; forcePowerSavingModePassed = forcePowerSavingMode; + return WaitStatus::Ready; } FlushStamp flushStampToWaitPassed = 0; diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 8fa310fecf..a2b8938144 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -1487,7 +1487,7 @@ struct TestEventCsr : public UltCommandStreamReceiver { TestEventCsr(const ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) : UltCommandStreamReceiver(const_cast(executionEnvironment), 0, deviceBitfield) {} - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { + WaitStatus 
waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { waitForCompletionWithTimeoutCalled++; waitForCompletionWithTimeoutParamsPassed.push_back({enableTimeout, timeoutMs, taskCountToWait}); return waitForCompletionWithTimeoutResult; @@ -1500,7 +1500,7 @@ struct TestEventCsr : public UltCommandStreamReceiver { }; uint32_t waitForCompletionWithTimeoutCalled = 0u; - bool waitForCompletionWithTimeoutResult = true; + WaitStatus waitForCompletionWithTimeoutResult = WaitStatus::Ready; StackVec waitForCompletionWithTimeoutParamsPassed{}; }; diff --git a/opencl/test/unit_test/helpers/kmd_notify_tests.cpp b/opencl/test/unit_test/helpers/kmd_notify_tests.cpp index 98b4765234..f439452f80 100644 --- a/opencl/test/unit_test/helpers/kmd_notify_tests.cpp +++ b/opencl/test/unit_test/helpers/kmd_notify_tests.cpp @@ -88,7 +88,7 @@ struct KmdNotifyTests : public ::testing::Test { bool waitForFlushStampResult = true; StackVec waitForFlushStampParamsPassed{}; - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { + WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { waitForCompletionWithTimeoutCalled++; waitForCompletionWithTimeoutParamsPassed.push_back({enableTimeout, timeoutMs, taskCountToWait}); return waitForCompletionWithTimeoutResult; @@ -101,7 +101,7 @@ struct KmdNotifyTests : public ::testing::Test { }; uint32_t waitForCompletionWithTimeoutCalled = 0u; - bool waitForCompletionWithTimeoutResult = true; + WaitStatus waitForCompletionWithTimeoutResult = WaitStatus::Ready; StackVec waitForCompletionWithTimeoutParamsPassed{}; }; @@ -127,7 +127,6 @@ struct KmdNotifyTests : public ::testing::Test { HWTEST_F(KmdNotifyTests, givenTaskCountWhenWaitUntilCompletionCalledThenAlwaysTryCpuPolling) { auto csr = createMockCsr(); - cmdQ->waitUntilComplete(taskCountToWait, {}, flushStampToWait, false); EXPECT_EQ(1u, 
csr->waitForCompletionWithTimeoutCalled); EXPECT_EQ(true, csr->waitForCompletionWithTimeoutParamsPassed[0].enableTimeout); @@ -138,7 +137,6 @@ HWTEST_F(KmdNotifyTests, givenTaskCountWhenWaitUntilCompletionCalledThenAlwaysTr HWTEST_F(KmdNotifyTests, givenTaskCountAndKmdNotifyDisabledWhenWaitUntilCompletionCalledThenTryCpuPollingWithoutTimeout) { overrideKmdNotifyParams(false, 0, false, 0, false, 0, false, 0); auto csr = createMockCsr(); - cmdQ->waitUntilComplete(taskCountToWait, {}, flushStampToWait, false); EXPECT_EQ(0u, csr->waitForFlushStampCalled); EXPECT_EQ(1u, csr->waitForCompletionWithTimeoutCalled); @@ -152,7 +150,8 @@ HWTEST_F(KmdNotifyTests, givenNotReadyTaskCountWhenWaitUntilCompletionCalledThen *csr->getTagAddress() = taskCountToWait - 1; ::testing::InSequence is; - csr->waitForCompletionWithTimeoutResult = false; + + csr->waitForCompletionWithTimeoutResult = WaitStatus::NotReady; //we have unrecoverable for this case, this will throw. EXPECT_THROW(cmdQ->waitUntilComplete(taskCountToWait, {}, flushStampToWait, false), std::exception); @@ -220,7 +219,7 @@ HWTEST_F(KmdNotifyTests, givenDisabledQuickSleepWhenWaitUntilCompleteWithQuickSl HWTEST_F(KmdNotifyTests, givenNotReadyTaskCountWhenPollForCompletionCalledThenTimeout) { *device->getDefaultEngine().commandStreamReceiver->getTagAddress() = taskCountToWait - 1; auto success = device->getUltCommandStreamReceiver().waitForCompletionWithTimeout(true, 1, taskCountToWait); - EXPECT_FALSE(success); + EXPECT_NE(NEO::WaitStatus::Ready, success); } HWTEST_F(KmdNotifyTests, givenZeroFlushStampWhenWaitIsCalledThenDisableTimeout) { @@ -263,6 +262,7 @@ HWTEST_F(KmdNotifyTests, givenNonQuickSleepRequestWhenItsNotSporadicWaitThenOver HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenPowerSavingModeIsRequestedThenTimeoutIsEnabled) { overrideKmdNotifyParams(false, 3, false, 2, false, 9999999, false, 0); auto csr = createMockCsr(); + csr->waitForTaskCountWithKmdNotifyFallback(taskCountToWait, 1, false, true); EXPECT_EQ(1u, 
csr->waitForCompletionWithTimeoutCalled); EXPECT_EQ(true, csr->waitForCompletionWithTimeoutParamsPassed[0].enableTimeout); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index d4b27f0195..0a9e575d85 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -484,7 +484,8 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { return NEO::SubmissionStatus::SUCCESS; } - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { + return WaitStatus::Ready; } uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) override { return taskCount; }; diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index af7628d7a3..edda8b1e7e 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -35,14 +35,16 @@ struct BcsBufferTests : public ::testing::Test { public: using UltCommandStreamReceiver::UltCommandStreamReceiver; - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, - bool useQuickKmdSleep, bool forcePowerSavingMode) override { + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, + bool useQuickKmdSleep, bool forcePowerSavingMode) override { EXPECT_EQ(this->latestFlushedTaskCount, taskCountToWait); EXPECT_EQ(0u, flushStampToWait); 
EXPECT_FALSE(useQuickKmdSleep); EXPECT_FALSE(forcePowerSavingMode); EXPECT_EQ(1u, this->activePartitions); waitForTaskCountWithKmdNotifyFallbackCalled++; + + return WaitStatus::Ready; } void waitForTaskCountAndCleanTemporaryAllocationList(uint32_t requiredTaskCount) override { diff --git a/opencl/test/unit_test/mem_obj/mem_obj_destruction_tests.cpp b/opencl/test/unit_test/mem_obj/mem_obj_destruction_tests.cpp index 6bcaa10df1..f96091eca5 100644 --- a/opencl/test/unit_test/mem_obj/mem_obj_destruction_tests.cpp +++ b/opencl/test/unit_test/mem_obj/mem_obj_destruction_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,7 +29,7 @@ class MyCsr : public UltCommandStreamReceiver { public: MyCsr(const ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) : UltCommandStreamReceiver(const_cast(executionEnvironment), 0, deviceBitfield) {} - MOCK_METHOD3(waitForCompletionWithTimeout, bool(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait)); + MOCK_METHOD3(waitForCompletionWithTimeout, WaitStatus(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait)); }; void CL_CALLBACK emptyDestructorCallback(cl_mem memObj, void *userData) { @@ -148,13 +148,13 @@ HWTEST_P(MemObjAsyncDestructionTest, givenUsedMemObjWithAsyncDestructionsEnabled *mockCsr0->getTagAddress() = 0; *mockCsr1->getTagAddress() = 0; - auto waitForCompletionWithTimeoutMock0 = [&mockCsr0](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { + auto waitForCompletionWithTimeoutMock0 = [&mockCsr0](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> NEO::WaitStatus { *mockCsr0->getTagAddress() = taskCountReady; - return true; + return NEO::WaitStatus::Ready; }; - auto waitForCompletionWithTimeoutMock1 = [&mockCsr1](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { + auto waitForCompletionWithTimeoutMock1 
= [&mockCsr1](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> NEO::WaitStatus { *mockCsr1->getTagAddress() = taskCountReady; - return true; + return NEO::WaitStatus::Ready; }; auto osContextId0 = mockCsr0->getOsContext().getContextId(); auto osContextId1 = mockCsr1->getOsContext().getContextId(); @@ -198,9 +198,9 @@ HWTEST_P(MemObjAsyncDestructionTest, givenUsedMemObjWithAsyncDestructionsEnabled *mockCsr->getTagAddress() = 0; auto osContextId = mockCsr->getOsContext().getContextId(); - bool desired = true; + auto desired = NEO::WaitStatus::Ready; - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { return desired; }; + auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return desired; }; ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) .WillByDefault(::testing::Invoke(waitForCompletionWithTimeoutMock)); @@ -240,9 +240,9 @@ HWTEST_P(MemObjAsyncDestructionTest, givenUsedMemObjWithAsyncDestructionsEnabled device->resetCommandStreamReceiver(mockCsr); *mockCsr->getTagAddress() = 0; - bool desired = true; + auto desired = NEO::WaitStatus::Ready; - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { return desired; }; + auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return desired; }; auto osContextId = mockCsr->getOsContext().getContextId(); ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) @@ -275,9 +275,9 @@ HWTEST_P(MemObjSyncDestructionTest, givenMemObjWithDestructableAllocationWhenAsy device->resetCommandStreamReceiver(mockCsr); *mockCsr->getTagAddress() = 0; - bool desired = true; + auto desired = NEO::WaitStatus::Ready; - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, 
uint32_t taskCountToWait) -> bool { return desired; }; + auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return desired; }; auto osContextId = mockCsr->getOsContext().getContextId(); ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) @@ -302,9 +302,9 @@ HWTEST_P(MemObjSyncDestructionTest, givenMemObjWithDestructableAllocationWhenAsy device->resetCommandStreamReceiver(mockCsr); *mockCsr->getTagAddress() = 0; - bool desired = true; + auto desired = NEO::WaitStatus::Ready; - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { return desired; }; + auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return desired; }; ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) .WillByDefault(::testing::Invoke(waitForCompletionWithTimeoutMock)); @@ -335,7 +335,7 @@ HWTEST_P(MemObjSyncDestructionTest, givenMemObjWithMapAllocationWhenAsyncDestruc memObj->getMapAllocation(device->getRootDeviceIndex())->updateTaskCount(taskCountReady, contextId); } - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { return true; }; + auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return NEO::WaitStatus::Ready; }; auto osContextId = mockCsr->getOsContext().getContextId(); ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) @@ -498,7 +498,7 @@ HWTEST_F(UsmDestructionTests, givenSharedUsmAllocationWhenBlockingFreeIsCalledTh auto svmEntry = svmAllocationsManager->getSVMAlloc(sharedMemory); - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { return true; }; + auto waitForCompletionWithTimeoutMock = [=](bool 
enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return NEO::WaitStatus::Ready; }; ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) .WillByDefault(::testing::Invoke(waitForCompletionWithTimeoutMock)); svmEntry->gpuAllocations.getGraphicsAllocation(mockDevice.getRootDeviceIndex())->updateTaskCount(6u, 0u); @@ -531,7 +531,7 @@ HWTEST_F(UsmDestructionTests, givenUsmAllocationWhenBlockingFreeIsCalledThenWait auto svmEntry = svmAllocationsManager->getSVMAlloc(hostMemory); - auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) -> bool { return true; }; + auto waitForCompletionWithTimeoutMock = [=](bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) { return NEO::WaitStatus::Ready; }; ON_CALL(*mockCsr, waitForCompletionWithTimeout(::testing::_, ::testing::_, ::testing::_)) .WillByDefault(::testing::Invoke(waitForCompletionWithTimeoutMock)); svmEntry->gpuAllocations.getGraphicsAllocation(mockDevice.getRootDeviceIndex())->updateTaskCount(6u, 0u); diff --git a/opencl/test/unit_test/os_interface/linux/drm_tests.cpp b/opencl/test/unit_test/os_interface/linux/drm_tests.cpp index dbff9c9bef..3349246f2c 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_tests.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_tests.cpp @@ -15,6 +15,8 @@ #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" #include "shared/test/common/libult/linux/drm_mock.h" +#include "shared/test/common/mocks/linux/mock_os_context_linux.h" +#include "shared/test/common/mocks/mock_memory_manager.h" #include "opencl/test/unit_test/mocks/mock_platform.h" @@ -965,3 +967,153 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec DrmMock drmDisabled{*executionEnvironment->rootDeviceEnvironments[0]}; EXPECT_FALSE(drmDisabled.completionFenceSupport()); } + +TEST(DrmTest, 
GivenInvalidContextIdWhenIsGpuHangIsCalledThenErrorIsThrown) { + ExecutionEnvironment executionEnvironment{}; + executionEnvironment.prepareRootDeviceEnvironments(1); + + DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]}; + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})}; + + CommandStreamReceiver *csr{nullptr}; + MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor}; + EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}}; + + auto memoryManager = std::make_unique(); + auto memoryManagerRaw = memoryManager.get(); + + memoryManagerRaw->registeredEngines = std::move(engines); + executionEnvironment.memoryManager = std::move(memoryManager); + + const auto invalidContextId = 1; + EXPECT_THROW(drm.isGpuHangDetected(invalidContextId), std::runtime_error); + + memoryManagerRaw->registeredEngines.clear(); +} + +TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) { + ExecutionEnvironment executionEnvironment{}; + executionEnvironment.prepareRootDeviceEnvironments(1); + + DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]}; + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})}; + + CommandStreamReceiver *csr{nullptr}; + MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor}; + EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}}; + + auto memoryManager = std::make_unique(); + auto memoryManagerRaw = memoryManager.get(); + + memoryManagerRaw->registeredEngines = std::move(engines); + executionEnvironment.memoryManager = std::move(memoryManager); + + mockOsContextLinux.drmContextIds.push_back(0); + mockOsContextLinux.drmContextIds.push_back(3); + + EXPECT_THROW(drm.isGpuHangDetected(0), std::runtime_error); + + memoryManagerRaw->registeredEngines.clear(); +} + 
+TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) { + ExecutionEnvironment executionEnvironment{}; + executionEnvironment.prepareRootDeviceEnvironments(1); + + DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]}; + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})}; + + CommandStreamReceiver *csr{nullptr}; + MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor}; + EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}}; + + auto memoryManager = std::make_unique(); + auto memoryManagerRaw = memoryManager.get(); + + memoryManagerRaw->registeredEngines = std::move(engines); + executionEnvironment.memoryManager = std::move(memoryManager); + + drm_i915_reset_stats resetStats{}; + resetStats.ctx_id = 0; + mockOsContextLinux.drmContextIds.push_back(0); + drm.resetStatsToReturn.push_back(resetStats); + + resetStats.ctx_id = 3; + mockOsContextLinux.drmContextIds.push_back(3); + drm.resetStatsToReturn.push_back(resetStats); + + bool isGpuHangDetected{}; + EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0)); + EXPECT_FALSE(isGpuHangDetected); + + memoryManagerRaw->registeredEngines.clear(); +} + +TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) { + ExecutionEnvironment executionEnvironment{}; + executionEnvironment.prepareRootDeviceEnvironments(1); + + DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]}; + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})}; + + CommandStreamReceiver *csr{nullptr}; + MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor}; + EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}}; + + auto memoryManager = std::make_unique(); + auto memoryManagerRaw 
= memoryManager.get(); + + memoryManagerRaw->registeredEngines = std::move(engines); + executionEnvironment.memoryManager = std::move(memoryManager); + + drm_i915_reset_stats resetStats{}; + resetStats.ctx_id = 0; + mockOsContextLinux.drmContextIds.push_back(0); + drm.resetStatsToReturn.push_back(resetStats); + + resetStats.ctx_id = 3; + resetStats.batch_active = 2; + mockOsContextLinux.drmContextIds.push_back(3); + drm.resetStatsToReturn.push_back(resetStats); + + bool isGpuHangDetected{}; + EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0)); + EXPECT_TRUE(isGpuHangDetected); + + memoryManagerRaw->registeredEngines.clear(); +} + +TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) { + ExecutionEnvironment executionEnvironment{}; + executionEnvironment.prepareRootDeviceEnvironments(1); + + DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]}; + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})}; + + CommandStreamReceiver *csr{nullptr}; + MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor}; + EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}}; + + auto memoryManager = std::make_unique(); + auto memoryManagerRaw = memoryManager.get(); + + memoryManagerRaw->registeredEngines = std::move(engines); + executionEnvironment.memoryManager = std::move(memoryManager); + + drm_i915_reset_stats resetStats{}; + resetStats.ctx_id = 8; + resetStats.batch_pending = 7; + mockOsContextLinux.drmContextIds.push_back(8); + drm.resetStatsToReturn.push_back(resetStats); + + bool isGpuHangDetected{}; + EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0)); + EXPECT_TRUE(isGpuHangDetected); + + memoryManagerRaw->registeredEngines.clear(); +} \ No newline at end of file diff --git a/shared/source/command_stream/aub_command_stream_receiver_hw.h 
b/shared/source/command_stream/aub_command_stream_receiver_hw.h index 36d4590fc7..c68d93a7a8 100644 --- a/shared/source/command_stream/aub_command_stream_receiver_hw.h +++ b/shared/source/command_stream/aub_command_stream_receiver_hw.h @@ -63,7 +63,7 @@ class AUBCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw::pollForCompletionImpl() { } template -inline void AUBCommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { - CommandStreamReceiverSimulatedHw::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); +inline WaitStatus AUBCommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { + const auto result = CommandStreamReceiverSimulatedHw::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); pollForCompletion(); + + return result; } template diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 4321e7815e..c02c87b00a 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -227,6 +227,10 @@ bool CommandStreamReceiver::skipResourceCleanup() const { return this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->skipResourceCleanup(); } +bool CommandStreamReceiver::isGpuHangDetected() const { + return this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(osContext->getContextId()); +} + void CommandStreamReceiver::cleanupResources() { if (this->skipResourceCleanup()) { return; @@ -286,19 +290,21 @@ void 
CommandStreamReceiver::cleanupResources() { } } -bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) { +WaitStatus CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) { uint32_t latestSentTaskCount = this->latestFlushedTaskCount; if (latestSentTaskCount < taskCountToWait) { if (!this->flushBatchedSubmissions()) { - return false; + const auto isGpuHang{isGpuHangDetected()}; + return isGpuHang ? WaitStatus::GpuHang : WaitStatus::NotReady; } } return baseWaitFunction(getTagAddress(), enableTimeout, timeoutMicroseconds, taskCountToWait); } -bool CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) { - std::chrono::high_resolution_clock::time_point time1, time2; +WaitStatus CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) { + std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0}; + std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime; int64_t timeDiff = 0; uint32_t latestSentTaskCount = this->latestFlushedTaskCount; @@ -308,23 +314,33 @@ bool CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, boo volatile uint32_t *partitionAddress = pollAddress; - time1 = std::chrono::high_resolution_clock::now(); + waitStartTime = std::chrono::high_resolution_clock::now(); + lastHangCheckTime = waitStartTime; for (uint32_t i = 0; i < activePartitions; i++) { while (*partitionAddress < taskCountToWait && timeDiff <= timeoutMicroseconds) { if (WaitUtils::waitFunction(partitionAddress, taskCountToWait)) { break; } + currentTime = std::chrono::high_resolution_clock::now(); + elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); + + if 
(elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) { + lastHangCheckTime = currentTime; + if (isGpuHangDetected()) { + return WaitStatus::GpuHang; + } + } + if (enableTimeout) { - time2 = std::chrono::high_resolution_clock::now(); - timeDiff = std::chrono::duration_cast(time2 - time1).count(); + timeDiff = std::chrono::duration_cast(currentTime - waitStartTime).count(); } } partitionAddress = ptrOffset(partitionAddress, this->postSyncWriteOffset); } - return testTaskCountReady(pollAddress, taskCountToWait); + return testTaskCountReady(pollAddress, taskCountToWait) ? WaitStatus::Ready : WaitStatus::NotReady; } void CommandStreamReceiver::setTagAllocation(GraphicsAllocation *allocation) { diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index ac8d361ba5..07abed92e3 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -26,6 +26,7 @@ #include "shared/source/os_interface/os_thread.h" #include "shared/source/utilities/spinlock.h" +#include #include #include @@ -63,6 +64,12 @@ enum class DispatchMode { BatchedDispatch // dispatching is batched, explicit clFlush is required }; +enum class WaitStatus { + NotReady = 0, + Ready = 1, + GpuHang = 2, +}; + class CommandStreamReceiver { public: enum class SamplerCacheFlushState { @@ -158,9 +165,9 @@ class CommandStreamReceiver { void requestStallingCommandsOnNextFlush() { stallingCommandsOnNextFlushRequired = true; } bool isStallingCommandsOnNextFlushRequired() const { return stallingCommandsOnNextFlushRequired; } - virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0; - virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); - bool baseWaitFunction(volatile uint32_t *pollAddress, bool 
enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); + virtual WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0; + virtual WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); + WaitStatus baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait); virtual void downloadAllocations(){}; @@ -316,6 +323,7 @@ class CommandStreamReceiver { void printDeviceIndex(); void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation); bool checkImplicitFlushForGpuIdle(); + bool isGpuHangDetected() const; MOCKABLE_VIRTUAL std::unique_lock obtainHostPtrSurfaceCreationLock(); std::unique_ptr flushStamp; @@ -373,6 +381,7 @@ class CommandStreamReceiver { SamplerCacheFlushState samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired; PreemptionMode lastPreemptionMode = PreemptionMode::Initial; + std::chrono::microseconds gpuHangCheckPeriod{500'000}; uint32_t lastSentL3Config = 0; uint32_t latestSentStatelessMocsConfig = 0; uint32_t lastSentNumGrfRequired = GrfConfig::DefaultGrfNumber; diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index fe1fe6f871..65fab3d32f 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -77,7 +77,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { bool isPipelineSelectAlreadyProgrammed() const; void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags, const HardwareInfo &hwInfo); - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, 
FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override; + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override; void collectStateBaseAddresPatchInfo( uint64_t commandBufferAddress, diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 67f8c1b19b..5b8e8640f9 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -859,7 +859,7 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForPipelineSelect() } template -inline void CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { +inline WaitStatus CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { int64_t waitTimeout = 0; bool enableTimeout = false; @@ -870,12 +870,18 @@ inline void CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFal "\nWaiting for task count %u at location %p. Current value: %u\n", taskCountToWait, getTagAddress(), *getTagAddress()); - bool status = waitForCompletionWithTimeout(enableTimeout, waitTimeout, taskCountToWait); - if (!status) { + auto status = waitForCompletionWithTimeout(enableTimeout, waitTimeout, taskCountToWait); + if (status == WaitStatus::NotReady) { waitForFlushStamp(flushStampToWait); //now call blocking wait, this is to ensure that task count is reached status = waitForCompletionWithTimeout(false, 0, taskCountToWait); } + + // If GPU hang occurred, then propagate it to the caller. 
+ if (status == WaitStatus::GpuHang) { + return status; + } + UNRECOVERABLE_IF(*getTagAddress() < taskCountToWait); if (kmdNotifyHelper->quickKmdSleepForSporadicWaitsEnabled()) { @@ -884,6 +890,8 @@ inline void CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFal PRINT_DEBUG_STRING(DebugManager.flags.LogWaitingForCompletion.get(), stdout, "\nWaiting completed. Current value: %u\n", *getTagAddress()); + + return WaitStatus::Ready; } template diff --git a/shared/source/command_stream/command_stream_receiver_with_aub_dump.h b/shared/source/command_stream/command_stream_receiver_with_aub_dump.h index 69cf368e7c..2c08bc5dc5 100644 --- a/shared/source/command_stream/command_stream_receiver_with_aub_dump.h +++ b/shared/source/command_stream/command_stream_receiver_with_aub_dump.h @@ -39,8 +39,8 @@ class CommandStreamReceiverWithAUBDump : public BaseCSR { return CommandStreamReceiverType::CSR_HW_WITH_AUB; } - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, - bool useQuickKmdSleep, bool forcePowerSavingMode) override; + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, + bool useQuickKmdSleep, bool forcePowerSavingMode) override; size_t getPreferredTagPoolSize() const override { return 1; } diff --git a/shared/source/command_stream/command_stream_receiver_with_aub_dump.inl b/shared/source/command_stream/command_stream_receiver_with_aub_dump.inl index e717ad5fac..f5b28e3c31 100644 --- a/shared/source/command_stream/command_stream_receiver_with_aub_dump.inl +++ b/shared/source/command_stream/command_stream_receiver_with_aub_dump.inl @@ -78,13 +78,13 @@ void CommandStreamReceiverWithAUBDump::setupContext(OsContext &osContex } template -void CommandStreamReceiverWithAUBDump::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, - bool useQuickKmdSleep, bool forcePowerSavingMode) { +WaitStatus 
CommandStreamReceiverWithAUBDump::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, + bool useQuickKmdSleep, bool forcePowerSavingMode) { if (aubCSR) { aubCSR->waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); } - BaseCSR::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); + return BaseCSR::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); } template diff --git a/shared/source/command_stream/tbx_command_stream_receiver_hw.h b/shared/source/command_stream/tbx_command_stream_receiver_hw.h index 72a84dea55..89a0c86ad5 100644 --- a/shared/source/command_stream/tbx_command_stream_receiver_hw.h +++ b/shared/source/command_stream/tbx_command_stream_receiver_hw.h @@ -42,8 +42,8 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw::flushSubmissionsAndDownloadAllocatio } template -void TbxCommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { +WaitStatus TbxCommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { flushSubmissionsAndDownloadAllocations(taskCountToWait); - BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); + return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, forcePowerSavingMode); } template -bool TbxCommandStreamReceiverHw::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) { +WaitStatus TbxCommandStreamReceiverHw::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t 
taskCountToWait) { flushSubmissionsAndDownloadAllocations(taskCountToWait); return BaseClass::waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); } diff --git a/shared/source/os_interface/linux/drm_neo.cpp b/shared/source/os_interface/linux/drm_neo.cpp index 6804fa1c69..46afc5f02a 100644 --- a/shared/source/os_interface/linux/drm_neo.cpp +++ b/shared/source/os_interface/linux/drm_neo.cpp @@ -317,6 +317,28 @@ int Drm::queryGttSize(uint64_t >tSizeOutput) { return ret; } +bool Drm::isGpuHangDetected(uint32_t contextId) { + const auto &engines = this->rootDeviceEnvironment.executionEnvironment.memoryManager->getRegisteredEngines(); + UNRECOVERABLE_IF(engines.size() <= contextId); + + const auto osContextLinux = static_cast(engines[contextId].osContext); + const auto &drmContextIds = osContextLinux->getDrmContextIds(); + + for (const auto drmContextId : drmContextIds) { + drm_i915_reset_stats reset_stats{}; + reset_stats.ctx_id = drmContextId; + + const auto retVal{ioctl(DRM_IOCTL_I915_GET_RESET_STATS, &reset_stats)}; + UNRECOVERABLE_IF(retVal != 0); + + if (reset_stats.batch_active > 0 || reset_stats.batch_pending > 0) { + return true; + } + } + + return false; +} + void Drm::checkPreemptionSupport() { int value = 0; auto ret = getParamIoctl(I915_PARAM_HAS_SCHEDULER, &value); diff --git a/shared/source/os_interface/linux/drm_neo.h b/shared/source/os_interface/linux/drm_neo.h index 9ebfd73bdd..05b610e198 100644 --- a/shared/source/os_interface/linux/drm_neo.h +++ b/shared/source/os_interface/linux/drm_neo.h @@ -148,6 +148,7 @@ class Drm : public DriverModel { MOCKABLE_VIRTUAL void getPrelimVersion(std::string &prelimVersion); PhysicalDevicePciBusInfo getPciBusInfo() const override; + bool isGpuHangDetected(uint32_t contextId) override; bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; } void checkNonPersistentContextsSupport(); diff --git a/shared/source/os_interface/os_interface.h 
b/shared/source/os_interface/os_interface.h index c0f85fc29d..b890efb263 100644 --- a/shared/source/os_interface/os_interface.h +++ b/shared/source/os_interface/os_interface.h @@ -85,6 +85,10 @@ class DriverModel : public NonCopyableClass { return false; } + virtual bool isGpuHangDetected(uint32_t contextId) { + return false; + } + protected: DriverModelType driverModelType; }; diff --git a/shared/source/os_interface/windows/os_interface_win.cpp b/shared/source/os_interface/windows/os_interface_win.cpp index 14f022a8c6..b4e0c99b28 100644 --- a/shared/source/os_interface/windows/os_interface_win.cpp +++ b/shared/source/os_interface/windows/os_interface_win.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,4 +17,5 @@ bool OSInterface::requiresSupportForWddmTrimNotification = true; bool OSInterface::isDebugAttachAvailable() const { return false; } + } // namespace NEO diff --git a/shared/test/common/libult/linux/drm_mock.cpp b/shared/test/common/libult/linux/drm_mock.cpp index f077530314..2bc4e625ef 100644 --- a/shared/test/common/libult/linux/drm_mock.cpp +++ b/shared/test/common/libult/linux/drm_mock.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -204,6 +204,18 @@ int DrmMock::ioctl(unsigned long request, void *arg) { if (request == DRM_IOCTL_GEM_CLOSE) { return 0; } + if (request == DRM_IOCTL_I915_GET_RESET_STATS && arg != nullptr) { + auto outResetStats = static_cast(arg); + for (const auto &resetStats : resetStatsToReturn) { + if (resetStats.ctx_id == outResetStats->ctx_id) { + *outResetStats = resetStats; + return 0; + } + } + + return -1; + } + if (request == DRM_IOCTL_I915_QUERY && arg != nullptr) { auto queryArg = static_cast(arg); auto queryItemArg = reinterpret_cast(queryArg->items_ptr); diff --git a/shared/test/common/libult/linux/drm_mock.h 
b/shared/test/common/libult/linux/drm_mock.h index 2e13fbfbb8..4c03a2bff0 100644 --- a/shared/test/common/libult/linux/drm_mock.h +++ b/shared/test/common/libult/linux/drm_mock.h @@ -17,6 +17,7 @@ #include #include #include +#include using namespace NEO; @@ -145,11 +146,11 @@ class DrmMock : public Drm { int storedExecSoftPin = 0; int storedRetValForVmId = 1; int storedCsTimestampFrequency = 1000; - bool disableSomeTopology = false; bool allowDebugAttach = false; bool allowDebugAttachCallBase = false; uint32_t passedContextDebugId = std::numeric_limits::max(); + std::vector resetStatsToReturn{}; drm_i915_gem_context_create_ext_setparam receivedContextCreateSetParam = {}; uint32_t receivedContextCreateFlags = 0; diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index cadeb96748..6a053c19d2 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -18,6 +18,7 @@ #include #include +#include namespace NEO { @@ -77,6 +78,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::experimentalCmdBuffer; using BaseClass::CommandStreamReceiver::flushStamp; using BaseClass::CommandStreamReceiver::globalFenceAllocation; + using BaseClass::CommandStreamReceiver::gpuHangCheckPeriod; using BaseClass::CommandStreamReceiver::GSBAFor32BitProgrammed; using BaseClass::CommandStreamReceiver::initDirectSubmission; using BaseClass::CommandStreamReceiver::internalAllocationStorage; @@ -122,7 +124,8 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) : BaseClass(executionEnvironment, rootDeviceIndex, deviceBitfield), recursiveLockCounter(0), - recordedDispatchFlags(DispatchFlagsHelper::createDefaultDispatchFlags()) {} + recordedDispatchFlags(DispatchFlagsHelper::createDefaultDispatchFlags()) { + } 
static CommandStreamReceiver *create(bool withAubDump, ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, @@ -169,7 +172,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ downloadAllocationCalled = true; } - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override { + WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override { latestWaitForCompletionWithTimeoutTaskCount.store(taskCountToWait); waitForCompletionWithTimeoutTaskCountCalled++; if (callBaseWaitForCompletionWithTimeout) { @@ -222,6 +225,11 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ } bool flushBatchedSubmissions() override { flushBatchedSubmissionsCalled = true; + + if (shouldFailFlushBatchedSubmissions) { + return false; + } + return CommandStreamReceiverHw::flushBatchedSubmissions(); } void initProgrammingFlags() override { @@ -328,6 +336,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ bool blitterDirectSubmissionAvailable = false; bool callBaseIsMultiOsContextCapable = false; bool callBaseWaitForCompletionWithTimeout = true; - bool returnWaitForCompletionWithTimeout = true; + bool shouldFailFlushBatchedSubmissions = false; + WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::Ready; }; } // namespace NEO diff --git a/shared/test/common/mocks/CMakeLists.txt b/shared/test/common/mocks/CMakeLists.txt index ef915c8295..e2889439f2 100644 --- a/shared/test/common/mocks/CMakeLists.txt +++ b/shared/test/common/mocks/CMakeLists.txt @@ -45,6 +45,7 @@ set(NEO_CORE_tests_mocks ${CMAKE_CURRENT_SOURCE_DIR}/mock_direct_submission_diagnostic_collector.h ${CMAKE_CURRENT_SOURCE_DIR}/mock_direct_submission_hw.h ${CMAKE_CURRENT_SOURCE_DIR}/mock_dispatch_kernel_encoder_interface.h + ${CMAKE_CURRENT_SOURCE_DIR}/mock_driver_model.h ${CMAKE_CURRENT_SOURCE_DIR}/mock_elf.h 
${CMAKE_CURRENT_SOURCE_DIR}/mock_execution_environment.h ${CMAKE_CURRENT_SOURCE_DIR}/mock_experimental_command_buffer.h @@ -105,6 +106,7 @@ else() ${CMAKE_CURRENT_SOURCE_DIR}/linux/mock_drm_allocation.h ${CMAKE_CURRENT_SOURCE_DIR}/linux/mock_drm_command_stream_receiver.h ${CMAKE_CURRENT_SOURCE_DIR}/linux/mock_drm_memory_manager.h + ${CMAKE_CURRENT_SOURCE_DIR}/linux/mock_os_context_linux.h ) endif() diff --git a/shared/test/common/mocks/linux/mock_os_context_linux.h b/shared/test/common/mocks/linux/mock_os_context_linux.h new file mode 100644 index 0000000000..1815bfa79f --- /dev/null +++ b/shared/test/common/mocks/linux/mock_os_context_linux.h @@ -0,0 +1,16 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/os_interface/linux/os_context_linux.h" + +class MockOsContextLinux : public NEO::OsContextLinux { + public: + using NEO::OsContextLinux::drmContextIds; + using NEO::OsContextLinux::OsContextLinux; +}; \ No newline at end of file diff --git a/shared/test/common/mocks/mock_aub_csr.h b/shared/test/common/mocks/mock_aub_csr.h index c7a51251d5..4b7e43a103 100644 --- a/shared/test/common/mocks/mock_aub_csr.h +++ b/shared/test/common/mocks/mock_aub_csr.h @@ -121,8 +121,8 @@ struct MockAubCsr : public AUBCommandStreamReceiverHw { expectMemoryCompressedCalled = true; return AUBCommandStreamReceiverHw::expectMemoryCompressed(gfxAddress, srcAddress, length); } - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override { - return true; + WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override { + return NEO::WaitStatus::Ready; } void addAubComment(const char *message) override { AUBCommandStreamReceiverHw::addAubComment(message); diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 
c78cbdf418..10c6838c80 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -50,9 +50,9 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { memset(const_cast(CommandStreamReceiver::tagAddress), 0xFFFFFFFF, tagSize * sizeof(uint32_t)); } - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override { + WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override { waitForCompletionWithTimeoutCalled++; - return true; + return NEO::WaitStatus::Ready; } SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override; @@ -86,7 +86,8 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { return true; } - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { + WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { + return WaitStatus::Ready; } uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) override { return taskCount; }; @@ -197,7 +198,8 @@ class MockCsrHw2 : public CommandStreamReceiverHw { using CommandStreamReceiver::useNewResourceImplicitFlush; MockCsrHw2(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) - : CommandStreamReceiverHw::CommandStreamReceiverHw(executionEnvironment, rootDeviceIndex, deviceBitfield) {} + : CommandStreamReceiverHw::CommandStreamReceiverHw(executionEnvironment, rootDeviceIndex, deviceBitfield) { + } SubmissionAggregator *peekSubmissionAggregator() { return this->submissionAggregator.get(); diff --git 
a/shared/test/common/mocks/mock_driver_model.h b/shared/test/common/mocks/mock_driver_model.h new file mode 100644 index 0000000000..c49333b64e --- /dev/null +++ b/shared/test/common/mocks/mock_driver_model.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/os_interface/driver_info.h" +#include "shared/source/os_interface/os_interface.h" + +#include +#include + +class MockDriverModel : public NEO::DriverModel { + public: + MockDriverModel() : NEO::DriverModel(NEO::DriverModelType::UNKNOWN) {} + + void setGmmInputArgs(void *args) override {} + + uint32_t getDeviceHandle() const override { return {}; } + + NEO::PhysicalDevicePciBusInfo getPciBusInfo() const override { return pciBusInfo; } + + size_t getMaxMemAllocSize() const override { + return 0; + } + + bool isGpuHangDetected(uint32_t contextId) override { + if (isGpuHangDetectedSideEffect) { + std::invoke(isGpuHangDetectedSideEffect); + } + + return isGpuHangDetectedToReturn; + } + + NEO::PhysicalDevicePciBusInfo pciBusInfo{}; + bool isGpuHangDetectedToReturn{}; + std::function isGpuHangDetectedSideEffect{}; +}; \ No newline at end of file diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index d2d8b055df..1613e3c3cb 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -13,6 +13,7 @@ #include "shared/source/memory_manager/surface.h" #include "shared/source/os_interface/device_factory.h" #include "shared/source/os_interface/hw_info_config.h" +#include "shared/source/os_interface/os_interface.h" #include "shared/source/utilities/tag_allocator.h" #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" @@ -20,6 +21,7 @@ #include 
"shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/mocks/mock_allocation_properties.h" #include "shared/test/common/mocks/mock_csr.h" +#include "shared/test/common/mocks/mock_driver_model.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_memory_manager.h" #include "shared/test/common/mocks/ult_device_factory.h" @@ -30,10 +32,15 @@ #include "gmock/gmock.h" +#include +#include +#include + namespace NEO { extern ApiSpecificConfig::ApiType apiTypeForUlts; } // namespace NEO using namespace NEO; +using namespace std::chrono_literals; struct CommandStreamReceiverTest : public DeviceFixture, public ::testing::Test { @@ -165,6 +172,99 @@ HWTEST_F(CommandStreamReceiverTest, whenStoreAllocationThenStoredAllocationHasTa EXPECT_EQ(csr.peekTaskCount(), allocation->getTaskCount(csr.getOsContext().getContextId())); } +HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) { + auto driverModelMock = std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = true; + + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + csr.callBaseWaitForCompletionWithTimeout = true; + csr.activePartitions = 1; + csr.gpuHangCheckPeriod = 0us; + + volatile std::uint32_t tasksCount[16] = {}; + csr.tagAddress = tasksCount; + + constexpr auto enableTimeout = false; + constexpr auto timeoutMicroseconds = std::numeric_limits::max(); + constexpr auto taskCountToWait = 1; + + const auto waitStatus = csr.waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); + EXPECT_EQ(WaitStatus::GpuHang, waitStatus); +} + +HWTEST_F(CommandStreamReceiverTest, givenNoGpuHangWhenWaititingForCompletionWithTimeoutThenReadyIsReturned) { + auto 
driverModelMock = std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = false; + + volatile std::uint32_t tasksCount[16] = {}; + driverModelMock->isGpuHangDetectedSideEffect = [&tasksCount] { + tasksCount[0]++; + }; + + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + csr.callBaseWaitForCompletionWithTimeout = true; + csr.tagAddress = tasksCount; + csr.activePartitions = 1; + csr.gpuHangCheckPeriod = 0us; + + constexpr auto enableTimeout = false; + constexpr auto timeoutMicroseconds = std::numeric_limits::max(); + constexpr auto taskCountToWait = 1; + + const auto waitStatus = csr.waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); + EXPECT_EQ(WaitStatus::Ready, waitStatus); +} + +HWTEST_F(CommandStreamReceiverTest, givenFailingFlushSubmissionsAndGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) { + auto driverModelMock = std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = true; + + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.latestFlushedTaskCount = 0; + csr.shouldFailFlushBatchedSubmissions = true; + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + csr.callBaseWaitForCompletionWithTimeout = true; + + constexpr auto enableTimeout = false; + constexpr auto timeoutMicroseconds = std::numeric_limits::max(); + constexpr auto taskCountToWait = 1; + + const auto waitStatus = csr.waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); + EXPECT_EQ(WaitStatus::GpuHang, waitStatus); +} + +HWTEST_F(CommandStreamReceiverTest, 
givenFailingFlushSubmissionsAndNoGpuHangWhenWaititingForCompletionWithTimeoutThenNotReadyIsReturned) { + auto driverModelMock = std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = false; + + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.latestFlushedTaskCount = 0; + csr.shouldFailFlushBatchedSubmissions = true; + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + csr.callBaseWaitForCompletionWithTimeout = true; + + constexpr auto enableTimeout = false; + constexpr auto timeoutMicroseconds = std::numeric_limits::max(); + constexpr auto taskCountToWait = 1; + + const auto waitStatus = csr.waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); + EXPECT_EQ(WaitStatus::NotReady, waitStatus); +} + HWTEST_F(CommandStreamReceiverTest, givenCommandStreamReceiverWhenCheckedForInitialStatusOfStatelessMocsIndexThenUnknownMocsIsReturend) { auto &csr = pDevice->getUltCommandStreamReceiver(); EXPECT_EQ(CacheSettings::unknownMocs, csr.latestSentStatelessMocsConfig); diff --git a/shared/test/unit_test/os_interface/device_uuid_tests.cpp b/shared/test/unit_test/os_interface/device_uuid_tests.cpp index d0dc6eef4b..e479f3ae69 100644 --- a/shared/test/unit_test/os_interface/device_uuid_tests.cpp +++ b/shared/test/unit_test/os_interface/device_uuid_tests.cpp @@ -9,6 +9,7 @@ #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/ult_hw_config.h" #include "shared/test/common/mocks/mock_device.h" +#include "shared/test/common/mocks/mock_driver_model.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_memory_manager.h" #include "shared/test/common/mocks/ult_device_factory.h" @@ -28,17 +29,6 @@ class MockMemoryManagerOsAgnosticContext : public MockMemoryManager { } }; -struct 
MockDriverModel : NEO::DriverModel { - PhysicalDevicePciBusInfo pciBusInfo{}; - MockDriverModel() : NEO::DriverModel(NEO::DriverModelType::UNKNOWN) {} - void setGmmInputArgs(void *args) override {} - uint32_t getDeviceHandle() const override { return {}; } - PhysicalDevicePciBusInfo getPciBusInfo() const override { return pciBusInfo; } - size_t getMaxMemAllocSize() const override { - return 0; - } -}; - template class MockHwInfoConfigHw : public HwInfoConfigHw { public: