Detect GPU hang in remaining blocking calls of L0

This change introduces detection of GPU hangs in zeEventHostSynchronize and zeFenceHostSynchronize. Furthermore, if CommandQueueHw::executeCommandLists runs in ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS and a GPU hang occurs during the synchronous wait, ZE_RESULT_ERROR_DEVICE_LOST is propagated to the caller.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
2025-09-15 13:01:45 +08:00 · 2022-02-16 10:22:03 +00:00
parent 64b8de3c1d
commit 1275c4e200
11 changed files with 199 additions and 14 deletions
--- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl
+++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl
@ -462,7 +462,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
    csr->makeSurfacePackNonResident(csr->getResidencyAllocations());

    if (getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) {
-        this->synchronize(std::numeric_limits<uint64_t>::max());
+        const auto synchronizeResult = this->synchronize(std::numeric_limits<uint64_t>::max());
+        if (synchronizeResult == ZE_RESULT_ERROR_DEVICE_LOST) {
+            return ZE_RESULT_ERROR_DEVICE_LOST;
+        }
    }

    this->heapContainer.clear();
--- a/level_zero/core/source/event/event.h
+++ b/level_zero/core/source/event/event.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -15,6 +15,7 @@
 #include "level_zero/core/source/driver/driver_handle.h"
 #include <level_zero/ze_api.h>

+#include <chrono>
 #include <limits>

 struct _ze_event_handle_t {};
@ -102,6 +103,7 @@ struct Event : _ze_event_handle_t {
    ze_event_scope_flags_t waitScope = 0u;

    uint32_t kernelCount = 1u;
+    std::chrono::microseconds gpuHangCheckPeriod{500'000};

  protected:
    size_t contextStartOffset = 0u;
--- a/level_zero/core/source/event/event_impl.inl
+++ b/level_zero/core/source/event/event_impl.inl
@ -199,7 +199,8 @@ ze_result_t EventImp<TagSizeT>::hostSignal() {

 template <typename TagSizeT>
 ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
-    std::chrono::high_resolution_clock::time_point time1, time2;
+    std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
+    std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
    uint64_t timeDiff = 0;

    ze_result_t ret = ZE_RESULT_NOT_READY;
@ -212,7 +213,8 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
        return queryStatus();
    }

-    time1 = std::chrono::high_resolution_clock::now();
+    waitStartTime = std::chrono::high_resolution_clock::now();
+    lastHangCheckTime = waitStartTime;
    while (true) {
        ret = queryStatus();
        if (ret == ZE_RESULT_SUCCESS) {
@ -221,12 +223,21 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {

        NEO::WaitUtils::waitFunction(nullptr, 0u);

+        currentTime = std::chrono::high_resolution_clock::now();
+        elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
+
+        if (elapsedTimeSinceGpuHangCheck.count() >= this->gpuHangCheckPeriod.count()) {
+            lastHangCheckTime = currentTime;
+            if (this->csr->isGpuHangDetected()) {
+                return ZE_RESULT_ERROR_DEVICE_LOST;
+            }
+        }
+
        if (timeout == std::numeric_limits<uint32_t>::max()) {
            continue;
        }

-        time2 = std::chrono::high_resolution_clock::now();
-        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(time2 - time1).count();
+        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - waitStartTime).count();

        if (timeDiff >= timeout) {
            break;
--- a/level_zero/core/source/fence/fence.cpp
+++ b/level_zero/core/source/fence/fence.cpp
@ -43,11 +43,13 @@ ze_result_t FenceImp::reset() {
 }

 ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
-    std::chrono::high_resolution_clock::time_point time1, time2;
+    std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
+    std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
    uint64_t timeDiff = 0;
    ze_result_t ret = ZE_RESULT_NOT_READY;
+    const auto csr = cmdQueue->getCsr();

-    if (cmdQueue->getCsr()->getType() == NEO::CommandStreamReceiverType::CSR_AUB) {
+    if (csr->getType() == NEO::CommandStreamReceiverType::CSR_AUB) {
        return ZE_RESULT_SUCCESS;
    }

@ -59,7 +61,8 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
        return queryStatus();
    }

-    time1 = std::chrono::high_resolution_clock::now();
+    waitStartTime = std::chrono::high_resolution_clock::now();
+    lastHangCheckTime = waitStartTime;
    while (timeDiff < timeout) {
        ret = queryStatus();
        if (ret == ZE_RESULT_SUCCESS) {
@ -68,12 +71,21 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {

        NEO::WaitUtils::waitFunction(nullptr, 0u);

+        currentTime = std::chrono::high_resolution_clock::now();
+        elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
+
+        if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) {
+            lastHangCheckTime = currentTime;
+            if (csr->isGpuHangDetected()) {
+                return ZE_RESULT_ERROR_DEVICE_LOST;
+            }
+        }
+
        if (timeout == std::numeric_limits<uint64_t>::max()) {
            continue;
        }

-        time2 = std::chrono::high_resolution_clock::now();
-        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(time2 - time1).count();
+        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - waitStartTime).count();
    }

    return ret;
--- a/level_zero/core/source/fence/fence.h
+++ b/level_zero/core/source/fence/fence.h
@ -13,6 +13,7 @@
 #include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
 #include <level_zero/ze_api.h>

+#include <chrono>
 #include <limits>

 struct _ze_fence_handle_t {};
@ -39,6 +40,7 @@ struct Fence : _ze_fence_handle_t {
  protected:
    uint32_t partitionCount = 1;
    uint32_t taskCount = 0;
+    std::chrono::microseconds gpuHangCheckPeriod{500'000};
 };

 struct FenceImp : public Fence {
--- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h
+++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h
@ -69,7 +69,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
    }
    ze_result_t synchronize(uint64_t timeout) override {
        synchronizedCalled++;
-        return ZE_RESULT_SUCCESS;
+        return synchronizeReturnValue; // test-configurable result; the member is initialised to ZE_RESULT_SUCCESS, matching the previously hard-coded return
    }

    NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, bool isCooperative) override {
@ -79,6 +79,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {

    uint32_t synchronizedCalled = 0;
    NEO::ResidencyContainer residencyContainerSnapshot;
+    ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS};
 };

 struct Deleter {
--- a/level_zero/core/test/unit_tests/mocks/mock_fence.h
+++ b/level_zero/core/test/unit_tests/mocks/mock_fence.h
@ -19,6 +19,9 @@ namespace ult {

 template <>
 struct WhiteBox<::L0::Fence> : public ::L0::Fence {
+    ~WhiteBox() override = default; // explicitly defaulted destructor, declared as an override of the base-class destructor
+
+    using ::L0::Fence::gpuHangCheckPeriod; // re-exports the protected hang-check period so tests can set it directly
     using ::L0::Fence::partitionCount;
     using ::L0::Fence::taskCount;
 };
--- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp
+++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp
@ -21,6 +21,12 @@
 #include "level_zero/core/test/unit_tests/mocks/mock_event.h"

 #include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+using namespace std::chrono_literals;

 namespace CpuIntrinsicsTests {
 extern std::atomic<uintptr_t> lastClFlushedPtr;
@ -597,6 +603,43 @@ class EventSynchronizeTest : public Test<DeviceFixture> {
    std::unique_ptr<L0::Event> event;
 };

+TEST_F(EventSynchronizeTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) { // a hang reported by the CSR must surface from hostSynchronize as ZE_RESULT_ERROR_DEVICE_LOST
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = true; // the mock's isGpuHangDetected() returns this injected value
+
+    event->csr = csr.get(); // NOTE(review): csr is a test-local unique_ptr while event is a fixture member - confirm nothing dereferences event->csr after this test body ends
+    event->gpuHangCheckPeriod = 0ms; // zero period: the elapsed-time condition holds on every polling iteration, so the hang query runs on the first pass
+
+    const auto timeout = std::numeric_limits<std::uint32_t>::max(); // the value EventImp::hostSynchronize compares against to skip its timeout check
+    const auto result = event->hostSynchronize(timeout);
+
+    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result);
+}
+
+TEST_F(EventSynchronizeTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { // with no hang reported, the wait must end through the timeout path
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = false; // the mock's isGpuHangDetected() returns this injected value, i.e. "no hang"
+
+    event->csr = csr.get(); // NOTE(review): csr is a test-local unique_ptr while event is a fixture member - confirm nothing dereferences event->csr after this test body ends
+    event->gpuHangCheckPeriod = 0ms; // zero period: the hang query is issued on every polling iteration
+
+    const auto timeoutNanoseconds = 1; // hostSynchronize measures elapsed time in nanoseconds, so this expires almost immediately
+    const auto result = event->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
+TEST_F(EventSynchronizeTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { // the timeout must still end the wait when the hang-check period is far from elapsing
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    event->csr = csr.get(); // NOTE(review): csr is a test-local unique_ptr while event is a fixture member - confirm nothing dereferences event->csr after this test body ends
+    event->gpuHangCheckPeriod = 50000000ms; // very long period: the elapsed-time condition guarding the hang query is not expected to become true within the 1 ns timeout
+
+    const auto timeoutNanoseconds = 1; // hostSynchronize measures elapsed time in nanoseconds, so this expires almost immediately
+    const auto result = event->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
 TEST_F(EventSynchronizeTest, givenCallToEventHostSynchronizeWithTimeoutZeroAndStateInitialHostSynchronizeReturnsNotReady) {
    ze_result_t result = event->hostSynchronize(0);
    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
--- a/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp
+++ b/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp
@ -15,6 +15,13 @@
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_fence.h"

+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+using namespace std::chrono_literals;
+
 namespace L0 {
 namespace ult {

@ -49,6 +56,87 @@ TEST_F(FenceTest, whenQueryingStatusWithoutCsrAndFenceUnsignaledThenReturnsNotRe
    fence->destroy();
 }

+TEST_F(FenceTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) { // a hang reported by the CSR must surface from the fence wait as ZE_RESULT_ERROR_DEVICE_LOST
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = true; // the mock's isGpuHangDetected() returns this injected value
+    csr->testTaskCountReadyReturnValue = false; // the mock's testTaskCountReady() returns false; presumably this keeps queryStatus() from reporting completion (queryStatus is not shown here)
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc; // NOTE(review): left uninitialised - confirm Fence::create does not read any field of desc
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+    fence->gpuHangCheckPeriod = 0ms; // zero period: the elapsed-time condition holds on every polling iteration, so the hang query runs on the first pass
+
+    const auto timeout = std::numeric_limits<std::uint32_t>::max(); // NOTE(review): FenceImp::hostSynchronize uses uint64_t max as its no-timeout value, so this is a finite timeout of about 4.29 s in nanoseconds; the hang check is expected to return first
+    const auto result = fence->hostSynchronize(timeout);
+
+    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result);
+}
+
+TEST_F(FenceTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { // with no hang reported, the fence wait must end through the timeout path
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = false; // the mock's isGpuHangDetected() returns this injected value, i.e. "no hang"
+    csr->testTaskCountReadyReturnValue = false; // the mock's testTaskCountReady() returns false; presumably this keeps queryStatus() from reporting completion (queryStatus is not shown here)
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc; // NOTE(review): left uninitialised - confirm Fence::create does not read any field of desc
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+    fence->gpuHangCheckPeriod = 0ms; // zero period: the hang query is issued on every polling iteration
+
+    const auto timeoutNanoseconds = 1; // the wait loop runs while elapsed nanoseconds < timeout, so it exits after the first iteration that measures any elapsed time
+    const auto result = fence->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
+TEST_F(FenceTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { // the timeout must still end the wait when the hang-check period is far from elapsing
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->testTaskCountReadyReturnValue = false; // the mock's testTaskCountReady() returns false; presumably this keeps queryStatus() from reporting completion (queryStatus is not shown here)
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc; // NOTE(review): left uninitialised - confirm Fence::create does not read any field of desc
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+    fence->gpuHangCheckPeriod = 50000000ms; // very long period: the elapsed-time condition guarding the hang query is not expected to become true within the 1 ns timeout
+
+    const auto timeoutNanoseconds = 1; // the wait loop runs while elapsed nanoseconds < timeout, so it exits after the first iteration that measures any elapsed time
+    const auto result = fence->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
+TEST_F(FenceTest, GivenSuccessfulQueryResultAndNoTimeoutWhenHostSynchronizeIsCalledThenResultSuccessIsReturned) { // a fence whose status query succeeds must make hostSynchronize return ZE_RESULT_SUCCESS
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->testTaskCountReadyReturnValue = true; // the mock's testTaskCountReady() returns true; presumably this makes queryStatus() report success (queryStatus is not shown here)
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc; // NOTE(review): left uninitialised - confirm Fence::create does not read any field of desc
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+
+    const auto timeout = std::numeric_limits<std::uint32_t>::max(); // NOTE(review): despite "NoTimeout" in the test name, FenceImp::hostSynchronize uses uint64_t max as its no-timeout value, so this is a finite timeout of about 4.29 s in nanoseconds
+    const auto result = fence->hostSynchronize(timeout);
+
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+}
+
 using FenceSynchronizeTest = Test<DeviceFixture>;

 TEST_F(FenceSynchronizeTest, givenCallToFenceHostSynchronizeWithTimeoutZeroAndStateInitialThenHostSynchronizeReturnsNotReady) {
--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@ -168,7 +168,7 @@ class CommandStreamReceiver {
    virtual WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
    virtual WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
    WaitStatus baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
-    bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
+    MOCKABLE_VIRTUAL bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
    virtual void downloadAllocations(){};

    void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
@ -318,12 +318,13 @@ class CommandStreamReceiver {

    const HardwareInfo &peekHwInfo() const;

+    MOCKABLE_VIRTUAL bool isGpuHangDetected() const;
+
  protected:
    void cleanupResources();
    void printDeviceIndex();
    void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation);
    bool checkImplicitFlushForGpuIdle();
-    bool isGpuHangDetected() const;
    MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainHostPtrSurfaceCreationLock();

    std::unique_ptr<FlushStampTracker> flushStamp;
--- a/shared/test/common/mocks/mock_command_stream_receiver.h
+++ b/shared/test/common/mocks/mock_command_stream_receiver.h
@ -19,6 +19,7 @@

 #include "gmock/gmock.h"

+#include <optional>
 #include <vector>

 using namespace NEO;
@ -63,6 +64,22 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {

    bool isMultiOsContextCapable() const override { return multiOsContextCapable; }

+    bool isGpuHangDetected() const override { // test hook: reports the injected hang state when a test has set one
+        if (isGpuHangDetectedReturnValue.has_value()) {
+            return *isGpuHangDetectedReturnValue; // value injected by the test through the std::optional member
+        } else {
+            return CommandStreamReceiver::isGpuHangDetected(); // nothing injected: defer to the base-class implementation
+        }
+    }
+
+    bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait) override { // test hook: reports the injected readiness when a test has set one
+        if (testTaskCountReadyReturnValue.has_value()) {
+            return *testTaskCountReadyReturnValue; // injected value; pollAddress and taskCountToWait are ignored on this path
+        } else {
+            return CommandStreamReceiver::testTaskCountReady(pollAddress, taskCountToWait); // nothing injected: defer to the base-class implementation with the original arguments
+        }
+    }
+
    MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired, const HardwareInfo &hwInfo) const override {
        return MemoryCompressionState::NotApplicable;
    };
@ -147,6 +164,8 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
    bool createPreemptionAllocationReturn = true;
    bool createPreemptionAllocationParentCall = false;
    bool programComputeBarrierCommandCalled = false;
+    std::optional<bool> isGpuHangDetectedReturnValue{};
+    std::optional<bool> testTaskCountReadyReturnValue{};
 };

 class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {