Detect GPU hang in clWaitForEvents

This change:
- moves NEO::WaitStatus to a separate file
- enables detection of GPU hang in clWaitForEvents
- adjusts most of the blocking calls in CommandStreamReceiver to return WaitStatus
- adds ULTs to cover the new code

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
2025-12-30 09:58:55 +08:00 · 2022-02-22 12:51:29 +00:00
parent f2e1361541
commit 7f729b7f89
41 changed files with 487 additions and 95 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -229,19 +229,25 @@ bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState
    return false;
 }

-void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
+WaitStatus CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
    WAIT_ENTER()

+    WaitStatus waitStatus{WaitStatus::Ready};
+
    DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());

    if (!skipWait) {
        bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;

-        getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
-                                                                              flushStampToWait,
-                                                                              useQuickKmdSleep,
-                                                                              forcePowerSavingMode);
+        waitStatus = getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
+                                                                                           flushStampToWait,
+                                                                                           useQuickKmdSleep,
+                                                                                           forcePowerSavingMode);
+        if (waitStatus == WaitStatus::GpuHang) {
+            return WaitStatus::GpuHang;
+        }
+
        DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);

        if (gtpinIsGTPinInitialized()) {
@@ -251,17 +257,25 @@ void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEn

    for (const CopyEngineState &copyEngine : copyEnginesToWait) {
        auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
-        bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
-        bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
+
+        waitStatus = bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
+        if (waitStatus == WaitStatus::GpuHang) {
+            return WaitStatus::GpuHang;
+        }
+
+        waitStatus = bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
+        if (waitStatus == WaitStatus::GpuHang) {
+            return WaitStatus::GpuHang;
+        }
    }

-    if (cleanTemporaryAllocationList) {
-        getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
-    } else {
-        getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
-    }
+    waitStatus = cleanTemporaryAllocationList
+                     ? getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait)
+                     : getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);

    WAIT_LEAVE()
+
+    return waitStatus;
 }

 bool CommandQueue::isQueueBlocked() {
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -209,9 +209,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {

    MOCKABLE_VIRTUAL bool isQueueBlocked();

-    MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait);
-    MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
-        this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
+    MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait);
+    MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
+        return this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
    }
    MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
    MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
@@ -223,7 +223,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
                                             const cl_event *eventWaitList);

    MOCKABLE_VIRTUAL CommandStreamReceiver &getGpgpuCommandStreamReceiver() const;
-    CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
+    MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
    CommandStreamReceiver *getBcsForAuxTranslation() const;
    MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const;
    Device &getDevice() const noexcept;