Mirror of https://github.com/intel/compute-runtime.git, synced 2025-12-30 09:58:55 +08:00
Detect GPU hang in clWaitForEvents
This change:
- moves NEO::WaitStatus to a separate file
- enables detection of GPU hang in clWaitForEvents
- adjusts most of the blocking calls in CommandStreamReceiver to return WaitStatus
- adds ULTs to cover the new code

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
Committed by: Compute-Runtime-Automation
Parent: f2e1361541
Commit: 7f729b7f89
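As context for the diff below, the sketch that follows shows one way a tri-state wait result like NEO::WaitStatus can be propagated from a completion check up to the clWaitForEvents level and turned into an error. It is a minimal, self-contained illustration: the enumerator names, the helper functions, and the mapping of WaitStatus::GpuHang to CL_OUT_OF_RESOURCES (-5) are assumptions made for this example, not code taken from the driver.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative stand-in for NEO::WaitStatus (which this change moves to its
// own header); the enumerator names are assumed for the sketch.
enum class WaitStatus : uint32_t {
    NotReady = 0,
    Ready = 1,
    GpuHang = 2,
};

// Hypothetical completion check: instead of only answering "done or not",
// it can also report that the GPU hung.
WaitStatus checkTaskCount(uint32_t currentHwTag, uint32_t taskCountToWait, bool hangDetected) {
    if (hangDetected) {
        return WaitStatus::GpuHang;
    }
    return currentHwTag >= taskCountToWait ? WaitStatus::Ready : WaitStatus::NotReady;
}

// Hypothetical API-level caller mirroring the commit's intent: a hang is
// propagated as an error instead of being silently treated as success.
// Returning -5 (CL_OUT_OF_RESOURCES) is only an example mapping.
int32_t waitForEvents(uint32_t currentHwTag, uint32_t taskCountToWait, bool hangDetected) {
    const WaitStatus status = checkTaskCount(currentHwTag, taskCountToWait, hangDetected);
    if (status == WaitStatus::GpuHang) {
        return -5;
    }
    return 0; // CL_SUCCESS
}

int main() {
    std::cout << waitForEvents(10, 10, false) << "\n"; // 0  -> completed normally
    std::cout << waitForEvents(5, 10, true) << "\n";   // -5 -> hang reported to the caller
    return 0;
}
```

The point is only the control flow: once a wait can report GpuHang, the caller can fail fast instead of blocking indefinitely on a hung GPU.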
@@ -229,19 +229,25 @@ bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState
     return false;
 }
 
-void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
+WaitStatus CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
     WAIT_ENTER()
 
+    WaitStatus waitStatus{WaitStatus::Ready};
+
     DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
     DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
 
     if (!skipWait) {
         bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;
 
-        getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
-                                                                              flushStampToWait,
-                                                                              useQuickKmdSleep,
-                                                                              forcePowerSavingMode);
+        waitStatus = getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
+                                                                                           flushStampToWait,
+                                                                                           useQuickKmdSleep,
+                                                                                           forcePowerSavingMode);
+        if (waitStatus == WaitStatus::GpuHang) {
+            return WaitStatus::GpuHang;
+        }
+
         DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);
 
         if (gtpinIsGTPinInitialized()) {
@@ -251,17 +257,25 @@ void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEn
 
     for (const CopyEngineState &copyEngine : copyEnginesToWait) {
         auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
-        bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
-        bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
+
+        waitStatus = bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
+        if (waitStatus == WaitStatus::GpuHang) {
+            return WaitStatus::GpuHang;
+        }
+
+        waitStatus = bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
+        if (waitStatus == WaitStatus::GpuHang) {
+            return WaitStatus::GpuHang;
+        }
     }
 
-    if (cleanTemporaryAllocationList) {
-        getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
-    } else {
-        getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
-    }
+    waitStatus = cleanTemporaryAllocationList
+                     ? getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait)
+                     : getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
 
     WAIT_LEAVE()
+
+    return waitStatus;
 }
 
 bool CommandQueue::isQueueBlocked() {
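The hunks above consume WaitStatus values from the CommandStreamReceiver waits that this change makes non-void. As a rough mental model only (NEO's real waits use the KMD notify fallback and dedicated hang-detection logic, not the simple timeout shown here), a blocking wait that can report a hang might look like this:

```cpp
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

enum class WaitStatus { NotReady, Ready, GpuHang };

// Sketch of a blocking wait that returns a status instead of void, assuming a
// plain timeout as the hang heuristic. The function name mirrors the diff,
// but the body is illustrative only.
WaitStatus waitForTaskCount(const std::atomic<uint32_t> &hwTag,
                            uint32_t taskCountToWait,
                            std::chrono::milliseconds hangTimeout) {
    const auto deadline = std::chrono::steady_clock::now() + hangTimeout;
    while (hwTag.load() < taskCountToWait) {
        if (std::chrono::steady_clock::now() >= deadline) {
            return WaitStatus::GpuHang; // no forward progress within the budget
        }
        std::this_thread::yield();
    }
    return WaitStatus::Ready;
}

int main() {
    std::atomic<uint32_t> hwTag{0};
    std::thread gpu([&hwTag] {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        hwTag.store(1); // simulated completion signal
    });
    const auto status = waitForTaskCount(hwTag, 1, std::chrono::seconds(1));
    gpu.join();
    return status == WaitStatus::Ready ? 0 : 1;
}
```

Because every wait now reports a status, waitUntilComplete() above can return at the first GpuHang instead of continuing to wait on the remaining copy engines.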
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -209,9 +209,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
 
     MOCKABLE_VIRTUAL bool isQueueBlocked();
 
-    MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait);
-    MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
-        this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
+    MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait);
+    MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
+        return this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
     }
     MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
     MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
@@ -223,7 +223,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
                      const cl_event *eventWaitList);
 
     MOCKABLE_VIRTUAL CommandStreamReceiver &getGpgpuCommandStreamReceiver() const;
-    CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
+    MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
     CommandStreamReceiver *getBcsForAuxTranslation() const;
     MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const;
     Device &getDevice() const noexcept;
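The last hunk also marks getBcsCommandStreamReceiver() as MOCKABLE_VIRTUAL, which is what allows the ULTs mentioned in the commit message to substitute a command stream receiver that reports a hang. The sketch below uses reduced stand-in types rather than NEO's classes, and the macro expansion shown is an assumption; it only demonstrates the override pattern such a macro enables in test builds.

```cpp
#include <cassert>

// In test builds a macro like MOCKABLE_VIRTUAL can expand to `virtual` so
// tests may override the member; the expansion here is assumed for the sketch.
#ifndef MOCKABLE_VIRTUAL
#define MOCKABLE_VIRTUAL virtual
#endif

enum class WaitStatus { NotReady, Ready, GpuHang };

// Minimal stand-in for a command stream receiver whose wait reports a status.
struct CommandStreamReceiverStub {
    WaitStatus waitResult = WaitStatus::Ready;
    WaitStatus waitForTaskCountWithKmdNotifyFallback() { return waitResult; }
};

// Minimal stand-in for CommandQueue: the BCS getter is MOCKABLE_VIRTUAL so a
// test double can inject its own receiver.
struct CommandQueueStub {
    virtual ~CommandQueueStub() = default;
    MOCKABLE_VIRTUAL CommandStreamReceiverStub *getBcsCommandStreamReceiver() {
        return &bcsCsr;
    }
    WaitStatus waitUntilComplete() {
        return getBcsCommandStreamReceiver()->waitForTaskCountWithKmdNotifyFallback();
    }
    CommandStreamReceiverStub bcsCsr{};
};

// Hypothetical unit-test double: forces the BCS wait to report a hang so the
// GpuHang early-return path can be exercised without real hardware.
struct MockCommandQueue : CommandQueueStub {
    CommandStreamReceiverStub *getBcsCommandStreamReceiver() override {
        return &hangingCsr;
    }
    CommandStreamReceiverStub hangingCsr{WaitStatus::GpuHang};
};

int main() {
    MockCommandQueue queue{};
    assert(queue.waitUntilComplete() == WaitStatus::GpuHang);
    return 0;
}
```

In the actual test suite this role is played by NEO's ULT mocks; the sketch only shows why making the getter virtual in test builds lets waitUntilComplete() be driven into the new GpuHang paths.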