Detect GPU hang in clWaitForEvents

This change:
- moves NEO::WaitStatus to a separate file
- enables detection of a GPU hang in clWaitForEvents
- adjusts most of the blocking calls in CommandStreamReceiver to return WaitStatus
- adds ULTs to cover the new code

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-02-22 12:51:29 +00:00
committed by Compute-Runtime-Automation
parent f2e1361541
commit 7f729b7f89
41 changed files with 487 additions and 95 deletions

View File

@@ -229,19 +229,25 @@ bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState
return false;
}
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
WaitStatus CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
WAIT_ENTER()
WaitStatus waitStatus{WaitStatus::Ready};
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
if (!skipWait) {
bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;
getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
flushStampToWait,
useQuickKmdSleep,
forcePowerSavingMode);
waitStatus = getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
flushStampToWait,
useQuickKmdSleep,
forcePowerSavingMode);
if (waitStatus == WaitStatus::GpuHang) {
return WaitStatus::GpuHang;
}
DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);
if (gtpinIsGTPinInitialized()) {
@@ -251,17 +257,25 @@ void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEn
for (const CopyEngineState &copyEngine : copyEnginesToWait) {
auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
waitStatus = bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
if (waitStatus == WaitStatus::GpuHang) {
return WaitStatus::GpuHang;
}
waitStatus = bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
if (waitStatus == WaitStatus::GpuHang) {
return WaitStatus::GpuHang;
}
}
if (cleanTemporaryAllocationList) {
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
} else {
getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
}
waitStatus = cleanTemporaryAllocationList
? getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait)
: getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
WAIT_LEAVE()
return waitStatus;
}
bool CommandQueue::isQueueBlocked() {