Detect GPU hang in clWaitForEvents

This change:
- moves NEO::WaitStatus to a separate file
- enables detection of GPU hang in clWaitForEvents
- adjusts most of the blocking calls in CommandStreamReceiver to return WaitStatus
- adds ULTs to cover the new code

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-02-22 12:51:29 +00:00
committed by Compute-Runtime-Automation
parent f2e1361541
commit 7f729b7f89
41 changed files with 487 additions and 95 deletions

View File

@@ -27,6 +27,8 @@
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/mem_obj/mem_obj.h"
#include <algorithm>
namespace NEO {
Event::Event(
@@ -417,15 +419,18 @@ void Event::getBoundaryTimestampValues(TimestampPacketContainer *timestampContai
}
}
inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
while (this->taskCount == CompletionStamp::notReady) {
if (blocking == false) {
return false;
return WaitStatus::NotReady;
}
}
Range<CopyEngineState> states{&bcsState, bcsState.isValid() ? 1u : 0u};
cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
const auto waitStatus = cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
if (waitStatus == WaitStatus::GpuHang) {
return WaitStatus::GpuHang;
}
updateExecutionStatus();
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
@@ -433,7 +438,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);
return true;
return WaitStatus::Ready;
}
void Event::updateExecutionStatus() {
@@ -630,16 +635,23 @@ cl_int Event::waitForEvents(cl_uint numEvents,
// pointers to workerLists - for fast swap operations
WorkerListT *currentlyPendingEvents = &workerList1;
WorkerListT *pendingEventsLeft = &workerList2;
WaitStatus eventWaitStatus = WaitStatus::NotReady;
while (currentlyPendingEvents->size() > 0) {
for (auto &e : *currentlyPendingEvents) {
Event *event = castToObjectOrAbort<Event>(e);
for (auto current = currentlyPendingEvents->begin(), end = currentlyPendingEvents->end(); current != end; ++current) {
Event *event = castToObjectOrAbort<Event>(*current);
if (event->peekExecutionStatus() < CL_COMPLETE) {
return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
}
if (event->wait(false, false) == false) {
eventWaitStatus = event->wait(false, false);
if (eventWaitStatus == WaitStatus::NotReady) {
pendingEventsLeft->push_back(event);
} else if (eventWaitStatus == WaitStatus::GpuHang) {
setExecutionStatusToAbortedDueToGpuHang(pendingEventsLeft->begin(), pendingEventsLeft->end());
setExecutionStatusToAbortedDueToGpuHang(current, end);
return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
}
}
@@ -650,6 +662,13 @@ cl_int Event::waitForEvents(cl_uint numEvents,
return CL_SUCCESS;
}
// Walks the half-open handle range [first, last) and moves every event in it
// to the executionAbortedDueToGpuHang execution status.
// Each cl_event handle is converted to an Event* with castToObjectOrAbort
// before transitionExecutionStatus is called on it.
inline void Event::setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_event *last) {
    for (cl_event *handle = first; handle != last; ++handle) {
        Event *hungEvent = castToObjectOrAbort<Event>(*handle);
        hungEvent->transitionExecutionStatus(executionAbortedDueToGpuHang);
    }
}
uint32_t Event::getTaskLevel() {
return taskLevel;
}

View File

@@ -1,11 +1,12 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/os_interface/os_time.h"
#include "shared/source/os_interface/performance_counters.h"
@@ -80,6 +81,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
};
static const cl_ulong objectMagic = 0x80134213A43C981ALL;
static constexpr cl_int executionAbortedDueToGpuHang = -777;
Event(CommandQueue *cmdQueue, cl_command_type cmdType,
uint32_t taskLevel, uint32_t taskCount);
@@ -206,9 +208,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
// adds a callback (execution state change listener) to this event's list of callbacks
void addCallback(Callback::ClbFuncT fn, cl_int type, void *data);
//returns true on success
//if(blocking==false), will return with false instead of blocking while waiting for completion
virtual bool wait(bool blocking, bool useQuickKmdSleep);
//if(blocking==false), will return with WaitStatus::NotReady instead of blocking while waiting for completion
virtual WaitStatus wait(bool blocking, bool useQuickKmdSleep);
bool isUserEvent() const {
return (CL_COMMAND_USER == cmdType);
@@ -347,6 +348,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
void unblockEventsBlockedByThis(int32_t transitionStatus);
void submitCommand(bool abortBlockedTasks);
static void setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_event *last);
bool currentCmdQVirtualEvent;
std::atomic<Command *> cmdToSubmit;
std::atomic<Command *> submittedCmd;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,13 @@ void UserEvent::updateExecutionStatus() {
return;
}
// Polls updateStatusAndCheckCompletion() until it reports completion.
// When blocking is false, the first unsuccessful poll ends the wait and the
// "not ready" result is returned; when blocking is true the loop keeps polling.
// Once the poll succeeds, the "ready" result is returned.
// useQuickKmdSleep is not referenced anywhere in this body.
// NOTE(review): this span is a diff hunk, so each changed statement appears
// twice - the previous bool form followed by the new WaitStatus form.
bool UserEvent::wait(bool blocking, bool useQuickKmdSleep) {
WaitStatus UserEvent::wait(bool blocking, bool useQuickKmdSleep) {
while (updateStatusAndCheckCompletion() == false) {
if (blocking == false) {
return false;
return WaitStatus::NotReady;
}
}
return true;
return WaitStatus::Ready;
}
uint32_t UserEvent::getTaskLevel() {
@@ -53,16 +53,15 @@ VirtualEvent::VirtualEvent(CommandQueue *cmdQ, Context *ctx)
}
// The body holds nothing but an empty statement, so no work is performed here.
void VirtualEvent::updateExecutionStatus() {
;
}
// Polls updateStatusAndCheckCompletion() until it reports completion.
// When blocking is false, the first unsuccessful poll ends the wait and the
// "not ready" result is returned; when blocking is true the loop keeps polling.
// Once the poll succeeds, the "ready" result is returned.
// useQuickKmdSleep is not referenced anywhere in this body.
// NOTE(review): this span is a diff hunk, so each changed statement appears
// twice - the previous bool form followed by the new WaitStatus form.
bool VirtualEvent::wait(bool blocking, bool useQuickKmdSleep) {
WaitStatus VirtualEvent::wait(bool blocking, bool useQuickKmdSleep) {
while (updateStatusAndCheckCompletion() == false) {
if (blocking == false) {
return false;
return WaitStatus::NotReady;
}
}
return true;
return WaitStatus::Ready;
}
uint32_t VirtualEvent::getTaskLevel() {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -18,7 +18,7 @@ class UserEvent : public Event {
~UserEvent() override = default;
bool wait(bool blocking, bool useQuickKmdSleep) override;
WaitStatus wait(bool blocking, bool useQuickKmdSleep) override;
void updateExecutionStatus() override;
@@ -33,7 +33,7 @@ class VirtualEvent : public Event {
~VirtualEvent() override = default;
bool wait(bool blocking, bool useQuickKmdSleep) override;
WaitStatus wait(bool blocking, bool useQuickKmdSleep) override;
bool setStatus(cl_int status) override;