Detect GPU hangs in CommandMapUnmap::submit()

This change introduces detection of GPU hangs
in CommandMapUnmap::submit() as well as in Event::submitCommand().
Unit-level tests (ULTs) have been added to cover the new code.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-03-04 16:23:43 +00:00
committed by Compute-Runtime-Automation
parent 8c4b2aafa1
commit 4cde6ea1ce
8 changed files with 154 additions and 6 deletions

View File

@@ -587,10 +587,17 @@ void Event::submitCommand(bool abortTasks) {
this->cmdQueue->getGpgpuCommandStreamReceiver().makeResident(*perfCounterNode->getBaseGraphicsAllocation());
}
}
auto &complStamp = cmdToProcess->submit(taskLevel, abortTasks);
if (profilingCpuPath && this->isProfilingEnabled()) {
setEndTimeStamp();
}
if (complStamp.taskCount == CompletionStamp::gpuHang) {
abortExecutionDueToGpuHang();
return;
}
updateTaskCount(complStamp.taskCount, peekBcsTaskCountFromCommandQueue());
flushStamp->setStamp(complStamp.flushStamp);
submittedCmd.exchange(cmdToProcess.release());

View File

@@ -11,6 +11,7 @@
#include "shared/source/command_stream/csr_deps.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/string.h"
@@ -38,9 +39,10 @@ CommandMapUnmap::CommandMapUnmap(MapOperationType operationType, MemObj &memObj,
}
CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
DecRefInternalAtScopeEnd decRefInternalAtScopeEnd{memObj};
if (terminated) {
this->terminated = true;
memObj.decRefInternal();
return completionStamp;
}
@@ -98,7 +100,12 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::DependencyResolveOnGpu);
if (!memObj.isMemObjZeroCopy()) {
commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
const auto waitStatus = commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
if (waitStatus == WaitStatus::GpuHang) {
completionStamp.taskCount = CompletionStamp::gpuHang;
return completionStamp;
}
if (operationType == MAP) {
memObj.transferDataToHostPtr(copySize, copyOffset);
} else if (!readOnly) {
@@ -107,8 +114,6 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
}
}
memObj.decRefInternal();
return completionStamp;
}