Detect GPU hangs in clFinish

This change introduces detection of GPU hangs in the
clFinish function: when waiting for all engines reports
a GPU hang, finish() returns CL_OUT_OF_RESOURCES.
It also adds unit tests that cover the new code.

Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-02-28 10:24:05 +00:00
committed by Compute-Runtime-Automation
parent cf1bc3a2ba
commit 0ecc7c5e3b
8 changed files with 65 additions and 10 deletions

View File

@ -995,7 +995,7 @@ bool CommandQueue::isWaitForTimestampsEnabled() {
return enabled;
}
void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) {
WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) {
if (blockedQueue) {
while (isQueueBlocked()) {
}
@ -1014,11 +1014,14 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
activeBcsStates.push_back(state);
}
}
waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
const auto waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
if (printfHandler) {
printfHandler->printEnqueueOutput();
}
return waitStatus;
}
void CommandQueue::setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies) {

View File

@ -213,9 +213,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
// Four-argument convenience overload: forwards to the six-argument
// waitUntilComplete with fixed values for the two trailing flags and
// returns that call's WaitStatus unchanged.
MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
    const bool cleanTemporaryAllocationList = true; // fifth argument of the full overload
    const bool skipWait = false;                    // sixth argument of the full overload
    return this->waitUntilComplete(gpgpuTaskCountToWait,
                                   copyEnginesToWait,
                                   flushStampToWait,
                                   useQuickKmdSleep,
                                   cleanTemporaryAllocationList,
                                   skipWait);
}
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
this->waitForAllEngines(blockedQueue, printfHandler, true);
MOCKABLE_VIRTUAL WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
MOCKABLE_VIRTUAL WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
return this->waitForAllEngines(blockedQueue, printfHandler, true);
}
static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -7,6 +7,7 @@
#pragma once
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/wait_status.h"
#include "opencl/source/command_queue/command_queue_hw.h"
@ -20,7 +21,10 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
}
// Stall until HW reaches taskCount on all its engines
waitForAllEngines(true, nullptr);
const auto waitStatus = waitForAllEngines(true, nullptr);
if (waitStatus == WaitStatus::GpuHang) {
return CL_OUT_OF_RESOURCES;
}
return CL_SUCCESS;
}

View File

@ -5,6 +5,7 @@
*
*/
#include "shared/source/command_stream/wait_status.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/unit_test_helper.h"
@ -1497,6 +1498,28 @@ HWTEST_F(CommandQueueHwTest, givenFinishWhenFlushBatchedSubmissionsFailsThenErro
EXPECT_EQ(CL_OUT_OF_RESOURCES, errorCode);
}
// Verifies that finish() reports CL_OUT_OF_RESOURCES when the wait on all
// engines comes back with WaitStatus::GpuHang, and that the wait is issued
// exactly once.
HWTEST_F(CommandQueueHwTest, givenGpuHangWhenFinishingCommandQueueHwThenWaitForEnginesIsCalledAndOutOfResourcesIsReturned) {
    MockCommandQueueHw<FamilyType> queue{context, pClDevice, nullptr};

    // Force the mocked wait to report a hang instead of calling the base implementation.
    queue.waitForAllEnginesReturnValue = WaitStatus::GpuHang;
    // Ask the ULT command stream receiver to report the batched-submission flush as successful.
    queue.getUltCommandStreamReceiver().shouldFlushBatchedSubmissionsReturnSuccess = true;

    const auto retVal = queue.finish();

    EXPECT_EQ(1, queue.waitForAllEnginesCalledCount);
    EXPECT_EQ(CL_OUT_OF_RESOURCES, retVal);
}
// Verifies that finish() reports CL_SUCCESS when the wait on all engines
// comes back with WaitStatus::Ready, and that the wait is issued exactly once.
HWTEST_F(CommandQueueHwTest, givenNoGpuHangWhenFinishingCommandQueueHwThenWaitForEnginesIsCalledAndSuccessIsReturned) {
    MockCommandQueueHw<FamilyType> queue{context, pClDevice, nullptr};

    // Force the mocked wait to report normal completion.
    queue.waitForAllEnginesReturnValue = WaitStatus::Ready;
    // Ask the ULT command stream receiver to report the batched-submission flush as successful.
    queue.getUltCommandStreamReceiver().shouldFlushBatchedSubmissionsReturnSuccess = true;

    const auto retVal = queue.finish();

    EXPECT_EQ(1, queue.waitForAllEnginesCalledCount);
    EXPECT_EQ(CL_SUCCESS, retVal);
}
HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) {
auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto srcBuffer = std::unique_ptr<Buffer>{BufferHelper<>::create(pContext)};

View File

@ -1079,7 +1079,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe
mockCsr->taskCount.store(10);
mockCsr->latestFlushedTaskCount.store(5);
commandQueue.waitForAllEngines(false, nullptr);
const auto waitStatus = commandQueue.waitForAllEngines(false, nullptr);
EXPECT_EQ(WaitStatus::Ready, waitStatus);
parseCommands<FamilyType>(mockCsr->getCS(4096u));
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
@ -1110,7 +1111,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEnabledDirectSubmissionUpdate
mockCsr->taskCount.store(10);
mockCsr->latestFlushedTaskCount.store(5);
commandQueue.waitForAllEngines(false, nullptr);
const auto waitStatus = commandQueue.waitForAllEngines(false, nullptr);
EXPECT_EQ(WaitStatus::Ready, waitStatus);
parseCommands<FamilyType>(mockCsr->getCS(4096u));
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());

View File

@ -13,6 +13,8 @@
#include "opencl/source/command_queue/command_queue_hw.h"
#include <optional>
////////////////////////////////////////////////////////////////////////////////
// MockCommandQueue - Core implementation
////////////////////////////////////////////////////////////////////////////////
@ -340,6 +342,16 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
return BaseClass::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList, skipWait);
}
// Mock override of the two-argument waitForAllEngines.
// Counts every call; if a test has set waitForAllEnginesReturnValue, that
// value is returned without touching the base class, otherwise the call is
// delegated to BaseClass::waitForAllEngines.
WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) override {
    ++waitForAllEnginesCalledCount;

    const bool hasForcedResult = static_cast<bool>(waitForAllEnginesReturnValue);
    if (!hasForcedResult) {
        return BaseClass::waitForAllEngines(blockedQueue, printfHandler);
    }
    return *waitForAllEnginesReturnValue;
}
bool isCacheFlushForBcsRequired() const override {
if (overrideIsCacheFlushForBcsRequired.enabled) {
return overrideIsCacheFlushForBcsRequired.returnValue;
@ -373,6 +385,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
BuiltinOpParams kernelParams;
std::atomic<uint32_t> latestTaskCountWaited{std::numeric_limits<uint32_t>::max()};
bool flushCalled = false;
// When set, the mocked two-argument waitForAllEngines returns this value instead of calling the base class.
std::optional<WaitStatus> waitForAllEnginesReturnValue{};
// Number of times the mocked two-argument waitForAllEngines has been invoked.
int waitForAllEnginesCalledCount{0};
LinearStream *peekCommandStream() {
return this->commandStream;

View File

@ -5,6 +5,7 @@
*
*/
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/local_memory_access_modes.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
@ -220,13 +221,16 @@ HWTEST_F(PrintfHandlerTests, givenPrintfHandlerWhenEnqueueIsBlockedThenDontUsePr
using CommandQueueHw<FamilyType>::CommandQueueHw;
using CommandQueueHw<FamilyType>::enqueueKernel;
void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) override {
WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) override {
waitCalled = true;
printfHandlerUsedForWait = printfHandler;
return waitForAllEnginesReturnValue;
}
bool waitCalled = false;
PrintfHandler *printfHandlerUsedForWait = nullptr;
WaitStatus waitForAllEnginesReturnValue = WaitStatus::Ready;
};
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));

View File

@ -232,6 +232,10 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
return false;
}
if (shouldFlushBatchedSubmissionsReturnSuccess) {
return true;
}
return CommandStreamReceiverHw<GfxFamily>::flushBatchedSubmissions();
}
void initProgrammingFlags() override {
@ -341,6 +345,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
bool callBaseIsMultiOsContextCapable = false;
bool callBaseWaitForCompletionWithTimeout = true;
bool shouldFailFlushBatchedSubmissions = false;
bool shouldFlushBatchedSubmissionsReturnSuccess = false;
WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::Ready;
};
} // namespace NEO