mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-10 12:53:42 +08:00
Detect GPU hangs in clFinish
This change introduces detection of GPU hangs in the clFinish function, as well as unit tests that cover the new code.

Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
cf1bc3a2ba
commit
0ecc7c5e3b
@ -995,7 +995,7 @@ bool CommandQueue::isWaitForTimestampsEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) {
|
||||
WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) {
|
||||
if (blockedQueue) {
|
||||
while (isQueueBlocked()) {
|
||||
}
|
||||
@ -1014,11 +1014,14 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
|
||||
activeBcsStates.push_back(state);
|
||||
}
|
||||
}
|
||||
waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
|
||||
|
||||
const auto waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
|
||||
|
||||
if (printfHandler) {
|
||||
printfHandler->printEnqueueOutput();
|
||||
}
|
||||
|
||||
return waitStatus;
|
||||
}
|
||||
|
||||
void CommandQueue::setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies) {
|
||||
|
@ -213,9 +213,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
|
||||
return this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
|
||||
}
|
||||
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
|
||||
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
|
||||
this->waitForAllEngines(blockedQueue, printfHandler, true);
|
||||
MOCKABLE_VIRTUAL WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
|
||||
MOCKABLE_VIRTUAL WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
|
||||
return this->waitForAllEngines(blockedQueue, printfHandler, true);
|
||||
}
|
||||
|
||||
static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@ -7,6 +7,7 @@
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
|
||||
#include "opencl/source/command_queue/command_queue_hw.h"
|
||||
|
||||
@ -20,7 +21,10 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
|
||||
}
|
||||
|
||||
// Stall until HW reaches taskCount on all its engines
|
||||
waitForAllEngines(true, nullptr);
|
||||
const auto waitStatus = waitForAllEngines(true, nullptr);
|
||||
if (waitStatus == WaitStatus::GpuHang) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/test/common/cmd_parse/hw_parse.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||
@ -1497,6 +1498,28 @@ HWTEST_F(CommandQueueHwTest, givenFinishWhenFlushBatchedSubmissionsFailsThenErro
|
||||
EXPECT_EQ(CL_OUT_OF_RESOURCES, errorCode);
|
||||
}
|
||||
|
||||
HWTEST_F(CommandQueueHwTest, givenGpuHangWhenFinishingCommandQueueHwThenWaitForEnginesIsCalledAndOutOfResourcesIsReturned) {
|
||||
MockCommandQueueHw<FamilyType> mockCmdQueueHw{context, pClDevice, nullptr};
|
||||
|
||||
mockCmdQueueHw.waitForAllEnginesReturnValue = WaitStatus::GpuHang;
|
||||
mockCmdQueueHw.getUltCommandStreamReceiver().shouldFlushBatchedSubmissionsReturnSuccess = true;
|
||||
|
||||
const auto finishResult = mockCmdQueueHw.finish();
|
||||
EXPECT_EQ(1, mockCmdQueueHw.waitForAllEnginesCalledCount);
|
||||
EXPECT_EQ(CL_OUT_OF_RESOURCES, finishResult);
|
||||
}
|
||||
|
||||
HWTEST_F(CommandQueueHwTest, givenNoGpuHangWhenFinishingCommandQueueHwThenWaitForEnginesIsCalledAndSuccessIsReturned) {
|
||||
MockCommandQueueHw<FamilyType> mockCmdQueueHw{context, pClDevice, nullptr};
|
||||
|
||||
mockCmdQueueHw.waitForAllEnginesReturnValue = WaitStatus::Ready;
|
||||
mockCmdQueueHw.getUltCommandStreamReceiver().shouldFlushBatchedSubmissionsReturnSuccess = true;
|
||||
|
||||
const auto finishResult = mockCmdQueueHw.finish();
|
||||
EXPECT_EQ(1, mockCmdQueueHw.waitForAllEnginesCalledCount);
|
||||
EXPECT_EQ(CL_SUCCESS, finishResult);
|
||||
}
|
||||
|
||||
HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) {
|
||||
auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
auto srcBuffer = std::unique_ptr<Buffer>{BufferHelper<>::create(pContext)};
|
||||
|
@ -1079,7 +1079,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe
|
||||
mockCsr->taskCount.store(10);
|
||||
mockCsr->latestFlushedTaskCount.store(5);
|
||||
|
||||
commandQueue.waitForAllEngines(false, nullptr);
|
||||
const auto waitStatus = commandQueue.waitForAllEngines(false, nullptr);
|
||||
EXPECT_EQ(WaitStatus::Ready, waitStatus);
|
||||
|
||||
parseCommands<FamilyType>(mockCsr->getCS(4096u));
|
||||
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
|
||||
@ -1110,7 +1111,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEnabledDirectSubmissionUpdate
|
||||
mockCsr->taskCount.store(10);
|
||||
mockCsr->latestFlushedTaskCount.store(5);
|
||||
|
||||
commandQueue.waitForAllEngines(false, nullptr);
|
||||
const auto waitStatus = commandQueue.waitForAllEngines(false, nullptr);
|
||||
EXPECT_EQ(WaitStatus::Ready, waitStatus);
|
||||
|
||||
parseCommands<FamilyType>(mockCsr->getCS(4096u));
|
||||
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
|
||||
|
@ -13,6 +13,8 @@
|
||||
|
||||
#include "opencl/source/command_queue/command_queue_hw.h"
|
||||
|
||||
#include <optional>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// MockCommandQueue - Core implementation
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@ -340,6 +342,16 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
||||
return BaseClass::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList, skipWait);
|
||||
}
|
||||
|
||||
WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) override {
|
||||
waitForAllEnginesCalledCount++;
|
||||
|
||||
if (waitForAllEnginesReturnValue.has_value()) {
|
||||
return *waitForAllEnginesReturnValue;
|
||||
}
|
||||
|
||||
return BaseClass::waitForAllEngines(blockedQueue, printfHandler);
|
||||
}
|
||||
|
||||
bool isCacheFlushForBcsRequired() const override {
|
||||
if (overrideIsCacheFlushForBcsRequired.enabled) {
|
||||
return overrideIsCacheFlushForBcsRequired.returnValue;
|
||||
@ -373,6 +385,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
||||
BuiltinOpParams kernelParams;
|
||||
std::atomic<uint32_t> latestTaskCountWaited{std::numeric_limits<uint32_t>::max()};
|
||||
bool flushCalled = false;
|
||||
std::optional<WaitStatus> waitForAllEnginesReturnValue{};
|
||||
int waitForAllEnginesCalledCount{0};
|
||||
|
||||
LinearStream *peekCommandStream() {
|
||||
return this->commandStream;
|
||||
|
@ -5,6 +5,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/helpers/local_memory_access_modes.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||
@ -220,13 +221,16 @@ HWTEST_F(PrintfHandlerTests, givenPrintfHandlerWhenEnqueueIsBlockedThenDontUsePr
|
||||
using CommandQueueHw<FamilyType>::CommandQueueHw;
|
||||
using CommandQueueHw<FamilyType>::enqueueKernel;
|
||||
|
||||
void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) override {
|
||||
WaitStatus waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) override {
|
||||
waitCalled = true;
|
||||
printfHandlerUsedForWait = printfHandler;
|
||||
|
||||
return waitForAllEnginesReturnValue;
|
||||
}
|
||||
|
||||
bool waitCalled = false;
|
||||
PrintfHandler *printfHandlerUsedForWait = nullptr;
|
||||
WaitStatus waitForAllEnginesReturnValue = WaitStatus::Ready;
|
||||
};
|
||||
|
||||
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
|
||||
|
@ -232,6 +232,10 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
|
||||
return false;
|
||||
}
|
||||
|
||||
if (shouldFlushBatchedSubmissionsReturnSuccess) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return CommandStreamReceiverHw<GfxFamily>::flushBatchedSubmissions();
|
||||
}
|
||||
void initProgrammingFlags() override {
|
||||
@ -341,6 +345,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
|
||||
bool callBaseIsMultiOsContextCapable = false;
|
||||
bool callBaseWaitForCompletionWithTimeout = true;
|
||||
bool shouldFailFlushBatchedSubmissions = false;
|
||||
bool shouldFlushBatchedSubmissionsReturnSuccess = false;
|
||||
WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::Ready;
|
||||
};
|
||||
} // namespace NEO
|
||||
|
Reference in New Issue
Block a user