fix: don't return csr as busy if gpu hang is detected

Related-To: NEO-13071

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-10-30 14:02:28 +00:00
committed by Compute-Runtime-Automation
parent a0ce4b78d3
commit 1f60935930
4 changed files with 22 additions and 3 deletions

View File

@@ -548,6 +548,10 @@ class CommandStreamReceiver {
return !testTaskCountReady(getTagAddress(), this->taskCount);
}
// Returns true only when isBusy() reports the CSR as busy and
// checkGpuHangDetected() does not report a hang for the current
// high_resolution_clock time point.
// lastHangCheckTime is forwarded by reference to checkGpuHangDetected();
// presumably the callee uses it to throttle hang checks - confirm there.
bool isBusyWithoutHang(TimeType &lastHangCheckTime) {
    // Not busy at all: skip the hang query entirely.
    if (!isBusy()) {
        return false;
    }
    const auto currentTime = std::chrono::high_resolution_clock::now();
    const bool hangDetected = this->checkGpuHangDetected(currentTime, lastHangCheckTime);
    return !hangDetected;
}
// Read-only accessor for the use4GbHeaps member.
bool canUse4GbHeaps() const {
    const bool heapsAllowed = this->use4GbHeaps;
    return heapsAllowed;
}

View File

@@ -125,6 +125,7 @@ void *DirectSubmissionController::controlDirectSubmissionsState(void *self) {
}
controller->timeSinceLastCheck = controller->getCpuTimestamp();
controller->lastHangCheckTime = std::chrono::high_resolution_clock::now();
while (true) {
if (!controller->keepControlling.load()) {
return nullptr;
@@ -190,7 +191,7 @@ void DirectSubmissionController::checkNewSubmissions() {
bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *csr, std::unique_lock<std::recursive_mutex> &csrLock) {
if (csr->peekLatestFlushedTaskCount() == csr->peekTaskCount()) {
return !csr->isBusy();
return !csr->isBusyWithoutHang(lastHangCheckTime);
}
csr->flushTagUpdate();
@@ -203,13 +204,13 @@ bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *c
// unblock csr during polling
csrLock.unlock();
while (currCpuTimeInNS < timeToWait) {
if (!csr->isBusy()) {
if (!csr->isBusyWithoutHang(lastHangCheckTime)) {
break;
}
osTime->getCpuTime(&currCpuTimeInNS);
}
csrLock.lock();
return !csr->isBusy();
return !csr->isBusyWithoutHang(lastHangCheckTime);
}
SteadyClock::time_point DirectSubmissionController::getCpuTimestamp() {

View File

@@ -27,6 +27,7 @@ class Thread;
class ProductHelper;
// Shorthand for std::chrono::steady_clock.
using SteadyClock = std::chrono::steady_clock;
// Shorthand for std::chrono::high_resolution_clock.
using HighResolutionClock = std::chrono::high_resolution_clock;
struct TimeoutParams {
std::chrono::microseconds maxTimeout;
@@ -118,6 +119,7 @@ class DirectSubmissionController {
SteadyClock::time_point timeSinceLastCheck{};
SteadyClock::time_point lastTerminateCpuTimestamp{};
HighResolutionClock::time_point lastHangCheckTime{};
std::chrono::microseconds maxTimeout{defaultTimeout};
std::chrono::microseconds timeout{defaultTimeout};
int32_t timeoutDivisor = 1;

View File

@@ -803,6 +803,18 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCount
EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
}
// A CSR that reports busy but also reports a detected GPU hang must be treated
// as idle by the controller: direct submission is stopped once and no tag
// update flush is issued.
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuHangThenTerminateDirectSubmission) {
    // Arrange: flushed task count is 10, CSR reports busy, hang is reported.
    csr->setLatestFlushedTaskCount(10u);
    csr->isBusyReturnValue = true;
    csr->isGpuHangDetectedReturnValue = true;

    // Act
    controller->checkNewSubmissions();

    // Assert: the tracked submission is stopped at task count 10.
    const auto &trackedSubmission = controller->directSubmissions[csr.get()];
    EXPECT_TRUE(trackedSubmission.isStopped);
    EXPECT_EQ(trackedSubmission.taskCount, 10u);
    EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes);
    EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
}
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuIdleThenTerminateDirectSubmission) {
csr->setLatestFlushedTaskCount(10u);
csr->isBusyReturnValue = false;