mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 22:12:59 +08:00
fix: don't return csr as busy if gpu hang is detected
Related-To: NEO-13071
Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
a0ce4b78d3
commit
1f60935930
@@ -548,6 +548,10 @@ class CommandStreamReceiver {
|
||||
return !testTaskCountReady(getTagAddress(), this->taskCount);
|
||||
}
|
||||
|
||||
bool isBusyWithoutHang(TimeType &lastHangCheckTime) {
|
||||
return isBusy() && !this->checkGpuHangDetected(std::chrono::high_resolution_clock::now(), lastHangCheckTime);
|
||||
}
|
||||
|
||||
bool canUse4GbHeaps() const {
|
||||
return this->use4GbHeaps;
|
||||
}
|
||||
|
||||
@@ -125,6 +125,7 @@ void *DirectSubmissionController::controlDirectSubmissionsState(void *self) {
|
||||
}
|
||||
|
||||
controller->timeSinceLastCheck = controller->getCpuTimestamp();
|
||||
controller->lastHangCheckTime = std::chrono::high_resolution_clock::now();
|
||||
while (true) {
|
||||
if (!controller->keepControlling.load()) {
|
||||
return nullptr;
|
||||
@@ -190,7 +191,7 @@ void DirectSubmissionController::checkNewSubmissions() {
|
||||
|
||||
bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *csr, std::unique_lock<std::recursive_mutex> &csrLock) {
|
||||
if (csr->peekLatestFlushedTaskCount() == csr->peekTaskCount()) {
|
||||
return !csr->isBusy();
|
||||
return !csr->isBusyWithoutHang(lastHangCheckTime);
|
||||
}
|
||||
|
||||
csr->flushTagUpdate();
|
||||
@@ -203,13 +204,13 @@ bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *c
|
||||
// unblock csr during polling
|
||||
csrLock.unlock();
|
||||
while (currCpuTimeInNS < timeToWait) {
|
||||
if (!csr->isBusy()) {
|
||||
if (!csr->isBusyWithoutHang(lastHangCheckTime)) {
|
||||
break;
|
||||
}
|
||||
osTime->getCpuTime(&currCpuTimeInNS);
|
||||
}
|
||||
csrLock.lock();
|
||||
return !csr->isBusy();
|
||||
return !csr->isBusyWithoutHang(lastHangCheckTime);
|
||||
}
|
||||
|
||||
SteadyClock::time_point DirectSubmissionController::getCpuTimestamp() {
|
||||
|
||||
@@ -27,6 +27,7 @@ class Thread;
|
||||
class ProductHelper;
|
||||
|
||||
using SteadyClock = std::chrono::steady_clock;
|
||||
using HighResolutionClock = std::chrono::high_resolution_clock;
|
||||
|
||||
struct TimeoutParams {
|
||||
std::chrono::microseconds maxTimeout;
|
||||
@@ -118,6 +119,7 @@ class DirectSubmissionController {
|
||||
|
||||
SteadyClock::time_point timeSinceLastCheck{};
|
||||
SteadyClock::time_point lastTerminateCpuTimestamp{};
|
||||
HighResolutionClock::time_point lastHangCheckTime{};
|
||||
std::chrono::microseconds maxTimeout{defaultTimeout};
|
||||
std::chrono::microseconds timeout{defaultTimeout};
|
||||
int32_t timeoutDivisor = 1;
|
||||
|
||||
@@ -803,6 +803,18 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCount
|
||||
EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
|
||||
}
|
||||
|
||||
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuHangThenTerminateDirectSubmission) {
|
||||
csr->setLatestFlushedTaskCount(10u);
|
||||
csr->isBusyReturnValue = true;
|
||||
csr->isGpuHangDetectedReturnValue = true;
|
||||
|
||||
controller->checkNewSubmissions();
|
||||
EXPECT_TRUE(controller->directSubmissions[csr.get()].isStopped);
|
||||
EXPECT_EQ(controller->directSubmissions[csr.get()].taskCount, 10u);
|
||||
EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes);
|
||||
EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
|
||||
}
|
||||
|
||||
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuIdleThenTerminateDirectSubmission) {
|
||||
csr->setLatestFlushedTaskCount(10u);
|
||||
csr->isBusyReturnValue = false;
|
||||
|
||||
Reference in New Issue
Block a user