From 1f60935930f77ea048f85bdfdf8006d81b001afb Mon Sep 17 00:00:00 2001 From: Szymon Morek Date: Wed, 30 Oct 2024 14:02:28 +0000 Subject: [PATCH] fix: don't return csr as busy if gpu hang is detected Related-To: NEO-13071 Signed-off-by: Szymon Morek --- .../source/command_stream/command_stream_receiver.h | 4 ++++ .../direct_submission_controller.cpp | 7 ++++--- .../direct_submission/direct_submission_controller.h | 2 ++ .../direct_submission_controller_tests.cpp | 12 ++++++++++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 352ad5bdbe..00380461a0 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -548,6 +548,10 @@ class CommandStreamReceiver { return !testTaskCountReady(getTagAddress(), this->taskCount); } + bool isBusyWithoutHang(TimeType &lastHangCheckTime) { + return isBusy() && !this->checkGpuHangDetected(std::chrono::high_resolution_clock::now(), lastHangCheckTime); + } + bool canUse4GbHeaps() const { return this->use4GbHeaps; } diff --git a/shared/source/direct_submission/direct_submission_controller.cpp b/shared/source/direct_submission/direct_submission_controller.cpp index af2d4f2934..da618bd32c 100644 --- a/shared/source/direct_submission/direct_submission_controller.cpp +++ b/shared/source/direct_submission/direct_submission_controller.cpp @@ -125,6 +125,7 @@ void *DirectSubmissionController::controlDirectSubmissionsState(void *self) { } controller->timeSinceLastCheck = controller->getCpuTimestamp(); + controller->lastHangCheckTime = std::chrono::high_resolution_clock::now(); while (true) { if (!controller->keepControlling.load()) { return nullptr; @@ -190,7 +191,7 @@ void DirectSubmissionController::checkNewSubmissions() { bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *csr, std::unique_lock &csrLock) { if (csr->peekLatestFlushedTaskCount() == csr->peekTaskCount()) { - return !csr->isBusy(); + return !csr->isBusyWithoutHang(lastHangCheckTime); } csr->flushTagUpdate(); @@ -203,13 +204,13 @@ bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *c // unblock csr during polling csrLock.unlock(); while (currCpuTimeInNS < timeToWait) { - if (!csr->isBusy()) { + if (!csr->isBusyWithoutHang(lastHangCheckTime)) { break; } osTime->getCpuTime(&currCpuTimeInNS); } csrLock.lock(); - return !csr->isBusy(); + return !csr->isBusyWithoutHang(lastHangCheckTime); } SteadyClock::time_point DirectSubmissionController::getCpuTimestamp() { diff --git a/shared/source/direct_submission/direct_submission_controller.h b/shared/source/direct_submission/direct_submission_controller.h index 15be336b4b..0ebd5c0c09 100644 --- a/shared/source/direct_submission/direct_submission_controller.h +++ b/shared/source/direct_submission/direct_submission_controller.h @@ -27,6 +27,7 @@ class Thread; class ProductHelper; using SteadyClock = std::chrono::steady_clock; +using HighResolutionClock = std::chrono::high_resolution_clock; struct TimeoutParams { std::chrono::microseconds maxTimeout; @@ -118,6 +119,7 @@ class DirectSubmissionController { SteadyClock::time_point timeSinceLastCheck{}; SteadyClock::time_point lastTerminateCpuTimestamp{}; + HighResolutionClock::time_point lastHangCheckTime{}; std::chrono::microseconds maxTimeout{defaultTimeout}; std::chrono::microseconds timeout{defaultTimeout}; int32_t timeoutDivisor = 1; diff --git a/shared/test/unit_test/direct_submission/direct_submission_controller_tests.cpp b/shared/test/unit_test/direct_submission/direct_submission_controller_tests.cpp index c8a78c0adf..e2a0a7663d 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_controller_tests.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_controller_tests.cpp @@ -803,6 +803,18 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCount EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes); } +TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuHangThenTerminateDirectSubmission) { + csr->setLatestFlushedTaskCount(10u); + csr->isBusyReturnValue = true; + csr->isGpuHangDetectedReturnValue = true; + + controller->checkNewSubmissions(); + EXPECT_TRUE(controller->directSubmissions[csr.get()].isStopped); + EXPECT_EQ(controller->directSubmissions[csr.get()].taskCount, 10u); + EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes); + EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes); +} + TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuIdleThenTerminateDirectSubmission) { csr->setLatestFlushedTaskCount(10u); csr->isBusyReturnValue = false;