diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index a9f6e0ec0c..3cc4d23714 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -7,6 +7,7 @@ #pragma once #include "shared/source/helpers/common_types.h" +#include "shared/source/helpers/constants.h" #include "shared/source/helpers/timestamp_packet_constants.h" #include "shared/source/helpers/timestamp_packet_container.h" #include "shared/source/memory_manager/multi_graphics_allocation.h" @@ -352,7 +353,7 @@ struct Event : _ze_event_handle_t { uint64_t inOrderExecSignalValue = 0; uint32_t inOrderAllocationOffset = 0; - std::chrono::microseconds gpuHangCheckPeriod{500'000}; + std::chrono::microseconds gpuHangCheckPeriod{CommonConstants::gpuHangCheckTimeInUS}; std::bitset l3FlushAppliedOnKernel; size_t contextStartOffset = 0u; diff --git a/level_zero/core/source/fence/fence.h b/level_zero/core/source/fence/fence.h index d4e6fbd155..b9acdf585e 100644 --- a/level_zero/core/source/fence/fence.h +++ b/level_zero/core/source/fence/fence.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #pragma once #include "shared/source/command_stream/task_count_helper.h" +#include "shared/source/helpers/constants.h" #include @@ -39,7 +40,7 @@ struct Fence : _ze_fence_handle_t { protected: Fence(CommandQueueImp *cmdQueueImp) : cmdQueue(cmdQueueImp) {} - std::chrono::microseconds gpuHangCheckPeriod{500'000}; + std::chrono::microseconds gpuHangCheckPeriod{CommonConstants::gpuHangCheckTimeInUS}; CommandQueueImp *cmdQueue; TaskCountType taskCount = 0; }; diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index d53a7fb934..8e6e494cc9 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -652,7 +652,7 @@ class CommandStreamReceiver { SamplerCacheFlushState samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired; PreemptionMode lastPreemptionMode = PreemptionMode::Initial; - std::chrono::microseconds gpuHangCheckPeriod{500'000}; + std::chrono::microseconds gpuHangCheckPeriod{CommonConstants::gpuHangCheckTimeInUS}; uint32_t lastSentL3Config = 0; uint32_t latestSentStatelessMocsConfig = CacheSettings::unknownMocs; uint64_t lastSentSliceCount = QueueSliceCount::defaultSliceCount; diff --git a/shared/source/direct_submission/linux/drm_direct_submission.h b/shared/source/direct_submission/linux/drm_direct_submission.h index 7f3349c7fe..66725cc224 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.h +++ b/shared/source/direct_submission/linux/drm_direct_submission.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -35,11 +35,13 @@ class DrmDirectSubmission : public DirectSubmissionHw { void getTagAddressValue(TagData &tagData) override; bool isCompleted(uint32_t ringBufferIndex) override; bool isCompletionFenceSupported(); + bool isGpuHangDetected(std::chrono::high_resolution_clock::time_point &lastHangCheckTime); MOCKABLE_VIRTUAL void wait(TaskCountType taskCountToWait); TagData currentTagData{}; volatile TagAddressType *tagAddress; TaskCountType completionFenceValue{}; + std::chrono::microseconds gpuHangCheckPeriod{CommonConstants::gpuHangCheckTimeInUS}; }; } // namespace NEO diff --git a/shared/source/direct_submission/linux/drm_direct_submission.inl b/shared/source/direct_submission/linux/drm_direct_submission.inl index c25fe8cbd7..eb429d515d 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.inl +++ b/shared/source/direct_submission/linux/drm_direct_submission.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -246,12 +246,31 @@ bool DrmDirectSubmission::isCompletionFenceSupported() { template void DrmDirectSubmission::wait(TaskCountType taskCountToWait) { + auto lastHangCheckTime = std::chrono::high_resolution_clock::now(); auto pollAddress = this->tagAddress; for (uint32_t i = 0; i < this->activeTiles; i++) { - while (!WaitUtils::waitFunction(pollAddress, taskCountToWait)) { + while (!WaitUtils::waitFunction(pollAddress, taskCountToWait) && + !isGpuHangDetected(lastHangCheckTime)) { } pollAddress = ptrOffset(pollAddress, this->immWritePostSyncOffset); } } +template +bool DrmDirectSubmission::isGpuHangDetected(std::chrono::high_resolution_clock::time_point &lastHangCheckTime) { + if (!this->detectGpuHang) { + return false; + } + + auto currentTime = std::chrono::high_resolution_clock::now(); + auto elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); + if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) { + lastHangCheckTime = currentTime; + auto osContextLinux = static_cast(&this->osContext); + auto &drm = osContextLinux->getDrm(); + return drm.isGpuHangDetected(this->osContext); + } + return false; +} + } // namespace NEO diff --git a/shared/source/helpers/constants.h b/shared/source/helpers/constants.h index 8bf05a2828..f360f0c75d 100644 --- a/shared/source/helpers/constants.h +++ b/shared/source/helpers/constants.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -93,5 +93,6 @@ inline constexpr uint32_t maximalSizeOfAtomicType = 8; inline constexpr uint32_t engineGroupCount = static_cast(NEO::EngineGroupType::maxEngineGroups); inline constexpr uint32_t maxWorkgroupSize = 1024u; inline constexpr uint32_t minimalSyncBufferSize = 12; +inline constexpr uint32_t gpuHangCheckTimeInUS = 500'000; inline constexpr double defaultProfilingTimerResolution = 83.333; } // namespace CommonConstants diff --git a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp index 4f28b3ffb7..a34ac871fa 100644 --- a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -82,6 +82,7 @@ struct MockDrmDirectSubmission : public DrmDirectSubmissionfreeGraphicsMemory(commandBuffer); *drmDirectSubmission.tagAddress = 1; } + +HWTEST_F(DrmDirectSubmissionTest, givenGpuHangWhenWaitCalledThenGpuHangDetected) { + using Dispatcher = RenderDispatcher; + + VariableBackup backupWaitpkgUse(&WaitUtils::waitpkgUse, false); + VariableBackup backupWaitCount(&WaitUtils::waitCount, 1); + + MockDrmDirectSubmission directSubmission(*device->getDefaultEngine().commandStreamReceiver); + directSubmission.gpuHangCheckPeriod = {}; + bool ret = directSubmission.allocateResources(); + EXPECT_TRUE(ret); + + auto pollAddress = directSubmission.tagAddress; + *pollAddress = 0; + + auto drm = static_cast(executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as()); + ResetStats resetStats{}; + resetStats.contextId = 0; + resetStats.batchActive = 1; + drm->resetStatsToReturn.push_back(resetStats); + + EXPECT_EQ(0, drm->ioctlCount.getResetStats); + directSubmission.wait(1); + EXPECT_EQ(1, drm->ioctlCount.getResetStats); +} \ No newline at end of file