From 835b344968607b957f9a486c658d485428a23f68 Mon Sep 17 00:00:00 2001 From: Patryk Wrobel Date: Wed, 9 Mar 2022 12:36:51 +0000 Subject: [PATCH] Add debug flag to disable GPU hang detection This change introduces a new debug flag called DisableGpuHangDetection. The flag defaults to false, so GPU hang detection stays enabled. When the flag is set to true, CommandStreamReceiver::isGpuHangDetected() returns false without querying the driver model, which disables hang checking. Related-To: NEO-6681 Signed-off-by: Patryk Wrobel --- opencl/test/unit_test/test_files/igdrcl.config | 1 + .../command_stream/command_stream_receiver.cpp | 5 +++++ .../debug_settings/debug_variables_base.inl | 1 + .../command_stream_receiver_tests.cpp | 16 ++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index d79754d5c6..a8d470958c 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -382,6 +382,7 @@ ReuseKernelBinaries = -1 EnableChipsetUniqueUUID = -1 ForceSimdMessageSizeInWalker = -1 UseNewQueryTopoIoctl = 1 +DisableGpuHangDetection = 0 EnableRecoverablePageFaults = -1 EnableImplicitMigrationOnFaultableHardware = -1 UseDrmVirtualEnginesForCcs = -1 diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 26cdc04958..b6c80ed781 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -12,6 +12,7 @@ #include "shared/source/command_stream/experimental_command_buffer.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/command_stream/scratch_space_controller.h" +#include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/device/device.h" #include "shared/source/direct_submission/direct_submission_controller.h" #include "shared/source/execution_environment/root_device_environment.h" @@ -249,6 +250,10 @@ bool 
CommandStreamReceiver::skipResourceCleanup() const { } bool CommandStreamReceiver::isGpuHangDetected() const { + if (DebugManager.flags.DisableGpuHangDetection.get()) { + return false; + } + return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext); } diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 87fb4771e4..af006faaa7 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -73,6 +73,7 @@ DECLARE_DEBUG_VARIABLE(bool, AllowPatchingVfeStateInCommandLists, false, "true: DECLARE_DEBUG_VARIABLE(bool, PrintMemoryRegionSizes, false, "print memory bank type, instance and it's size") DECLARE_DEBUG_VARIABLE(bool, UpdateCrossThreadDataSize, false, "Turn on cross thread data size calculation for PATCH TOKEN binary") DECLARE_DEBUG_VARIABLE(bool, UseNewQueryTopoIoctl, true, "Use DRM_I915_QUERY_COMPUTE_SLICES") +DECLARE_DEBUG_VARIABLE(bool, DisableGpuHangDetection, false, "Disable GPU hang detection") DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing") DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened. Ignored when unk.") DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened. 
Ignored when unk.") diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 7b94734c9a..b1b8cc003a 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -174,6 +174,22 @@ HWTEST_F(CommandStreamReceiverTest, whenStoreAllocationThenStoredAllocationHasTa EXPECT_EQ(csr.peekTaskCount(), allocation->getTaskCount(csr.getOsContext().getContextId())); } +HWTEST_F(CommandStreamReceiverTest, givenDisableGpuHangDetectionFlagWhenCheckingGpuHangThenDriverModelIsNotCalledAndFalseIsReturned) { + DebugManagerStateRestore stateRestore; + DebugManager.flags.DisableGpuHangDetection.set(true); + + auto driverModelMock = std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = true; + + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + + EXPECT_FALSE(csr.isGpuHangDetected()); +} + HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) { auto driverModelMock = std::make_unique(); driverModelMock->isGpuHangDetectedToReturn = true;