Add debug flag to disable GPU hang detection

This change introduces a new debug flag called DisableGpuHangDetection.
By default the flag is off (false), so GPU hang detection stays enabled.
To disable hang checking, set this flag to true.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-03-09 12:36:51 +00:00
committed by Compute-Runtime-Automation
parent 9d4dacacca
commit 835b344968
4 changed files with 23 additions and 0 deletions

View File

@ -382,6 +382,7 @@ ReuseKernelBinaries = -1
EnableChipsetUniqueUUID = -1 EnableChipsetUniqueUUID = -1
ForceSimdMessageSizeInWalker = -1 ForceSimdMessageSizeInWalker = -1
UseNewQueryTopoIoctl = 1 UseNewQueryTopoIoctl = 1
DisableGpuHangDetection = 0
EnableRecoverablePageFaults = -1 EnableRecoverablePageFaults = -1
EnableImplicitMigrationOnFaultableHardware = -1 EnableImplicitMigrationOnFaultableHardware = -1
UseDrmVirtualEnginesForCcs = -1 UseDrmVirtualEnginesForCcs = -1

View File

@ -12,6 +12,7 @@
#include "shared/source/command_stream/experimental_command_buffer.h" #include "shared/source/command_stream/experimental_command_buffer.h"
#include "shared/source/command_stream/preemption.h" #include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/scratch_space_controller.h" #include "shared/source/command_stream/scratch_space_controller.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h" #include "shared/source/device/device.h"
#include "shared/source/direct_submission/direct_submission_controller.h" #include "shared/source/direct_submission/direct_submission_controller.h"
#include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/execution_environment/root_device_environment.h"
@ -249,6 +250,10 @@ bool CommandStreamReceiver::skipResourceCleanup() const {
} }
bool CommandStreamReceiver::isGpuHangDetected() const { bool CommandStreamReceiver::isGpuHangDetected() const {
if (DebugManager.flags.DisableGpuHangDetection.get()) {
return false;
}
return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext); return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext);
} }

View File

@ -73,6 +73,7 @@ DECLARE_DEBUG_VARIABLE(bool, AllowPatchingVfeStateInCommandLists, false, "true:
DECLARE_DEBUG_VARIABLE(bool, PrintMemoryRegionSizes, false, "print memory bank type, instance and it's size") DECLARE_DEBUG_VARIABLE(bool, PrintMemoryRegionSizes, false, "print memory bank type, instance and it's size")
DECLARE_DEBUG_VARIABLE(bool, UpdateCrossThreadDataSize, false, "Turn on cross thread data size calculation for PATCH TOKEN binary") DECLARE_DEBUG_VARIABLE(bool, UpdateCrossThreadDataSize, false, "Turn on cross thread data size calculation for PATCH TOKEN binary")
DECLARE_DEBUG_VARIABLE(bool, UseNewQueryTopoIoctl, true, "Use DRM_I915_QUERY_COMPUTE_SLICES") DECLARE_DEBUG_VARIABLE(bool, UseNewQueryTopoIoctl, true, "Use DRM_I915_QUERY_COMPUTE_SLICES")
DECLARE_DEBUG_VARIABLE(bool, DisableGpuHangDetection, false, "Disable GPU hang detection")
DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing") DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing")
DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened. Ignored when unk.") DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened. Ignored when unk.")
DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened. Ignored when unk.") DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened. Ignored when unk.")

View File

@ -174,6 +174,22 @@ HWTEST_F(CommandStreamReceiverTest, whenStoreAllocationThenStoredAllocationHasTa
EXPECT_EQ(csr.peekTaskCount(), allocation->getTaskCount(csr.getOsContext().getContextId())); EXPECT_EQ(csr.peekTaskCount(), allocation->getTaskCount(csr.getOsContext().getContextId()));
} }
HWTEST_F(CommandStreamReceiverTest, givenDisableGpuHangDetectionFlagWhenCheckingGpuHangThenDriverModelIsNotCalledAndFalseIsReturned) {
DebugManagerStateRestore stateRestore;
DebugManager.flags.DisableGpuHangDetection.set(true);
auto driverModelMock = std::make_unique<MockDriverModel>();
driverModelMock->isGpuHangDetectedToReturn = true;
auto osInterface = std::make_unique<OSInterface>();
osInterface->setDriverModel(std::move(driverModelMock));
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface);
EXPECT_FALSE(csr.isGpuHangDetected());
}
HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) { HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) {
auto driverModelMock = std::make_unique<MockDriverModel>(); auto driverModelMock = std::make_unique<MockDriverModel>();
driverModelMock->isGpuHangDetectedToReturn = true; driverModelMock->isGpuHangDetectedToReturn = true;