feature: Add hang detection support for zeDeviceGetStatus

zeDeviceGetStatus now returns ZE_RESULT_ERROR_DEVICE_LOST when the command stream receiver of any device engine reports a GPU hang.
Added a ULT covering hang detection in the zeDeviceGetStatus API.

Related-To: LOCI-1558

Signed-off-by: Zhang, Winston <winston.zhang@intel.com>
This commit is contained in:
Zhang, Winston
2023-07-17 17:52:20 +00:00
committed by Compute-Runtime-Automation
parent 0a4d0917d4
commit 2a41ace67e
2 changed files with 22 additions and 0 deletions

View File

@@ -78,6 +78,13 @@ ze_result_t DeviceImp::getStatus() {
if (this->resourcesReleased) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
auto engines = neoDevice->getAllEngines();
for (auto engine : engines) {
auto csr = engine.commandStreamReceiver;
if (csr->isGpuHangDetected()) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
return ZE_RESULT_SUCCESS;
}

View File

@@ -18,10 +18,12 @@
#include "shared/source/unified_memory/usm_memory_support.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/helpers/execution_environment_helper.h"
#include "shared/test/common/helpers/mock_product_helper_hw.h"
#include "shared/test/common/helpers/raii_gfx_core_helper.h"
#include "shared/test/common/helpers/raii_product_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_compilers.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_driver_info.h"
@@ -2398,6 +2400,19 @@ TEST_F(DeviceGetStatusTest, givenCallToDeviceGetStatusThenCorrectErrorCodeIsRetu
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, res);
}
TEST_F(DeviceGetStatusTest, givenCallToDeviceGetStatusThenCorrectErrorCodeIsReturnedWhenGpuHangs) {
    // Baseline: before any hang is reported the device status query succeeds.
    EXPECT_EQ(ZE_RESULT_SUCCESS, device->getStatus());

    // Build a mock command stream receiver that reports a GPU hang and
    // install it on the underlying device through resetCommandStreamReceiver.
    auto hangingCsr = new MockCommandStreamReceiver(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    hangingCsr->isGpuHangDetectedReturnValue = true;
    neoDevice->resetCommandStreamReceiver(hangingCsr);

    // With the hang-reporting receiver in place the status query must report a lost device.
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, device->getStatus());
}
using DeviceTests = Test<DeviceFixture>;
TEST_F(DeviceTests, WhenGettingMemoryAccessPropertiesThenSuccessIsReturned) {