Check for GPU hang in path with wait for timestamps

Related-To: NEO-6868

Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
This commit is contained in:
Milczarek, Slawomir
2022-09-19 10:20:14 +00:00
committed by Compute-Runtime-Automation
parent 1ca5d57ab0
commit 0192e8038f
10 changed files with 65 additions and 26 deletions

View File

@ -1206,14 +1206,18 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri
}
}
auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount);
auto waitStatus = WaitStatus::NotReady;
auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount, waitStatus);
if (waitStatus == WaitStatus::GpuHang) {
return WaitStatus::GpuHang;
}
TimestampPacketContainer nodesToRelease;
if (deferredTimestampPackets) {
deferredTimestampPackets->swapNodes(nodesToRelease);
}
const auto waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
if (printfHandler) {
if (!printfHandler->printEnqueueOutput()) {

View File

@ -204,7 +204,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState);
bool isWaitForTimestampsEnabled() const;
virtual bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) = 0;
virtual bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) = 0;
MOCKABLE_VIRTUAL bool isQueueBlocked();

View File

@ -427,7 +427,7 @@ class CommandQueueHw : public CommandQueue {
bool isCacheFlushCommand(uint32_t commandType) const override;
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) override;
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) override;
MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const;

View File

@ -141,16 +141,23 @@ bool CommandQueueHw<Family>::isCacheFlushForBcsRequired() const {
}
template <typename TSPacketType>
inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container, CommandStreamReceiver &csr) {
inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container, CommandStreamReceiver &csr, WaitStatus &status) {
bool waited = false;
status = WaitStatus::NotReady;
if (container) {
auto lastHangCheckTime = std::chrono::high_resolution_clock::now();
for (const auto &timestamp : container->peekNodes()) {
for (uint32_t i = 0; i < timestamp->getPacketsUsed(); i++) {
while (timestamp->getContextEndValue(i) == 1) {
csr.downloadAllocation(*timestamp->getBaseGraphicsAllocation()->getGraphicsAllocation(csr.getRootDeviceIndex()));
WaitUtils::waitFunctionWithPredicate<const TSPacketType>(static_cast<TSPacketType const *>(timestamp->getContextEndAddress(i)), 1u, std::not_equal_to<TSPacketType>());
if (csr.checkGpuHangDetected(std::chrono::high_resolution_clock::now(), lastHangCheckTime)) {
status = WaitStatus::GpuHang;
return false;
}
}
status = WaitStatus::Ready;
waited = true;
}
}
@ -160,14 +167,14 @@ inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container
}
template <typename Family>
bool CommandQueueHw<Family>::waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) {
bool CommandQueueHw<Family>::waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) {
using TSPacketType = typename Family::TimestampPacketType;
bool waited = false;
if (isWaitForTimestampsEnabled()) {
waited = waitForTimestampsWithinContainer<TSPacketType>(timestampPacketContainer.get(), getGpgpuCommandStreamReceiver());
waited = waitForTimestampsWithinContainer<TSPacketType>(timestampPacketContainer.get(), getGpgpuCommandStreamReceiver(), status);
if (isOOQEnabled()) {
waitForTimestampsWithinContainer<TSPacketType>(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver());
waitForTimestampsWithinContainer<TSPacketType>(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver(), status);
}
if (waited) {

View File

@ -13,6 +13,7 @@
#include "shared/test/common/mocks/mock_csr.h"
#include "shared/test/common/mocks/mock_os_library.h"
#include "shared/test/common/mocks/mock_source_level_debugger.h"
#include "shared/test/common/mocks/mock_timestamp_container.h"
#include "shared/test/common/utilities/base_object_utils.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
@ -49,12 +50,36 @@ HWTEST_F(CommandQueueHwTest, givenNoTimestampPacketsWhenWaitForTimestampsThenNoW
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);
auto taskCount = device->getUltCommandStreamReceiver<FamilyType>().peekLatestFlushedTaskCount();
auto status = WaitStatus::NotReady;
cmdQ.waitForTimestamps({}, 101u);
cmdQ.waitForTimestamps({}, 101u, status);
EXPECT_EQ(device->getUltCommandStreamReceiver<FamilyType>().peekLatestFlushedTaskCount(), taskCount);
}
// Checks that waitForAllEngines() returns WaitStatus::GpuHang when the command stream
// receiver is configured to report a GPU hang and the queue holds a timestamp packet.
HWTEST_F(CommandQueueHwTest, givenEnableTimestampWaitForQueuesWhenGpuHangDetectedWhileWaitingForAllEnginesThenReturnCorrectStatus) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.EnableTimestampWaitForQueues.set(4);

    auto *execEnv = platform()->peekExecutionEnvironment();
    auto hangDevice = std::make_unique<MockClDevice>(MockDevice::create<MockDevice>(execEnv, 0u));
    MockCommandQueueHw<FamilyType> cmdQ(context, hangDevice.get(), nullptr);

    // Replace the device's CSR with a mock whose GPU-hang flag is set.
    auto *hangReportingCsr = new MockCommandStreamReceiver(*execEnv, 0, hangDevice->getDeviceBitfield());
    hangReportingCsr->isGpuHangDetectedReturnValue = true;
    hangDevice->resetCommandStreamReceiver(hangReportingCsr);

    // Give the queue a single timestamp packet node obtained from a mock tag allocator.
    auto *tagAllocator = new MockTagAllocator<>(0, hangDevice->getMemoryManager());
    hangReportingCsr->timestampPacketAllocator.reset(tagAllocator);
    cmdQ.timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
    cmdQ.timestampPacketContainer->add(tagAllocator->getTag());

    const auto status = cmdQ.waitForAllEngines(false, nullptr, false);
    EXPECT_EQ(WaitStatus::GpuHang, status);
}
HWTEST_F(CommandQueueHwTest, WhenDebugSurfaceIsAllocatedThenBufferIsZeroed) {
ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment();
executionEnvironment->rootDeviceEnvironments[0]->debugger.reset(new MockActiveSourceLevelDebugger(new MockOsLibrary));

View File

@ -212,7 +212,7 @@ class MockCommandQueue : public CommandQueue {
// Mock override: returns the isCacheFlushRequired argument unchanged.
bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const override { return isCacheFlushRequired; }
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) override { return false; };
// Mock override: performs no wait, always returns false, and leaves `status` unmodified.
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) override { return false; };
bool releaseIndirectHeapCalled = false;