mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-10 12:53:42 +08:00
Check for GPU hang in path with wait for timestamps
Related-To: NEO-6868 Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
1ca5d57ab0
commit
0192e8038f
@ -1206,14 +1206,18 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri
|
||||
}
|
||||
}
|
||||
|
||||
auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount);
|
||||
auto waitStatus = WaitStatus::NotReady;
|
||||
auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount, waitStatus);
|
||||
if (waitStatus == WaitStatus::GpuHang) {
|
||||
return WaitStatus::GpuHang;
|
||||
}
|
||||
|
||||
TimestampPacketContainer nodesToRelease;
|
||||
if (deferredTimestampPackets) {
|
||||
deferredTimestampPackets->swapNodes(nodesToRelease);
|
||||
}
|
||||
|
||||
const auto waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
|
||||
waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
|
||||
|
||||
if (printfHandler) {
|
||||
if (!printfHandler->printEnqueueOutput()) {
|
||||
|
@ -204,7 +204,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState);
|
||||
|
||||
bool isWaitForTimestampsEnabled() const;
|
||||
virtual bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) = 0;
|
||||
virtual bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) = 0;
|
||||
|
||||
MOCKABLE_VIRTUAL bool isQueueBlocked();
|
||||
|
||||
|
@ -427,7 +427,7 @@ class CommandQueueHw : public CommandQueue {
|
||||
|
||||
bool isCacheFlushCommand(uint32_t commandType) const override;
|
||||
|
||||
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) override;
|
||||
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) override;
|
||||
|
||||
MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const;
|
||||
|
||||
|
@ -141,16 +141,23 @@ bool CommandQueueHw<Family>::isCacheFlushForBcsRequired() const {
|
||||
}
|
||||
|
||||
template <typename TSPacketType>
|
||||
inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container, CommandStreamReceiver &csr) {
|
||||
inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container, CommandStreamReceiver &csr, WaitStatus &status) {
|
||||
bool waited = false;
|
||||
status = WaitStatus::NotReady;
|
||||
|
||||
if (container) {
|
||||
auto lastHangCheckTime = std::chrono::high_resolution_clock::now();
|
||||
for (const auto &timestamp : container->peekNodes()) {
|
||||
for (uint32_t i = 0; i < timestamp->getPacketsUsed(); i++) {
|
||||
while (timestamp->getContextEndValue(i) == 1) {
|
||||
csr.downloadAllocation(*timestamp->getBaseGraphicsAllocation()->getGraphicsAllocation(csr.getRootDeviceIndex()));
|
||||
WaitUtils::waitFunctionWithPredicate<const TSPacketType>(static_cast<TSPacketType const *>(timestamp->getContextEndAddress(i)), 1u, std::not_equal_to<TSPacketType>());
|
||||
if (csr.checkGpuHangDetected(std::chrono::high_resolution_clock::now(), lastHangCheckTime)) {
|
||||
status = WaitStatus::GpuHang;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
status = WaitStatus::Ready;
|
||||
waited = true;
|
||||
}
|
||||
}
|
||||
@ -160,14 +167,14 @@ inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
bool CommandQueueHw<Family>::waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) {
|
||||
bool CommandQueueHw<Family>::waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) {
|
||||
using TSPacketType = typename Family::TimestampPacketType;
|
||||
bool waited = false;
|
||||
|
||||
if (isWaitForTimestampsEnabled()) {
|
||||
waited = waitForTimestampsWithinContainer<TSPacketType>(timestampPacketContainer.get(), getGpgpuCommandStreamReceiver());
|
||||
waited = waitForTimestampsWithinContainer<TSPacketType>(timestampPacketContainer.get(), getGpgpuCommandStreamReceiver(), status);
|
||||
if (isOOQEnabled()) {
|
||||
waitForTimestampsWithinContainer<TSPacketType>(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver());
|
||||
waitForTimestampsWithinContainer<TSPacketType>(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver(), status);
|
||||
}
|
||||
|
||||
if (waited) {
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "shared/test/common/mocks/mock_csr.h"
|
||||
#include "shared/test/common/mocks/mock_os_library.h"
|
||||
#include "shared/test/common/mocks/mock_source_level_debugger.h"
|
||||
#include "shared/test/common/mocks/mock_timestamp_container.h"
|
||||
#include "shared/test/common/utilities/base_object_utils.h"
|
||||
|
||||
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
|
||||
@ -49,12 +50,36 @@ HWTEST_F(CommandQueueHwTest, givenNoTimestampPacketsWhenWaitForTimestampsThenNoW
|
||||
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
|
||||
MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);
|
||||
auto taskCount = device->getUltCommandStreamReceiver<FamilyType>().peekLatestFlushedTaskCount();
|
||||
auto status = WaitStatus::NotReady;
|
||||
|
||||
cmdQ.waitForTimestamps({}, 101u);
|
||||
cmdQ.waitForTimestamps({}, 101u, status);
|
||||
|
||||
EXPECT_EQ(device->getUltCommandStreamReceiver<FamilyType>().peekLatestFlushedTaskCount(), taskCount);
|
||||
}
|
||||
|
||||
HWTEST_F(CommandQueueHwTest, givenEnableTimestampWaitForQueuesWhenGpuHangDetectedWhileWaitingForAllEnginesThenReturnCorrectStatus) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.EnableTimestampWaitForQueues.set(4);
|
||||
|
||||
ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment();
|
||||
auto device = std::make_unique<MockClDevice>(MockDevice::create<MockDevice>(executionEnvironment, 0u));
|
||||
MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);
|
||||
auto status = WaitStatus::NotReady;
|
||||
|
||||
auto mockCSR = new MockCommandStreamReceiver(*executionEnvironment, 0, device->getDeviceBitfield());
|
||||
mockCSR->isGpuHangDetectedReturnValue = true;
|
||||
device->resetCommandStreamReceiver(mockCSR);
|
||||
|
||||
auto mockTagAllocator = new MockTagAllocator<>(0, device->getMemoryManager());
|
||||
mockCSR->timestampPacketAllocator.reset(mockTagAllocator);
|
||||
cmdQ.timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
|
||||
cmdQ.timestampPacketContainer->add(mockTagAllocator->getTag());
|
||||
|
||||
status = cmdQ.waitForAllEngines(false, nullptr, false);
|
||||
|
||||
EXPECT_EQ(WaitStatus::GpuHang, status);
|
||||
}
|
||||
|
||||
HWTEST_F(CommandQueueHwTest, WhenDebugSurfaceIsAllocatedThenBufferIsZeroed) {
|
||||
ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment();
|
||||
executionEnvironment->rootDeviceEnvironments[0]->debugger.reset(new MockActiveSourceLevelDebugger(new MockOsLibrary));
|
||||
|
@ -212,7 +212,7 @@ class MockCommandQueue : public CommandQueue {
|
||||
|
||||
bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const override { return isCacheFlushRequired; }
|
||||
|
||||
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount) override { return false; };
|
||||
bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, uint32_t taskCount, WaitStatus &status) override { return false; };
|
||||
|
||||
bool releaseIndirectHeapCalled = false;
|
||||
|
||||
|
Reference in New Issue
Block a user