fix: Fix DirectSubmissionController deadlock with try_lock pattern

Related-To: NEO-13325

Replace blocking obtainUniqueOwnership() with tryObtainUniqueOwnership()
in critical paths to prevent deadlock between controller thread holding
directSubmissionsMutex and submission threads holding CSR locks.

Changes:
- Add CommandStreamReceiver::tryObtainUniqueOwnership() method
- Use try_lock in checkNewSubmissions() and context group idle detection
- Skip contended CSRs instead of blocking (conservative approach)
- Add comprehensive unit tests for try_lock functionality

Fixes deadlock scenario where registerDirectSubmission() waits for
directSubmissionsMutex while checkNewSubmissions() holds it and waits
for CSR ownership locks.

Signed-off-by: Slawomir Milczarek <slawomir.milczarek@intel.com>
This commit is contained in:
Slawomir Milczarek
2025-09-26 14:43:38 +00:00
committed by Compute-Runtime-Automation
parent 44645a3ed6
commit 40ce5eb55e
4 changed files with 82 additions and 2 deletions

View File

@@ -970,6 +970,9 @@ bool CommandStreamReceiver::createPreemptionAllocation() {
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::obtainUniqueOwnership() {
return std::unique_lock<CommandStreamReceiver::MutexType>(this->ownershipMutex);
}
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::tryObtainUniqueOwnership() {
return std::unique_lock<CommandStreamReceiver::MutexType>(this->ownershipMutex, std::try_to_lock);
}
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::obtainHostPtrSurfaceCreationLock() {
return std::unique_lock<CommandStreamReceiver::MutexType>(this->hostPtrSurfaceCreationMutex);
}

View File

@@ -266,6 +266,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
MOCKABLE_VIRTUAL bool createPreemptionAllocation();
MOCKABLE_VIRTUAL bool createPerDssBackedBuffer(Device &device);
[[nodiscard]] MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainUniqueOwnership();
[[nodiscard]] MOCKABLE_VIRTUAL std::unique_lock<MutexType> tryObtainUniqueOwnership();
bool peekTimestampPacketWriteEnabled() const { return timestampPacketWriteEnabled; }

View File

@@ -136,7 +136,10 @@ void DirectSubmissionController::checkNewSubmissions() {
if (!isBcs && csr->getProductHelper().checkBcsForDirectSubmissionStop()) {
isCopyEngineIdle = isCopyEngineOnDeviceIdle(csr->getRootDeviceIndex(), bcsTaskCount);
}
auto lock = csr->obtainUniqueOwnership();
auto lock = csr->tryObtainUniqueOwnership();
if (!lock.owns_lock()) {
continue; // Skip this CSR if contended - avoid deadlock
}
if (!isCsrIdleDetectionEnabled || (isDirectSubmissionIdle(csr, lock) && isCopyEngineIdle)) {
csr->stopDirectSubmission(false, false);
state.isStopped = true;
@@ -206,7 +209,11 @@ bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *c
auto otherKey = ContextGroupKey{otherCsr->getRootDeviceIndex(), otherCsr->getContextGroupId()};
if (otherKey == myKey) {
auto otherLock = otherCsr->obtainUniqueOwnership();
auto otherLock = otherCsr->tryObtainUniqueOwnership();
if (!otherLock.owns_lock()) {
allOthersIdle = false;
break; // Treat contended CSR as active - avoid deadlock
}
if (!checkCSRIdle(otherCsr, otherLock)) {
allOthersIdle = false;
break; // Early exit for performance

View File

@@ -459,6 +459,75 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenDebugFlagSetWhenTaskCountNotUpda
EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
}
TEST_F(DirectSubmissionIdleDetectionTests, givenSimulatedCsrLockContentionWhenCheckNewSubmissionsCalledThenTryLockSkipsCsr) {
// Setup: Make CSR appear idle (should normally be stopped)
csr->taskCount.store(10u);
csr->setLatestFlushedTaskCount(10u);
csr->isBusyReturnValue = false;
// First, verify normal operation works
controller->checkNewSubmissions();
EXPECT_TRUE(controller->directSubmissions[csr.get()].isStopped);
EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes);
// Reset for contention simulation
controller->directSubmissions[csr.get()].isStopped = false;
csr->stopDirectSubmissionCalledTimes = 0;
// Create a contention-simulating CSR that overrides tryObtainUniqueOwnership
struct ContentionSimulatingCsr : public TagUpdateMockCommandStreamReceiver {
using TagUpdateMockCommandStreamReceiver::TagUpdateMockCommandStreamReceiver;
bool simulateContention = false;
std::unique_lock<CommandStreamReceiver::MutexType> tryObtainUniqueOwnership() override {
if (simulateContention) {
return std::unique_lock<CommandStreamReceiver::MutexType>();
}
return TagUpdateMockCommandStreamReceiver::tryObtainUniqueOwnership();
}
};
// Replace the existing CSR with our contention-simulating one
controller->unregisterDirectSubmission(csr.get());
DeviceBitfield deviceBitfield(1);
auto contentionCsr = std::make_unique<ContentionSimulatingCsr>(executionEnvironment, 0, deviceBitfield);
auto newOsContext = std::unique_ptr<OsContext>(OsContext::create(nullptr, 0, 1,
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular},
PreemptionMode::ThreadGroup, deviceBitfield)));
contentionCsr->setupContext(*newOsContext);
contentionCsr->taskCount.store(10u);
contentionCsr->setLatestFlushedTaskCount(10u);
contentionCsr->isBusyReturnValue = false;
controller->registerDirectSubmission(contentionCsr.get());
controller->checkNewSubmissions();
controller->directSubmissions[contentionCsr.get()].isStopped = false;
contentionCsr->stopDirectSubmissionCalledTimes = 0;
// Enable contention simulation - this will make tryObtainUniqueOwnership() fail
contentionCsr->simulateContention = true;
controller->checkNewSubmissions();
// Verify CSR was NOT stopped due to simulated contended lock (conservative behavior)
EXPECT_FALSE(controller->directSubmissions[contentionCsr.get()].isStopped);
EXPECT_EQ(0u, contentionCsr->stopDirectSubmissionCalledTimes);
// Disable contention simulation and verify normal operation resumes
contentionCsr->simulateContention = false;
controller->checkNewSubmissions();
// Now it should be stopped since no contention exists
EXPECT_TRUE(controller->directSubmissions[contentionCsr.get()].isStopped);
EXPECT_EQ(1u, contentionCsr->stopDirectSubmissionCalledTimes);
controller->unregisterDirectSubmission(contentionCsr.get());
}
struct DirectSubmissionCheckForCopyEngineIdleTests : public ::testing::Test {
void SetUp() override {
controller = std::make_unique<DirectSubmissionControllerMock>();