mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-26 23:33:20 +08:00
fix: Fix DirectSubmissionController deadlock with try_lock pattern
Related-To: NEO-13325 Replace blocking obtainUniqueOwnership() with tryObtainUniqueOwnership() in critical paths to prevent deadlock between controller thread holding directSubmissionsMutex and submission threads holding CSR locks. Changes: - Add CommandStreamReceiver::tryObtainUniqueOwnership() method - Use try_lock in checkNewSubmissions() and context group idle detection - Skip contended CSRs instead of blocking (conservative approach) - Add comprehensive unit tests for try_lock functionality Fixes deadlock scenario where registerDirectSubmission() waits for directSubmissionsMutex while checkNewSubmissions() holds it and waits for CSR ownership locks. Signed-off-by: Slawomir Milczarek <slawomir.milczarek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
44645a3ed6
commit
40ce5eb55e
@@ -970,6 +970,9 @@ bool CommandStreamReceiver::createPreemptionAllocation() {
|
||||
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::obtainUniqueOwnership() {
|
||||
return std::unique_lock<CommandStreamReceiver::MutexType>(this->ownershipMutex);
|
||||
}
|
||||
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::tryObtainUniqueOwnership() {
|
||||
return std::unique_lock<CommandStreamReceiver::MutexType>(this->ownershipMutex, std::try_to_lock);
|
||||
}
|
||||
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::obtainHostPtrSurfaceCreationLock() {
|
||||
return std::unique_lock<CommandStreamReceiver::MutexType>(this->hostPtrSurfaceCreationMutex);
|
||||
}
|
||||
|
||||
@@ -266,6 +266,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
|
||||
MOCKABLE_VIRTUAL bool createPreemptionAllocation();
|
||||
MOCKABLE_VIRTUAL bool createPerDssBackedBuffer(Device &device);
|
||||
[[nodiscard]] MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainUniqueOwnership();
|
||||
[[nodiscard]] MOCKABLE_VIRTUAL std::unique_lock<MutexType> tryObtainUniqueOwnership();
|
||||
|
||||
bool peekTimestampPacketWriteEnabled() const { return timestampPacketWriteEnabled; }
|
||||
|
||||
|
||||
@@ -136,7 +136,10 @@ void DirectSubmissionController::checkNewSubmissions() {
|
||||
if (!isBcs && csr->getProductHelper().checkBcsForDirectSubmissionStop()) {
|
||||
isCopyEngineIdle = isCopyEngineOnDeviceIdle(csr->getRootDeviceIndex(), bcsTaskCount);
|
||||
}
|
||||
auto lock = csr->obtainUniqueOwnership();
|
||||
auto lock = csr->tryObtainUniqueOwnership();
|
||||
if (!lock.owns_lock()) {
|
||||
continue; // Skip this CSR if contended - avoid deadlock
|
||||
}
|
||||
if (!isCsrIdleDetectionEnabled || (isDirectSubmissionIdle(csr, lock) && isCopyEngineIdle)) {
|
||||
csr->stopDirectSubmission(false, false);
|
||||
state.isStopped = true;
|
||||
@@ -206,7 +209,11 @@ bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *c
|
||||
|
||||
auto otherKey = ContextGroupKey{otherCsr->getRootDeviceIndex(), otherCsr->getContextGroupId()};
|
||||
if (otherKey == myKey) {
|
||||
auto otherLock = otherCsr->obtainUniqueOwnership();
|
||||
auto otherLock = otherCsr->tryObtainUniqueOwnership();
|
||||
if (!otherLock.owns_lock()) {
|
||||
allOthersIdle = false;
|
||||
break; // Treat contended CSR as active - avoid deadlock
|
||||
}
|
||||
if (!checkCSRIdle(otherCsr, otherLock)) {
|
||||
allOthersIdle = false;
|
||||
break; // Early exit for performance
|
||||
|
||||
@@ -459,6 +459,75 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenDebugFlagSetWhenTaskCountNotUpda
|
||||
EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
|
||||
}
|
||||
|
||||
TEST_F(DirectSubmissionIdleDetectionTests, givenSimulatedCsrLockContentionWhenCheckNewSubmissionsCalledThenTryLockSkipsCsr) {
|
||||
// Setup: Make CSR appear idle (should normally be stopped)
|
||||
csr->taskCount.store(10u);
|
||||
csr->setLatestFlushedTaskCount(10u);
|
||||
csr->isBusyReturnValue = false;
|
||||
|
||||
// First, verify normal operation works
|
||||
controller->checkNewSubmissions();
|
||||
EXPECT_TRUE(controller->directSubmissions[csr.get()].isStopped);
|
||||
EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes);
|
||||
|
||||
// Reset for contention simulation
|
||||
controller->directSubmissions[csr.get()].isStopped = false;
|
||||
csr->stopDirectSubmissionCalledTimes = 0;
|
||||
|
||||
// Create a contention-simulating CSR that overrides tryObtainUniqueOwnership
|
||||
struct ContentionSimulatingCsr : public TagUpdateMockCommandStreamReceiver {
|
||||
using TagUpdateMockCommandStreamReceiver::TagUpdateMockCommandStreamReceiver;
|
||||
|
||||
bool simulateContention = false;
|
||||
|
||||
std::unique_lock<CommandStreamReceiver::MutexType> tryObtainUniqueOwnership() override {
|
||||
if (simulateContention) {
|
||||
return std::unique_lock<CommandStreamReceiver::MutexType>();
|
||||
}
|
||||
return TagUpdateMockCommandStreamReceiver::tryObtainUniqueOwnership();
|
||||
}
|
||||
};
|
||||
|
||||
// Replace the existing CSR with our contention-simulating one
|
||||
controller->unregisterDirectSubmission(csr.get());
|
||||
|
||||
DeviceBitfield deviceBitfield(1);
|
||||
auto contentionCsr = std::make_unique<ContentionSimulatingCsr>(executionEnvironment, 0, deviceBitfield);
|
||||
auto newOsContext = std::unique_ptr<OsContext>(OsContext::create(nullptr, 0, 1,
|
||||
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular},
|
||||
PreemptionMode::ThreadGroup, deviceBitfield)));
|
||||
contentionCsr->setupContext(*newOsContext);
|
||||
|
||||
contentionCsr->taskCount.store(10u);
|
||||
contentionCsr->setLatestFlushedTaskCount(10u);
|
||||
contentionCsr->isBusyReturnValue = false;
|
||||
|
||||
controller->registerDirectSubmission(contentionCsr.get());
|
||||
controller->checkNewSubmissions();
|
||||
controller->directSubmissions[contentionCsr.get()].isStopped = false;
|
||||
contentionCsr->stopDirectSubmissionCalledTimes = 0;
|
||||
|
||||
// Enable contention simulation - this will make tryObtainUniqueOwnership() fail
|
||||
contentionCsr->simulateContention = true;
|
||||
|
||||
controller->checkNewSubmissions();
|
||||
|
||||
// Verify CSR was NOT stopped due to simulated contended lock (conservative behavior)
|
||||
EXPECT_FALSE(controller->directSubmissions[contentionCsr.get()].isStopped);
|
||||
EXPECT_EQ(0u, contentionCsr->stopDirectSubmissionCalledTimes);
|
||||
|
||||
// Disable contention simulation and verify normal operation resumes
|
||||
contentionCsr->simulateContention = false;
|
||||
|
||||
controller->checkNewSubmissions();
|
||||
|
||||
// Now it should be stopped since no contention exists
|
||||
EXPECT_TRUE(controller->directSubmissions[contentionCsr.get()].isStopped);
|
||||
EXPECT_EQ(1u, contentionCsr->stopDirectSubmissionCalledTimes);
|
||||
|
||||
controller->unregisterDirectSubmission(contentionCsr.get());
|
||||
}
|
||||
|
||||
struct DirectSubmissionCheckForCopyEngineIdleTests : public ::testing::Test {
|
||||
void SetUp() override {
|
||||
controller = std::make_unique<DirectSubmissionControllerMock>();
|
||||
|
||||
Reference in New Issue
Block a user