performance: don't terminate ULLS if BCS is busy

Related-To: NEO-15452

If CCS is idle, but BCS is busy, keep CCS ULLS
context running. BMG only.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2025-07-17 12:44:46 +00:00
committed by Compute-Runtime-Automation
parent 2ee3070a1e
commit 1d842c58bf
7 changed files with 162 additions and 4 deletions

View File

@@ -14,6 +14,7 @@
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_thread.h"
#include "shared/source/os_interface/os_time.h"
#include "shared/source/os_interface/product_helper.h"
#include <chrono>
#include <thread>
@@ -111,21 +112,28 @@ void DirectSubmissionController::checkNewSubmissions() {
std::lock_guard<std::mutex> lock(this->directSubmissionsMutex);
bool shouldRecalculateTimeout = false;
std::optional<TaskCountType> bcsTaskCount{};
for (auto &directSubmission : this->directSubmissions) {
auto csr = directSubmission.first;
auto &state = directSubmission.second;
if (timeoutMode == TimeoutElapsedMode::bcsOnly && !EngineHelpers::isBcs(csr->getOsContext().getEngineType())) {
auto isBcs = EngineHelpers::isBcs(csr->getOsContext().getEngineType());
if (timeoutMode == TimeoutElapsedMode::bcsOnly && !isBcs) {
continue;
}
if (isBcs) {
bcsTaskCount = state.taskCount;
}
auto taskCount = csr->peekTaskCount();
if (taskCount == state.taskCount) {
if (state.isStopped) {
continue;
}
bool isCopyEngineIdle = true;
if (!isBcs && csr->getProductHelper().checkBcsForDirectSubmissionStop()) {
isCopyEngineIdle = isCopyEngineOnDeviceIdle(csr->getRootDeviceIndex(), bcsTaskCount);
}
auto lock = csr->obtainUniqueOwnership();
if (!isCsrIdleDetectionEnabled || isDirectSubmissionIdle(csr, lock)) {
if (!isCsrIdleDetectionEnabled || (isCopyEngineIdle && isDirectSubmissionIdle(csr, lock))) {
csr->stopDirectSubmission(false, false);
state.isStopped = true;
shouldRecalculateTimeout = true;
@@ -169,6 +177,27 @@ bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *c
return !csr->isBusyWithoutHang(lastHangCheckTime);
}
bool DirectSubmissionController::isCopyEngineOnDeviceIdle(uint32_t rootDeviceIndex, std::optional<TaskCountType> &bcsTaskCount) {
CommandStreamReceiver *bcsCsr = nullptr;
TaskCountType registeredTaskCount = 0;
for (auto &directSubmission : this->directSubmissions) {
auto csr = directSubmission.first;
if (csr->getRootDeviceIndex() == rootDeviceIndex && EngineHelpers::isBcs(csr->getOsContext().getEngineType())) {
if (!directSubmission.second.isStopped) {
registeredTaskCount = bcsTaskCount.value_or(directSubmission.second.taskCount);
bcsCsr = csr;
}
break;
}
}
if (bcsCsr == nullptr) {
return true;
}
auto lock = bcsCsr->obtainUniqueOwnership();
return (bcsCsr->peekTaskCount() == registeredTaskCount) && isDirectSubmissionIdle(bcsCsr, lock);
}
// Returns the current SteadyClock reading.
SteadyClock::time_point DirectSubmissionController::getCpuTimestamp() {
    SteadyClock::time_point currentTime = SteadyClock::now();
    return currentTime;
}

View File

@@ -17,6 +17,7 @@
#include <condition_variable>
#include <memory>
#include <mutex>
#include <optional>
#include <queue>
#include <unordered_map>
@@ -87,6 +88,7 @@ class DirectSubmissionController {
static void *controlDirectSubmissionsState(void *self);
void checkNewSubmissions();
bool isDirectSubmissionIdle(CommandStreamReceiver *csr, std::unique_lock<std::recursive_mutex> &csrLock);
// Checks whether the copy engine registered on the given root device is idle.
// bcsTaskCount, when it holds a value, is the task count the copy engine's current task count is compared against.
bool isCopyEngineOnDeviceIdle(uint32_t rootDeviceIndex, std::optional<TaskCountType> &bcsTaskCount);
MOCKABLE_VIRTUAL bool sleep(std::unique_lock<std::mutex> &lock);
MOCKABLE_VIRTUAL SteadyClock::time_point getCpuTimestamp();
MOCKABLE_VIRTUAL void overrideDirectSubmissionTimeouts(const ProductHelper &productHelper);

View File

@@ -274,6 +274,7 @@ class ProductHelper {
virtual void adjustRTDispatchGlobals(RTDispatchGlobals &rtDispatchGlobals, const HardwareInfo &hwInfo) const = 0;
virtual uint32_t getSyncNumRTStacksPerDss(const HardwareInfo &hwInfo) const = 0;
virtual uint32_t getNumRtStacksPerDSSForAllocation(const HardwareInfo &hwInfo) const = 0;
// Tells the direct submission controller whether the copy engine's state should be consulted
// before direct submission is stopped on a non-copy engine.
virtual bool checkBcsForDirectSubmissionStop() const = 0;
virtual bool shouldRegisterEnqueuedWalkerWithProfiling() const = 0;
virtual bool getStorageInfoLocalOnlyFlag(LocalMemAllocationMode usmDeviceAllocationMode, bool defaultValue) const = 0;

View File

@@ -1098,6 +1098,11 @@ uint32_t ProductHelperHw<gfxProduct>::getNumRtStacksPerDSSForAllocation(const Ha
return RayTracingHelper::getAsyncNumRTStacksPerDss();
}
// Generic implementation: the copy engine is not consulted when deciding whether to stop direct submission.
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::checkBcsForDirectSubmissionStop() const {
    constexpr bool consultCopyEngine = false;
    return consultCopyEngine;
}
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::shouldRegisterEnqueuedWalkerWithProfiling() const {
return false;

View File

@@ -211,6 +211,7 @@ class ProductHelperHw : public ProductHelper {
void adjustRTDispatchGlobals(RTDispatchGlobals &rtDispatchGlobals, const HardwareInfo &hwInfo) const override;
uint32_t getSyncNumRTStacksPerDss(const HardwareInfo &hwInfo) const override;
uint32_t getNumRtStacksPerDSSForAllocation(const HardwareInfo &hwInfo) const override;
bool checkBcsForDirectSubmissionStop() const override;
bool shouldRegisterEnqueuedWalkerWithProfiling() const override;
~ProductHelperHw() override = default;

View File

@@ -51,4 +51,9 @@ void ProductHelperHw<gfxProduct>::adjustScratchSize(size_t &requiredScratchSize)
requiredScratchSize *= 2;
}
// Specialization for this file's gfxProduct: opts in to consulting the copy engine before direct submission is stopped.
template <>
bool ProductHelperHw<gfxProduct>::checkBcsForDirectSubmissionStop() const {
    constexpr bool consultCopyEngine = true;
    return consultCopyEngine;
}
} // namespace NEO

View File

@@ -10,6 +10,7 @@
#include "shared/source/os_interface/os_time.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
@@ -458,4 +459,118 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenDebugFlagSetWhenTaskCountNotUpda
EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
}
struct DirectSubmissionCheckForCopyEngineIdleTests : public ::testing::Test {
void SetUp() override {
controller = std::make_unique<DirectSubmissionControllerMock>();
executionEnvironment.prepareRootDeviceEnvironments(2);
executionEnvironment.initializeMemoryManager();
executionEnvironment.rootDeviceEnvironments[0]->initOsTime();
DeviceBitfield deviceBitfield(1);
ccsCsr = std::make_unique<TagUpdateMockCommandStreamReceiver>(executionEnvironment, 0, deviceBitfield);
bcsCsr = std::make_unique<TagUpdateMockCommandStreamReceiver>(executionEnvironment, 0, deviceBitfield);
ccsOsContext.reset(OsContext::create(nullptr, 0, 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}, PreemptionMode::ThreadGroup, deviceBitfield)));
bcsOsContext.reset(OsContext::create(nullptr, 0, 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular}, PreemptionMode::ThreadGroup, deviceBitfield)));
ccsCsr->setupContext(*ccsOsContext);
bcsCsr->setupContext(*bcsOsContext);
controller->timeoutElapsedReturnValue.store(TimeoutElapsedMode::fullyElapsed);
controller->registerDirectSubmission(ccsCsr.get());
controller->registerDirectSubmission(bcsCsr.get());
bcsCsr->taskCount.store(10u);
ccsCsr->taskCount.store(10u);
controller->checkNewSubmissions();
}
void TearDown() override {
controller->unregisterDirectSubmission(ccsCsr.get());
controller->unregisterDirectSubmission(bcsCsr.get());
}
MockExecutionEnvironment executionEnvironment{defaultHwInfo.get(), true, 2u};
std::unique_ptr<OsContext> osContext;
std::unique_ptr<TagUpdateMockCommandStreamReceiver> ccsCsr;
std::unique_ptr<OsContext> ccsOsContext;
std::unique_ptr<TagUpdateMockCommandStreamReceiver> bcsCsr;
std::unique_ptr<OsContext> bcsOsContext;
std::unique_ptr<DirectSubmissionControllerMock> controller;
};
// CCS is idle while the copy engine reports busy: on products that consult the copy engine the
// CCS direct submission must keep running, on all others it is stopped as before.
TEST_F(DirectSubmissionCheckForCopyEngineIdleTests, givenCheckBcsForDirectSubmissionStopWhenCCSIdleAndCopyEngineBusyThenDontTerminateDirectSubmission) {
    ccsCsr->setLatestFlushedTaskCount(10u);
    ccsCsr->isBusyReturnValue = false;
    bcsCsr->setLatestFlushedTaskCount(10u);
    bcsCsr->isBusyReturnValue = true;
    controller->directSubmissions[bcsCsr.get()].isStopped = false;

    controller->checkNewSubmissions();

    auto &ccsState = controller->directSubmissions[ccsCsr.get()];
    EXPECT_EQ(ccsState.taskCount, 10u);

    const bool copyEngineConsulted = ccsCsr->getProductHelper().checkBcsForDirectSubmissionStop();
    if (!copyEngineConsulted) {
        EXPECT_TRUE(ccsState.isStopped);
        EXPECT_EQ(1u, ccsCsr->stopDirectSubmissionCalledTimes);
        return;
    }
    EXPECT_FALSE(ccsState.isStopped);
    EXPECT_EQ(0u, ccsCsr->stopDirectSubmissionCalledTimes);
}
// Neither engine reports busy, but the copy engine's task count moved from 10 to 20 since the
// controller last looked: products that consult the copy engine must keep CCS running.
TEST_F(DirectSubmissionCheckForCopyEngineIdleTests, givenCheckBcsForDirectSubmissionStopWhenCCSIdleAndCopyEngineUpdatedTaskCountThenDontTerminateDirectSubmission) {
    ccsCsr->setLatestFlushedTaskCount(10u);
    ccsCsr->isBusyReturnValue = false;
    bcsCsr->setLatestFlushedTaskCount(10u);
    bcsCsr->isBusyReturnValue = false;
    controller->directSubmissions[bcsCsr.get()].isStopped = false;
    bcsCsr->taskCount.store(20u);

    controller->checkNewSubmissions();

    auto &ccsState = controller->directSubmissions[ccsCsr.get()];
    EXPECT_EQ(ccsState.taskCount, 10u);

    const bool copyEngineConsulted = ccsCsr->getProductHelper().checkBcsForDirectSubmissionStop();
    if (!copyEngineConsulted) {
        EXPECT_TRUE(ccsState.isStopped);
        EXPECT_EQ(1u, ccsCsr->stopDirectSubmissionCalledTimes);
        return;
    }
    EXPECT_FALSE(ccsState.isStopped);
    EXPECT_EQ(0u, ccsCsr->stopDirectSubmissionCalledTimes);
}
// A busy copy engine on root device 0 must not keep a CCS receiver on root device 1 alive:
// the second device's direct submission is stopped regardless of the product helper's answer.
TEST_F(DirectSubmissionCheckForCopyEngineIdleTests, givenCheckBcsForDirectSubmissionStopWhenCCSIdleAndCopyEngineBusyAndDifferentDeviceThenTerminateDirectSubmission) {
    DeviceBitfield deviceBitfield(1);
    TagUpdateMockCommandStreamReceiver secondDeviceCsr(executionEnvironment, 1, deviceBitfield);
    // Named distinctly so it cannot be confused with (or shadow) any fixture member.
    std::unique_ptr<OsContext> secondDeviceOsContext(OsContext::create(nullptr, 1, 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}, PreemptionMode::ThreadGroup, deviceBitfield)));
    secondDeviceCsr.setupContext(*secondDeviceOsContext);
    controller->registerDirectSubmission(&secondDeviceCsr);
    secondDeviceCsr.taskCount.store(10u);
    // First pass lets the controller record the new receiver's task count.
    controller->checkNewSubmissions();

    secondDeviceCsr.setLatestFlushedTaskCount(10u);
    bcsCsr->setLatestFlushedTaskCount(10u);
    secondDeviceCsr.isBusyReturnValue = false;
    bcsCsr->isBusyReturnValue = true;
    controller->directSubmissions[bcsCsr.get()].isStopped = false;
    controller->checkNewSubmissions();

    EXPECT_EQ(controller->directSubmissions[&secondDeviceCsr].taskCount, 10u);
    EXPECT_TRUE(controller->directSubmissions[&secondDeviceCsr].isStopped);
    EXPECT_EQ(1u, secondDeviceCsr.stopDirectSubmissionCalledTimes);

    // secondDeviceCsr lives on this stack frame; unregister it so the controller does not keep
    // a dangling pointer after the test body returns (TearDown only unregisters the fixture's receivers).
    controller->unregisterDirectSubmission(&secondDeviceCsr);
}
// When the copy engine's direct submission is marked stopped, its busy state is irrelevant and
// the idle CCS direct submission is stopped.
TEST_F(DirectSubmissionCheckForCopyEngineIdleTests, givenCheckBcsForDirectSubmissionStopWhenCopyEngineNotStartedThenTerminateDirectSubmission) {
    ccsCsr->setLatestFlushedTaskCount(10u);
    ccsCsr->isBusyReturnValue = false;
    bcsCsr->setLatestFlushedTaskCount(10u);
    bcsCsr->isBusyReturnValue = true;
    controller->directSubmissions[bcsCsr.get()].isStopped = true;

    controller->checkNewSubmissions();

    auto &ccsState = controller->directSubmissions[ccsCsr.get()];
    EXPECT_EQ(ccsState.taskCount, 10u);
    EXPECT_TRUE(ccsState.isStopped);
    EXPECT_EQ(1u, ccsCsr->stopDirectSubmissionCalledTimes);
}
} // namespace NEO