performance: improve ULLS controller timeout detection

Related-To: NEO-12991

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-10-22 11:41:19 +00:00
committed by Compute-Runtime-Automation
parent 7a440f1143
commit 01a0b8e7f7
8 changed files with 142 additions and 5 deletions

View File

@@ -20,6 +20,7 @@ TEST(DirectSubmissionControllerTestsMt, givenDirectSubmissionControllerWhenTimeo
MockExecutionEnvironment executionEnvironment;
executionEnvironment.prepareRootDeviceEnvironments(1);
executionEnvironment.initializeMemoryManager();
executionEnvironment.rootDeviceEnvironments[0]->initOsTime();
DeviceBitfield deviceBitfield(1);
MockCommandStreamReceiver csr(executionEnvironment, 0, deviceBitfield);
@@ -82,6 +83,7 @@ TEST(DirectSubmissionControllerTestsMt, givenDirectSubmissionControllerWhenEnque
MockExecutionEnvironment executionEnvironment;
executionEnvironment.prepareRootDeviceEnvironments(1);
executionEnvironment.initializeMemoryManager();
executionEnvironment.rootDeviceEnvironments[0]->initOsTime();
DeviceBitfield deviceBitfield(1);
MockCommandStreamReceiver csr(executionEnvironment, 0, deviceBitfield);

View File

@@ -544,7 +544,7 @@ class CommandStreamReceiver {
uint32_t getRequiredScratchSlot1Size() { return requiredScratchSlot1Size; }
virtual bool submitDependencyUpdate(TagNodeBase *tag) = 0;
bool isBusy() {
MOCKABLE_VIRTUAL bool isBusy() {
return !testTaskCountReady(getTagAddress(), this->taskCount);
}

View File

@@ -444,7 +444,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionPrintSemaphoreUsage, -1, "-1: de
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionSwitchSemaphoreMode, -1, "-1: default, 1: enable switch on unsuccessful, 0: disable switch on unsuccessful")
DECLARE_DEBUG_VARIABLE(bool, DirectSubmissionPrintBuffers, false, "Print address of submitted command buffers")
DECLARE_DEBUG_VARIABLE(int32_t, WaitForPagingFenceInController, -1, "Instead of waiting for paging fence on user thread, program additional semaphore which will be signaled by direct submission controller when paging fence reaches required value -1: default, 0 - disable, 1 - enable.")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerIdleDetection, -1, "Terminate direct submission only if CSR is idle. -1: default, 0 - disable, 1 - enable.")
/*FEATURE FLAGS*/
DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, false, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(bool, RegisterPageFaultHandlerOnMigration, true, "Register handler on migration to GPU when current is not from pagefault manager")

View File

@@ -9,9 +9,11 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/sleep.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_thread.h"
#include "shared/source/os_interface/os_time.h"
#include "shared/source/os_interface/product_helper.h"
#include <chrono>
@@ -29,6 +31,10 @@ DirectSubmissionController::DirectSubmissionController() {
if (debugManager.flags.DirectSubmissionControllerMaxTimeout.get() != -1) {
maxTimeout = std::chrono::microseconds{debugManager.flags.DirectSubmissionControllerMaxTimeout.get()};
}
isCsrIdleDetectionEnabled = false;
if (debugManager.flags.DirectSubmissionControllerIdleDetection.get() != -1) {
isCsrIdleDetectionEnabled = debugManager.flags.DirectSubmissionControllerIdleDetection.get();
}
};
DirectSubmissionController::~DirectSubmissionController() {
@@ -145,13 +151,15 @@ void DirectSubmissionController::checkNewSubmissions() {
if (taskCount == state.taskCount) {
if (state.isStopped) {
continue;
} else {
auto lock = csr->obtainUniqueOwnership();
}
auto lock = csr->obtainUniqueOwnership();
if (!isCsrIdleDetectionEnabled || isDirectSubmissionIdle(csr, lock)) {
csr->stopDirectSubmission(false);
state.isStopped = true;
shouldRecalculateTimeout = true;
this->lowestThrottleSubmitted = QueueThrottle::HIGH;
}
state.taskCount = csr->peekTaskCount();
} else {
state.isStopped = false;
state.taskCount = taskCount;
@@ -171,6 +179,30 @@ bool DirectSubmissionController::sleep(std::unique_lock<std::mutex> &lock) {
return NEO::waitOnConditionWithPredicate(condVar, lock, std::chrono::microseconds(this->timeout), [&] { return !pagingFenceRequests.empty(); });
}
// Decides whether the direct submission driven by `csr` may be treated as idle.
//
// csr     - command stream receiver to inspect.
// csrLock - ownership lock of `csr`; expected to be held on entry. It is released for the
//           duration of the polling loop and re-acquired before returning, so the caller
//           holds it again on exit.
//
// Returns true when csr->isBusy() reports false at the final check.
bool DirectSubmissionController::isDirectSubmissionIdle(CommandStreamReceiver *csr, std::unique_lock<std::recursive_mutex> &csrLock) {
    // Latest flushed task count already matches the task count: no tag update is requested,
    // the current busy state is used directly.
    if (csr->peekLatestFlushedTaskCount() == csr->peekTaskCount()) {
        return !csr->isBusy();
    }

    // Otherwise request a tag update and poll for at most timeToPollTagUpdateNS.
    csr->flushTagUpdate();

    auto osTime = csr->peekRootDeviceEnvironment().osTime.get();
    // Zero-initialized so the deadline below never reads an indeterminate value
    // should getCpuTime() leave its out-parameter untouched.
    uint64_t currCpuTimeInNS = 0u;
    osTime->getCpuTime(&currCpuTimeInNS);
    const auto timeToWait = currCpuTimeInNS + timeToPollTagUpdateNS;

    // unblock csr during polling
    csrLock.unlock();
    while (currCpuTimeInNS < timeToWait) {
        if (!csr->isBusy()) {
            break;
        }
        osTime->getCpuTime(&currCpuTimeInNS);
    }
    csrLock.lock();

    // Re-evaluated under the lock: the state may have changed while the lock was released.
    return !csr->isBusy();
}
// Returns the current SteadyClock time point.
SteadyClock::time_point DirectSubmissionController::getCpuTimestamp() {
    auto now = SteadyClock::now();
    return now;
}

View File

@@ -43,6 +43,7 @@ struct WaitForPagingFenceRequest {
class DirectSubmissionController {
public:
static constexpr size_t defaultTimeout = 5'000;
static constexpr size_t timeToPollTagUpdateNS = 20'000;
DirectSubmissionController();
virtual ~DirectSubmissionController();
@@ -86,6 +87,7 @@ class DirectSubmissionController {
static void *controlDirectSubmissionsState(void *self);
void checkNewSubmissions();
bool isDirectSubmissionIdle(CommandStreamReceiver *csr, std::unique_lock<std::recursive_mutex> &csrLock);
MOCKABLE_VIRTUAL bool sleep(std::unique_lock<std::mutex> &lock);
MOCKABLE_VIRTUAL SteadyClock::time_point getCpuTimestamp();
@@ -115,6 +117,7 @@ class DirectSubmissionController {
std::unordered_map<size_t, TimeoutParams> timeoutParamsMap;
QueueThrottle lowestThrottleSubmitted = QueueThrottle::HIGH;
bool adjustTimeoutOnThrottleAndAcLineStatus = false;
bool isCsrIdleDetectionEnabled = false;
std::condition_variable condVar;
std::mutex condVarMutex;

View File

@@ -204,6 +204,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
// Test double: counts the call and remembers the `blocking` argument of the most recent one.
void stopDirectSubmission(bool blocking) override {
    ++this->stopDirectSubmissionCalledTimes;
    this->blockingStopDirectSubmissionCalled = blocking;
}
bool createPreemptionAllocation() override {
@@ -264,6 +265,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
uint32_t makeResidentCalledTimes = 0;
uint32_t downloadAllocationsCalledCount = 0;
uint32_t submitDependencyUpdateCalledTimes = 0;
uint32_t stopDirectSubmissionCalledTimes = 0;
int hostPtrSurfaceCreationMutexLockCount = 0;
bool multiOsContextCapable = false;
bool memoryCompressionEnabled = false;

View File

@@ -630,4 +630,5 @@ IgnoreZebinUnknownAttributes = 0
FifoPollInterval = -1
MaxSubSlicesSupportedOverride = -1
ForceWddmHugeChunkSizeMB = -1
DirectSubmissionControllerIdleDetection = -1
# Please don't edit below this line

View File

@@ -7,10 +7,12 @@
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_thread.h"
#include "shared/source/os_interface/os_time.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_ostime.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/unit_test/direct_submission/direct_submission_controller_mock.h"
@@ -38,6 +40,7 @@ TEST(DirectSubmissionControllerTests, givenDirectSubmissionControllerWhenRegiste
MockExecutionEnvironment executionEnvironment;
executionEnvironment.prepareRootDeviceEnvironments(1);
executionEnvironment.initializeMemoryManager();
executionEnvironment.rootDeviceEnvironments[0]->initOsTime();
DeviceBitfield deviceBitfield(1);
MockCommandStreamReceiver csr(executionEnvironment, 0, deviceBitfield);
@@ -83,6 +86,7 @@ TEST(DirectSubmissionControllerTests, givenDirectSubmissionControllerAndDivisorD
MockExecutionEnvironment executionEnvironment;
executionEnvironment.prepareRootDeviceEnvironments(1);
executionEnvironment.initializeMemoryManager();
executionEnvironment.rootDeviceEnvironments[0]->initOsTime();
DeviceBitfield deviceBitfield(1);
MockCommandStreamReceiver csr(executionEnvironment, 0, deviceBitfield);
@@ -203,6 +207,7 @@ TEST(DirectSubmissionControllerTests, givenDirectSubmissionControllerAndAdjustOn
MockExecutionEnvironment executionEnvironment;
executionEnvironment.prepareRootDeviceEnvironments(1);
executionEnvironment.initializeMemoryManager();
executionEnvironment.rootDeviceEnvironments[0]->initOsTime();
DeviceBitfield deviceBitfield(1);
MockCommandStreamReceiver csr(executionEnvironment, 0, deviceBitfield);
@@ -210,7 +215,6 @@ TEST(DirectSubmissionControllerTests, givenDirectSubmissionControllerAndAdjustOn
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular},
PreemptionMode::ThreadGroup, deviceBitfield)));
csr.setupContext(*osContext.get());
DirectSubmissionControllerMock controller;
controller.timeoutElapsedReturnValue.store(true);
controller.setTimeoutParamsForPlatform(csr.getProductHelper());
@@ -652,4 +656,97 @@ TEST(DirectSubmissionControllerTests, givenDirectSubmissionControllerWhenCheckTi
EXPECT_FALSE(controller.timeoutElapsed());
}
// CSR test double for the idle-detection tests: flushTagUpdate() only counts its calls and
// reports success, and isBusy() returns whatever the test stored in isBusyReturnValue.
struct TagUpdateMockCommandStreamReceiver : public MockCommandStreamReceiver {
    TagUpdateMockCommandStreamReceiver(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield)
        : MockCommandStreamReceiver(executionEnvironment, rootDeviceIndex, deviceBitfield) {}

    // Number of flushTagUpdate() calls observed so far.
    uint32_t flushTagUpdateCalledTimes{0u};
    // Value handed back by isBusy(); set by the test.
    bool isBusyReturnValue{false};

    SubmissionStatus flushTagUpdate() override {
        ++flushTagUpdateCalledTimes;
        return SubmissionStatus::success;
    }

    bool isBusy() override {
        return isBusyReturnValue;
    }
};
// Fixture for the idle-detection tests: enables DirectSubmissionControllerIdleDetection,
// registers a TagUpdateMockCommandStreamReceiver with task count 10 in a mock controller
// and runs one checkNewSubmissions() pass before each test body.
struct DirectSubmissionIdleDetectionTests : public ::testing::Test {
    void SetUp() override {
        // The flag is set before the controller is created.
        debugManager.flags.DirectSubmissionControllerIdleDetection.set(true);
        controller = std::make_unique<DirectSubmissionControllerMock>();

        executionEnvironment.prepareRootDeviceEnvironments(1);
        executionEnvironment.initializeMemoryManager();
        auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
        rootDeviceEnvironment.osTime = std::make_unique<MockOSTime>();

        DeviceBitfield deviceBitfield(1);
        csr = std::make_unique<TagUpdateMockCommandStreamReceiver>(executionEnvironment, 0, deviceBitfield);

        const auto engineDescriptor = EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular},
                                                                                   PreemptionMode::ThreadGroup, deviceBitfield);
        osContext.reset(OsContext::create(nullptr, 0, 0, engineDescriptor));
        csr->setupContext(*osContext);

        controller->timeoutElapsedReturnValue.store(true);
        controller->registerDirectSubmission(csr.get());

        // First pass with the task count already at 10; each test body triggers the pass under test.
        csr->taskCount.store(10u);
        controller->checkNewSubmissions();
    }

    void TearDown() override {
        controller->unregisterDirectSubmission(csr.get());
    }

    // Declaration order is kept deliberately: members are destroyed in reverse order.
    DebugManagerStateRestore restorer;
    MockExecutionEnvironment executionEnvironment;
    std::unique_ptr<OsContext> osContext;
    std::unique_ptr<TagUpdateMockCommandStreamReceiver> csr;
    std::unique_ptr<DirectSubmissionControllerMock> controller;
};
// Latest flushed task count equals the task count (10) while the CSR reports busy:
// direct submission must keep running and no tag update may be flushed.
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuBusyThenDontTerminateDirectSubmission) {
    csr->isBusyReturnValue = true;
    csr->setLatestFlushedTaskCount(10u);

    controller->checkNewSubmissions();

    auto &state = controller->directSubmissions[csr.get()];
    EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
    EXPECT_EQ(0u, csr->stopDirectSubmissionCalledTimes);
    EXPECT_EQ(state.taskCount, 10u);
    EXPECT_FALSE(state.isStopped);
}
// Latest flushed task count equals the task count (10) and the CSR reports idle:
// direct submission is stopped exactly once and no tag update is flushed.
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskSameAsTaskCountAndGpuIdleThenTerminateDirectSubmission) {
    csr->isBusyReturnValue = false;
    csr->setLatestFlushedTaskCount(10u);

    controller->checkNewSubmissions();

    auto &state = controller->directSubmissions[csr.get()];
    EXPECT_EQ(0u, csr->flushTagUpdateCalledTimes);
    EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes);
    EXPECT_EQ(state.taskCount, 10u);
    EXPECT_TRUE(state.isStopped);
}
// Latest flushed task count is not set to the task count here and the CSR stays busy:
// one tag update is flushed, but direct submission is not stopped.
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskLowerThanTaskCountAndGpuBusyThenFlushTagAndDontTerminateDirectSubmission) {
    csr->isBusyReturnValue = true;

    controller->checkNewSubmissions();

    auto &state = controller->directSubmissions[csr.get()];
    EXPECT_EQ(1u, csr->flushTagUpdateCalledTimes);
    EXPECT_EQ(0u, csr->stopDirectSubmissionCalledTimes);
    EXPECT_EQ(state.taskCount, 10u);
    EXPECT_FALSE(state.isStopped);
}
// Latest flushed task count is not set to the task count here and the CSR reports idle:
// one tag update is flushed and direct submission is stopped exactly once.
TEST_F(DirectSubmissionIdleDetectionTests, givenLatestFlushedTaskLowerThanTaskCountAndGpuIdleThenFlushTagAndTerminateDirectSubmission) {
    csr->isBusyReturnValue = false;

    controller->checkNewSubmissions();

    auto &state = controller->directSubmissions[csr.get()];
    EXPECT_EQ(1u, csr->flushTagUpdateCalledTimes);
    EXPECT_EQ(1u, csr->stopDirectSubmissionCalledTimes);
    EXPECT_EQ(state.taskCount, 10u);
    EXPECT_TRUE(state.isStopped);
}
} // namespace NEO