fix(ocl): reduce busy waiting in clFinish

When flushStamp == 0 is passed and the command stream receiver allows KMD
wait on task count, use the task count as the flush stamp. This makes the
driver busy-wait only for a short while before falling back to KMD notify.

Related-To: GSD-3612

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek 2023-06-13 14:37:41 +00:00 committed by Compute-Runtime-Automation
parent 02436b8877
commit 60d5e22f3b
9 changed files with 53 additions and 2 deletions

View File

@ -442,6 +442,9 @@ WaitStatus CommandQueue::waitUntilComplete(TaskCountType gpgpuTaskCountToWait, R
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
if (!skipWait) {
if (flushStampToWait == 0 && getGpgpuCommandStreamReceiver().isKmdWaitOnTaskCountAllowed()) {
flushStampToWait = gpgpuTaskCountToWait;
}
waitStatus = getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
flushStampToWait,
useQuickKmdSleep,

View File

@ -280,7 +280,7 @@ HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenQueueHasPowerSavingModeAndCal
EXPECT_EQ(1, csr->waitForCompletionWithTimeoutParamsPassed[0].timeoutMs);
}
HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenQueueHasPowerSavingModButThereIsNoFlushStampeAndCallWaitThenTimeoutIsDisabled) {
HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenQueueHasPowerSavingModButThereIsNoFlushStampAndCallWaitThenTimeoutIsDisabled) {
overrideKmdNotifyParams(false, 3, false, 2, false, 9999999, false, 0);
auto csr = createMockCsr<FamilyType>();
@ -292,6 +292,19 @@ HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenQueueHasPowerSavingModButTher
EXPECT_EQ(0, csr->waitForCompletionWithTimeoutParamsPassed[0].timeoutMs);
}
// Low-throttle queue, KMD notify overridden to disabled, wait requested with a zero third argument
// (no flush stamp, per the test name): once the CSR reports that KMD wait on task count is allowed,
// the wait is expected to be issued exactly once with the timeout enabled and timeoutMs == 1.
HWTEST_F(KmdNotifyTests, givenKmdNotifyDisabledWhenQueueHasPowerSavingModAndThereIsNoFlushStampButKmdWaitOnTaskCountAllowedAndCallWaitThenTimeoutIsEnabled) {
    // Arrange
    overrideKmdNotifyParams(false, 3, false, 2, false, 9999999, false, 0);
    auto mockCsr = createMockCsr<FamilyType>();
    mockCsr->isKmdWaitOnTaskCountAllowedValue = true;
    cmdQ->throttle = QueueThrottle::LOW;

    // Act
    cmdQ->waitUntilComplete(1, {}, 0, false);

    // Assert
    EXPECT_EQ(1u, mockCsr->waitForCompletionWithTimeoutCalled);
    const auto &firstWaitParams = mockCsr->waitForCompletionWithTimeoutParamsPassed[0];
    EXPECT_EQ(true, firstWaitParams.enableTimeout);
    EXPECT_EQ(1, firstWaitParams.timeoutMs);
}
HWTEST_F(KmdNotifyTests, givenQuickSleepRequestWhenItsSporadicWaitOptimizationIsDisabledThenDontOverrideQuickSleepRequest) {
overrideKmdNotifyParams(true, 3, true, 2, false, 0, false, 0);
auto csr = createMockCsr<FamilyType>();

View File

@ -297,6 +297,10 @@ class CommandStreamReceiver {
return false;
}
// Reports whether a KMD wait keyed on the task count may be used.
// This base implementation always answers false; derived receivers may override it.
virtual bool isKmdWaitOnTaskCountAllowed() const {
return false;
}
virtual void stopDirectSubmission() {}
bool isStaticWorkPartitioningEnabled() const {

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -49,6 +49,7 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver<GfxFamily> {
void makeNonResident(GraphicsAllocation &gfxAllocation) override;
bool waitForFlushStamp(FlushStamp &flushStampToWait) override;
bool isKmdWaitModeActive() override;
bool isKmdWaitOnTaskCountAllowed() const override;
DrmMemoryManager *getMemoryManager() const;
GmmPageTableMngr *createPageTableManager() override;

View File

@ -341,4 +341,9 @@ template <typename GfxFamily>
inline bool DrmCommandStreamReceiver<GfxFamily>::isUserFenceWaitActive() {
return (this->drm->isVmBindAvailable() && useUserFenceWait);
}
// KMD wait on task count is allowed for this receiver exactly when
// isDirectSubmissionEnabled() reports true.
template <typename GfxFamily>
bool DrmCommandStreamReceiver<GfxFamily>::isKmdWaitOnTaskCountAllowed() const {
return this->isDirectSubmissionEnabled();
}
} // namespace NEO

View File

@ -362,6 +362,13 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
return blitterDirectSubmissionAvailable;
}
// Test hook: forwards to BaseClass when callBaseIsKmdWaitOnTaskCountAllowed is set,
// otherwise returns the value preset in isKmdWaitOnTaskCountAllowedValue.
bool isKmdWaitOnTaskCountAllowed() const override {
    return callBaseIsKmdWaitOnTaskCountAllowed
               ? BaseClass::isKmdWaitOnTaskCountAllowed()
               : isKmdWaitOnTaskCountAllowedValue;
}
bool createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush) override {
createAllocationForHostSurfaceCalled++;
cpuCopyForHostPtrSurfaceAllowed = surface.peekIsPtrCopyAllowed();
@ -449,6 +456,8 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
bool callBaseFlushBcsTask{true};
bool callBaseSendRenderStateCacheFlush = true;
bool forceReturnGpuHang = false;
// When true, isKmdWaitOnTaskCountAllowed() forwards to BaseClass instead of returning the preset value.
bool callBaseIsKmdWaitOnTaskCountAllowed = false;
// Value returned by isKmdWaitOnTaskCountAllowed() when the base-class call is not requested.
bool isKmdWaitOnTaskCountAllowedValue = false;
};
} // namespace NEO

View File

@ -728,6 +728,12 @@ HWTEST_F(CommandStreamReceiverTest, givenNoDirectSubmissionWhenCheckTaskCountFro
EXPECT_FALSE(csr.isUpdateTagFromWaitEnabled());
}
// With the mock told to defer to its base class, the query is expected to answer false.
HWTEST_F(CommandStreamReceiverTest, givenCsrWhenCheckKmdWaitOnTaskCountEnabledThenReturnsFalse) {
    auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    ultCsr.callBaseIsKmdWaitOnTaskCountAllowed = true;

    const bool kmdWaitOnTaskCountAllowed = ultCsr.isKmdWaitOnTaskCountAllowed();

    EXPECT_FALSE(kmdWaitOnTaskCountAllowed);
}
HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckTaskCountFromWaitEnabledThenProperValueReturned) {
DebugManagerStateRestore restorer;
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

View File

@ -65,6 +65,11 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, givenEnabledDirectSubmissionWhenGetting
*csr.completionFenceValuePointer = 0;
}
// Direct submission is expected to be disabled in this fixture, so KMD wait on task count must not be allowed.
HWTEST_TEMPLATED_F(DrmCommandStreamTest, givenDisabledDirectSubmissionWhenCheckingIsKmdWaitOnTaskCountAllowedThenFalseIsReturned) {
// Precondition: confirms the fixture state the test name relies on.
EXPECT_FALSE(csr->isDirectSubmissionEnabled());
EXPECT_FALSE(csr->isKmdWaitOnTaskCountAllowed());
}
HWTEST_TEMPLATED_F(DrmCommandStreamTest, whenGettingCompletionAddressThenOffsettedTagAddressIsReturned) {
csr->initializeTagAllocation();
EXPECT_NE(nullptr, csr->getTagAddress());

View File

@ -825,6 +825,11 @@ struct MockDrmDirectSubmissionToTestDtor : public DrmDirectSubmission<GfxFamily,
DrmDirectSubmissionFunctionsCalled &functionsCalled;
};
// Direct submission is expected to be enabled in this fixture, so KMD wait on task count must be allowed.
HWTEST_TEMPLATED_F(DrmCommandStreamDirectSubmissionTest, givenEnabledDirectSubmissionWhenCheckingIsKmdWaitOnTaskCountAllowedThenTrueIsReturned) {
// Precondition: confirms the fixture state the test name relies on.
EXPECT_TRUE(csr->isDirectSubmissionEnabled());
EXPECT_TRUE(csr->isKmdWaitOnTaskCountAllowed());
}
HWTEST_TEMPLATED_F(DrmCommandStreamDirectSubmissionTest, givenEnabledDirectSubmissionWhenDtorIsCalledButRingIsNotStartedThenDontCallStopRingBufferNorWaitForTagValue) {
DrmDirectSubmissionFunctionsCalled functionsCalled{};
auto directSubmission = std::make_unique<MockDrmDirectSubmissionToTestDtor<FamilyType>>(*device->getDefaultEngine().commandStreamReceiver, functionsCalled);