performance: optimize counter-based waiting schemes

- Store the latest counter value that was waited on.
- Do not wait on values that are already completed.
- Disable the mechanism when the counter overflows.

Signed-off-by: Michal Mrozek <michal.mrozek@intel.com>
This commit is contained in:
Michal Mrozek
2024-04-25 15:08:55 +00:00
committed by Compute-Runtime-Automation
parent 205f8d2ffd
commit 4aa7c6c99e
4 changed files with 88 additions and 11 deletions

View File

@@ -223,21 +223,24 @@ ze_result_t EventImp<TagSizeT>::queryCounterBasedEventStatus() {
return ZE_RESULT_NOT_READY;
}
const uint64_t *hostAddress = ptrOffset(inOrderExecInfo->getBaseHostAddress(), this->inOrderAllocationOffset);
auto waitValue = getInOrderExecSignalValueWithSubmissionCounter();
bool signaled = true;
for (uint32_t i = 0; i < inOrderExecInfo->getNumHostPartitionsToWait(); i++) {
if (!NEO::WaitUtils::waitFunctionWithPredicate<const uint64_t>(hostAddress, waitValue, std::greater_equal<uint64_t>())) {
signaled = false;
break;
if (!inOrderExecInfo->isCounterAlreadyDone(waitValue)) {
bool signaled = true;
const uint64_t *hostAddress = ptrOffset(inOrderExecInfo->getBaseHostAddress(), this->inOrderAllocationOffset);
for (uint32_t i = 0; i < inOrderExecInfo->getNumHostPartitionsToWait(); i++) {
if (!NEO::WaitUtils::waitFunctionWithPredicate<const uint64_t>(hostAddress, waitValue, std::greater_equal<uint64_t>())) {
signaled = false;
break;
}
hostAddress = ptrOffset(hostAddress, sizeof(uint64_t));
}
hostAddress = ptrOffset(hostAddress, sizeof(uint64_t));
}
if (!signaled) {
return ZE_RESULT_NOT_READY;
if (!signaled) {
return ZE_RESULT_NOT_READY;
}
inOrderExecInfo->setLastWaitedCounterValue(waitValue);
}
handleSuccessfulHostSynchronization();

View File

@@ -301,6 +301,41 @@ HWTEST2_F(InOrderCmdListTests, givenCmdListsWhenDispatchingThenUseInternalTaskCo
}
}
HWTEST2_F(InOrderCmdListTests, givenCounterBasedEventsWhenHostWaitsAreCalledThenLatestWaitIsRecorded, IsAtLeastSkl) {
    // Verifies that a successful host synchronize is reflected by isCounterAlreadyDone()
    // on the in-order exec info, and that the recorded value never moves backwards.
    auto immCommandList = createImmCmdList<gfxCoreFamily>();
    auto createdEventPool = createEvents<FamilyType>(2, false);

    // Two kernel launches: the first signals events[0], the second signals events[1].
    immCommandList->appendLaunchKernel(kernel->toHandle(), groupCount, events[0]->toHandle(), 0, nullptr, launchParams, false);
    immCommandList->appendLaunchKernel(kernel->toHandle(), groupCount, events[1]->toHandle(), 0, nullptr, launchParams, false);

    auto execInfo = events[1]->getInOrderExecInfo();
    // Store 2 at the base host address before synchronizing
    // (presumably simulates completion of both launches - confirm against the fixture).
    *execInfo->getBaseHostAddress() = 2u;

    auto syncResult = events[1]->hostSynchronize(-1);
    EXPECT_EQ(ZE_RESULT_SUCCESS, syncResult);

    auto secondEventCounter = events[1]->inOrderExecSignalValue;

    // Values up to the synchronized event's signal value are reported done; the next one is not.
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(secondEventCounter));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(events[0]->inOrderExecSignalValue));
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(secondEventCounter + 1));

    // Recording a smaller value must not change any of the answers.
    execInfo->setLastWaitedCounterValue(secondEventCounter - 1);
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(secondEventCounter));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(events[0]->inOrderExecSignalValue));
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(secondEventCounter + 1));

    // Synchronizing on the earlier event afterwards leaves the answers unchanged as well.
    syncResult = events[0]->hostSynchronize(-1);
    EXPECT_EQ(ZE_RESULT_SUCCESS, syncResult);
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(secondEventCounter));
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(secondEventCounter + 1));

    // With a non-zero allocation offset every query reports "not done", even for 0.
    execInfo->setAllocationOffset(4u);
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(0u));
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(secondEventCounter));
}
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenEventHostSyncCalledThenCallWaitUserFence, IsAtLeastXeHpCore) {
NEO::debugManager.flags.WaitForUserFenceOnEventHostSynchronize.set(1);

View File

@@ -80,12 +80,20 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
void reset();
bool isExternalMemoryExecInfo() const { return deviceCounterNode == nullptr; }
// Records `value` as the last waited counter value, but only when it is larger than
// the value already stored; smaller or equal values leave the stored value untouched.
void setLastWaitedCounterValue(uint64_t value) {
    if (lastWaitedCounterValue < value) {
        lastWaitedCounterValue = value;
    }
}
// Returns true when `waitValue` does not exceed the last recorded waited counter value.
// Always returns false while allocationOffset is non-zero, which switches the shortcut off.
bool isCounterAlreadyDone(uint64_t waitValue) const {
    if (this->allocationOffset != 0u) {
        return false;
    }
    return waitValue <= lastWaitedCounterValue;
}
protected:
NEO::MemoryManager &memoryManager;
NEO::TagNodeBase *deviceCounterNode = nullptr;
NEO::TagNodeBase *hostCounterNode = nullptr;
uint64_t counterValue = 0;
uint64_t lastWaitedCounterValue = 0;
uint64_t regularCmdListSubmissionCounter = 0;
uint64_t deviceAddress = 0;

View File

@@ -184,6 +184,37 @@ HWTEST_F(CommandEncoderTests, givenDifferentInputParamsWhenCreatingInOrderExecIn
tempNode2->returnTag();
}
HWTEST_F(CommandEncoderTests, givenInOrderExecutionInfoWhenSetLastCounterValueIsCalledThenItReturnsProperQueries) {
    // Exercises setLastWaitedCounterValue() / isCounterAlreadyDone() directly on an InOrderExecInfo.
    MockDevice mockDevice;
    MockExecutionEnvironment mockExecutionEnvironment{};
    MockMemoryManager memoryManager(mockExecutionEnvironment);
    MockTagAllocator<DeviceAllocNodeType<true>> deviceTagAllocator(0, mockDevice.getMemoryManager());

    auto counterNode = deviceTagAllocator.getTag();
    auto execInfo = std::make_unique<InOrderExecInfo>(counterNode, nullptr, memoryManager, 2, 0, true, false);

    // After recording 1, values 0 and 1 are done while 2 is not.
    execInfo->setLastWaitedCounterValue(1u);
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(2u));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(1u));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(0u));

    // Recording a smaller value (0) is ignored, so the answers stay the same.
    execInfo->setLastWaitedCounterValue(0u);
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(2u));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(1u));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(0u));

    // Recording a larger value (3) extends the "done" range up to 3.
    execInfo->setLastWaitedCounterValue(3u);
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(2u));
    EXPECT_TRUE(execInfo->isCounterAlreadyDone(3u));

    // A non-zero allocation offset makes every query report "not done".
    execInfo->setAllocationOffset(4u);
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(2u));
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(3u));
    EXPECT_FALSE(execInfo->isCounterAlreadyDone(0u));
}
HWTEST_F(CommandEncoderTests, givenInOrderExecInfoWhenPatchingThenSetCorrectValues) {
MockDevice mockDevice;