feature: program in-order barrier as non-stalling instruction

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-06-29 11:23:19 +00:00
committed by Compute-Runtime-Automation
parent c5d675570a
commit f9a58b5544
4 changed files with 133 additions and 19 deletions

View File

@@ -2921,7 +2921,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
appendEventForProfiling(signalEvent, true);
if (isCopyOnly()) {
if (this->inOrderExecutionEnabled) {
appendSignalInOrderDependencyCounter();
} else if (isCopyOnly()) {
NEO::MiFlushArgs args{this->dummyBlitWa};
uint64_t gpuAddress = 0u;
TaskCountType value = 0u;

View File

@@ -385,6 +385,23 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = ZE_RESULT_SUCCESS;
if (isInOrderExecutionEnabled() && numWaitEvents == 0) {
auto signalEvent = Event::fromHandle(hSignalEvent);
bool earlyReturn = true;
if (signalEvent) {
if (signalEvent->isEventTimestampFlagSet()) {
earlyReturn = false;
} else {
signalEvent->enableInOrderExecMode(*this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter);
}
}
if (earlyReturn) {
return ZE_RESULT_SUCCESS;
}
}
if (this->isFlushTaskSubmissionEnabled) {
checkAvailableSpace(numWaitEvents, false);
checkWaitEventsState(numWaitEvents, phWaitEvents);

View File

@@ -395,12 +395,6 @@ void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
uint64_t gpuWriteAddress = 0;
uint64_t writeValue = 0;
if (this->inOrderExecutionEnabled) {
postSyncMode = NEO::PostSyncMode::ImmediateData;
gpuWriteAddress = this->inOrderDependencyCounterAllocation->getGpuAddress();
writeValue = this->inOrderDependencyCounter + 1;
}
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), postSyncMode, gpuWriteAddress, writeValue, args);
}
}

View File

@@ -1354,18 +1354,58 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendWaitOnEvents
EXPECT_EQ(2u, sdiCmd->getDataDword0());
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierThenSignalSyncAllocation, IsAtLeastXeHpCore) {
// Copy-only immediate command list in in-order mode: after a memory copy, a barrier that waits
// on an event must emit an MI_STORE_DATA_IMM that writes the in-order counter allocation.
HWTEST2_F(InOrderCmdListTests, givenCopyOnlyInOrderModeWhenProgrammingBarrierThenSignalInOrderAllocation, IsAtLeastXeHpCore) {
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    auto copyCmdList = createCopyOnlyImmCmdList<gfxCoreFamily>();
    auto stream = copyCmdList->getCmdContainer().getCommandStream();

    // The pool object is kept alive for the duration of the test; the event itself is reached via `events`.
    auto pool = createEvents<FamilyType>(1, false);
    auto waitHandle = events[0]->toHandle();

    // One copy is appended first so that the barrier is not the first operation on the list.
    uint32_t scratch = 0;
    immCmdListCopyGuard:
    copyCmdList->appendMemoryCopy(&scratch, &scratch, 1, waitHandle, 0, nullptr, false, false);

    // Only commands emitted by the barrier itself are inspected.
    const auto barrierStart = stream->getUsed();
    copyCmdList->appendBarrier(nullptr, 1, &waitHandle);

    auto *barrierCmds = ptrOffset(stream->getCpuBase(), barrierStart);
    const auto barrierSize = stream->getUsed() - barrierStart;

    GenCmdList parsedCmds;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCmds, barrierCmds, barrierSize));

    auto storeIt = find<MI_STORE_DATA_IMM *>(parsedCmds.begin(), parsedCmds.end());
    ASSERT_NE(parsedCmds.end(), storeIt);

    auto storeCmd = genCmdCast<MI_STORE_DATA_IMM *>(*storeIt);

    // The store targets the counter allocation as a qword; the expected value is 2
    // (presumably 1 from the preceding copy plus 1 for the barrier - confirm against the counter logic).
    const auto counterGpuAddress = copyCmdList->inOrderDependencyCounterAllocation->getGpuAddress();
    EXPECT_EQ(counterGpuAddress, storeCmd->getAddress());
    EXPECT_EQ(1u, storeCmd->getStoreQword());
    EXPECT_EQ(2u, storeCmd->getDataDword0());
    EXPECT_EQ(0u, storeCmd->getDataDword1());
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierWithWaitlistThenSignalSyncAllocation, IsAtLeastXeHpCore) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
auto immCmdList = createImmCmdList<gfxCoreFamily>();
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
auto eventPool = createEvents<FamilyType>(1, false);
auto eventHandle = events[0]->toHandle();
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, eventHandle, 0, nullptr, launchParams, false);
auto offset = cmdStream->getUsed();
immCmdList->appendBarrier(nullptr, 0, nullptr);
immCmdList->appendBarrier(nullptr, 1, &eventHandle);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
@@ -1373,18 +1413,79 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierThenS
(cmdStream->getUsed() - offset)));
auto pcItor = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), pcItor);
EXPECT_EQ(cmdList.end(), pcItor);
auto pcCmd = genCmdCast<PIPE_CONTROL *>(*pcItor);
auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), sdiItor);
auto gpuAddress = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress();
auto lowAddress = static_cast<uint32_t>(gpuAddress & 0x0000FFFFFFFFULL);
auto highAddress = static_cast<uint32_t>(gpuAddress >> 32);
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);
EXPECT_EQ(lowAddress, pcCmd->getAddress());
EXPECT_EQ(highAddress, pcCmd->getAddressHigh());
EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pcCmd->getPostSyncOperation());
EXPECT_EQ(2u, pcCmd->getImmediateData());
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress(), sdiCmd->getAddress());
EXPECT_EQ(1u, sdiCmd->getStoreQword());
EXPECT_EQ(2u, sdiCmd->getDataDword0());
EXPECT_EQ(0u, sdiCmd->getDataDword1());
}
// In-order immediate command list: a barrier without wait events is expected to add nothing to
// the command stream. When a signal event (created here with createEvents(1, false)) is passed,
// the event is expected to carry the current in-order counter value instead.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierWithoutWaitlistThenInheritSignalSyncAllocation, IsAtLeastXeHpCore) {
    auto immCmdList = createImmCmdList<gfxCoreFamily>();

    auto cmdStream = immCmdList->getCmdContainer().getCommandStream();

    // A single kernel launch brings the in-order counter to 1.
    immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);

    EXPECT_EQ(1u, immCmdList->inOrderDependencyCounter);

    // Stream position before the barriers; it must be unchanged afterwards.
    auto offset = cmdStream->getUsed();

    auto eventPool = createEvents<FamilyType>(1, false);
    auto eventHandle = events[0]->toHandle();

    // Both variants are exercised: without a signal event and with one.
    immCmdList->appendBarrier(nullptr, 0, nullptr);
    immCmdList->appendBarrier(eventHandle, 0, nullptr);

    EXPECT_EQ(offset, cmdStream->getUsed());

    // The event's signal value matches the counter value observed after the kernel launch.
    EXPECT_EQ(1u, events[0]->inOrderExecSignalValue);
}
// In-order immediate command list: a barrier without wait events but with a signal event created
// via createEvents(1, true) (presumably a timestamp event, per the test name - confirm against
// the fixture) must still program an MI_STORE_DATA_IMM that writes the in-order counter allocation.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierWithoutWaitlistAndTimestampEventThenSignalSyncAllocation, IsAtLeastXeHpCore) {
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    auto immCmdList = createImmCmdList<gfxCoreFamily>();

    auto cmdStream = immCmdList->getCmdContainer().getCommandStream();

    // A single kernel launch brings the in-order counter to 1.
    immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);

    EXPECT_EQ(1u, immCmdList->inOrderDependencyCounter);

    // Only commands emitted by the barrier itself are parsed below.
    auto offset = cmdStream->getUsed();

    auto eventPool = createEvents<FamilyType>(1, true);
    auto eventHandle = events[0]->toHandle();

    immCmdList->appendBarrier(eventHandle, 0, nullptr);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
                                                      ptrOffset(cmdStream->getCpuBase(), offset),
                                                      (cmdStream->getUsed() - offset)));

    auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), sdiItor);

    auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);

    // Qword store of the value 2 to the counter allocation (one past the counter value checked above).
    EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress(), sdiCmd->getAddress());
    EXPECT_EQ(1u, sdiCmd->getStoreQword());
    EXPECT_EQ(2u, sdiCmd->getDataDword0());
    EXPECT_EQ(0u, sdiCmd->getDataDword1());
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenCallingSyncThenHandleCompletion, IsAtLeastXeHpCore) {