feature: sync dispatch cleanup section

Related-To: NEO-8171

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski 2024-04-02 12:02:20 +00:00 committed by Compute-Runtime-Automation
parent 27c3a4753d
commit f0a24a650c
3 changed files with 211 additions and 4 deletions

View File

@ -201,7 +201,8 @@ struct CommandListCoreFamily : public CommandListImp {
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CommandToPatchContainer *outWaitCmds,
bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency);
void appendSynchronizedDispatchInitializationSection();
MOCKABLE_VIRTUAL void appendSynchronizedDispatchInitializationSection();
MOCKABLE_VIRTUAL void appendSynchronizedDispatchCleanupSection();
ze_result_t reserveSpace(size_t size, void **ptr) override;
ze_result_t reset() override;

View File

@ -400,6 +400,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(ze_kernel_h
handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event));
}
appendSynchronizedDispatchCleanupSection();
addToMappedEventList(event);
if (NEO::debugManager.flags.EnableSWTags.get()) {
neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::CallNameEndTag>(
@ -445,6 +447,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(
handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event));
appendSynchronizedDispatchCleanupSection();
return ret;
}
@ -488,6 +492,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(ze_
handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event));
appendSynchronizedDispatchCleanupSection();
return ret;
}
@ -539,6 +545,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
addToMappedEventList(event);
appendSignalEventPostWalker(event, nullptr, nullptr, false, false);
appendSynchronizedDispatchCleanupSection();
return ret;
}
@ -590,6 +598,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
}
handleInOrderDependencyCounter(event, false);
appendSynchronizedDispatchCleanupSection();
if (NEO::debugManager.flags.EnableSWTags.get()) {
neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::CallNameEndTag>(
*commandContainer.getCommandStream(),
@ -635,6 +645,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryRangesBarrier(uint
}
handleInOrderDependencyCounter(signalEvent, false);
appendSynchronizedDispatchCleanupSection();
return ZE_RESULT_SUCCESS;
}
@ -1624,6 +1636,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
handleInOrderDependencyCounter(signalEvent, false);
}
appendSynchronizedDispatchCleanupSection();
if (NEO::debugManager.flags.EnableSWTags.get()) {
neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::CallNameEndTag>(
*commandContainer.getCommandStream(),
@ -2159,6 +2173,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}
handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining);
appendSynchronizedDispatchCleanupSection();
if (NEO::debugManager.flags.EnableSWTags.get()) {
neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::CallNameEndTag>(
*commandContainer.getCommandStream(),
@ -2877,6 +2893,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
}
handleInOrderDependencyCounter(signalEvent, false);
appendSynchronizedDispatchCleanupSection();
addToMappedEventList(signalEvent);
return ZE_RESULT_SUCCESS;
@ -3420,6 +3438,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
}
handleInOrderDependencyCounter(signalEvent, false);
appendSynchronizedDispatchCleanupSection();
return ZE_RESULT_SUCCESS;
}
@ -4028,4 +4048,25 @@ void CommandListCoreFamily<gfxCoreFamily>::appendFullSynchronizedDispatchInit()
// End section
NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
}
// Appends the synchronized dispatch cleanup commands to the command stream.
// Nothing is programmed unless the command list uses the full synchronized
// dispatch mode. In full mode two MI_ATOMIC commands are emitted, both targeting
// the device's sync dispatch token allocation:
//   1. an 8-byte decrement,
//   2. an 8-byte compare-write that receives the queue id token
//      ((syncDispatchQueueId + 1) shifted into the upper dword) and 0 as its data arguments.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSynchronizedDispatchCleanupSection() {
    using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;

    if (this->synchronizedDispatchMode == NEO::SynchronizedDispatchMode::full) {
        auto &stream = *commandContainer.getCommandStream();
        const auto tokenGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();

        // Queue id is biased by one and stored in the high 32 bits of the 64-bit token.
        const uint64_t queueIdToken = static_cast<uint64_t>(this->syncDispatchQueueId + 1) << 32;

        // The decrement must be programmed before the compare-write.
        NEO::EncodeAtomic<GfxFamily>::programMiAtomic(stream, tokenGpuVa,
                                                      MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_DECREMENT,
                                                      MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD,
                                                      1, 1, 0, 0);
        NEO::EncodeAtomic<GfxFamily>::programMiAtomic(stream, tokenGpuVa,
                                                      MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_CMP_WR,
                                                      MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD,
                                                      1, 1, queueIdToken, 0);
    }
}
} // namespace L0

View File

@ -6026,12 +6026,30 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
using BaseClass = WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>;
// Test double that counts how many times the synchronized dispatch
// initialization and cleanup hooks are entered, then forwards to BaseClass.
class MyCmdList : public BaseClass {
  public:
    // Call counters, bumped before the base implementation runs.
    uint32_t initCalled = 0;
    uint32_t cleanupCalled = 0;

    void appendSynchronizedDispatchInitializationSection() override {
        ++initCalled;
        BaseClass::appendSynchronizedDispatchInitializationSection();
    }

    void appendSynchronizedDispatchCleanupSection() override {
        ++cleanupCalled;
        BaseClass::appendSynchronizedDispatchCleanupSection();
    }
};
void *alloc = nullptr;
ze_device_mem_alloc_desc_t deviceDesc = {};
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
auto immCmdList = createMultiTileImmCmdList<gfxCoreFamily>();
auto immCmdList = createImmCmdListImpl<gfxCoreFamily, MyCmdList>();
immCmdList->partitionCount = partitionCount;
immCmdList->synchronizedDispatchMode = NEO::SynchronizedDispatchMode::limited;
auto eventPool = createEvents<FamilyType>(1, false);
@ -6040,6 +6058,9 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
size_t offset = cmdStream->getUsed();
uint32_t expectedInitCalls = 1;
uint32_t expectedCleanupCalls = 1;
auto verifyTokenCheck = [&](uint32_t numDependencies) {
GenCmdList cmdList;
EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
@ -6072,6 +6093,9 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
EXPECT_EQ(device->getSyncDispatchTokenAllocation()->getGpuAddress() + sizeof(uint32_t), semaphoreCmd->getSemaphoreGraphicsAddress());
EXPECT_EQ(COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphoreCmd->getCompareOperation());
EXPECT_EQ(expectedInitCalls++, immCmdList->initCalled);
EXPECT_EQ(expectedCleanupCalls++, immCmdList->cleanupCalled);
return !::testing::Test::HasFailure();
};
@ -6104,6 +6128,7 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
size_t rangeSizes = 1;
const void **ranges = const_cast<const void **>(&alloc);
immCmdList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, nullptr, 0, nullptr);
EXPECT_TRUE(verifyTokenCheck(1));
offset = cmdStream->getUsed();
immCmdList->appendMemoryCopy(alloc, alloc, 1, nullptr, 0, nullptr, false, false);
@ -6115,7 +6140,7 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
EXPECT_TRUE(verifyTokenCheck(1));
offset = cmdStream->getUsed();
immCmdList->appendMemoryFill(alloc, alloc, 1, 1, nullptr, 0, nullptr, false);
immCmdList->appendMemoryFill(alloc, alloc, 2, 2, nullptr, 0, nullptr, false);
EXPECT_TRUE(verifyTokenCheck(1));
offset = cmdStream->getUsed();
@ -6148,6 +6173,8 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
size_t offset = cmdStream->getUsed();
uint64_t syncAllocGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
auto verifyTokenAcquisition = [&](bool hasDependencySemaphore) {
GenCmdList cmdList;
EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
@ -6189,6 +6216,8 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_CMP_WR, miAtomic->getAtomicOpcode());
EXPECT_EQ(MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD, miAtomic->getDataSize());
EXPECT_EQ(syncAllocGpuVa, NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic));
if (::testing::Test::HasFailure()) {
return false;
}
@ -6199,7 +6228,6 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
ptrOffset(jumpToEndSectionFromPrimaryTile, NEO::EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart(false)));
EXPECT_EQ(0u, semaphore->getSemaphoreDataDword());
uint64_t syncAllocGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
EXPECT_EQ(syncAllocGpuVa + sizeof(uint32_t), semaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphore->getCompareOperation());
@ -6255,5 +6283,142 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
EXPECT_TRUE(verifyTokenAcquisition(true));
}
// Checks that with SynchronizedDispatchMode::full every appendLaunchKernel call on the
// immediate command list programs, after the walker, two MI_ATOMIC commands on the sync
// dispatch token allocation: an 8B decrement followed by an 8B compare-write whose
// operand 1 carries the queue id token.
HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppendingThenProgramTokenCleanup, IsAtLeastSkl) {
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
auto immCmdList = createMultiTileImmCmdList<gfxCoreFamily>();
immCmdList->synchronizedDispatchMode = NEO::SynchronizedDispatchMode::full;
immCmdList->syncDispatchQueueId = 0x1234;
// Expected token: (queue id + 1) placed in the upper 32 bits of a 64-bit value.
const uint32_t queueId = immCmdList->syncDispatchQueueId + 1;
const uint64_t queueIdToken = static_cast<uint64_t>(queueId) << 32;
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
// Only commands written at or after this offset are parsed by the verifier below.
size_t offset = cmdStream->getUsed();
uint64_t syncAllocGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
// Parses the commands appended since `offset` and validates the two token atomics.
// Returns false as soon as the test has recorded any failure, true otherwise.
auto verifyTokenCleanup = [&]() {
GenCmdList cmdList;
EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
if (::testing::Test::HasFailure()) {
return false;
}
// The atomics of interest are searched for starting at the walker command.
auto itor = find<typename FamilyType::DefaultWalkerType *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), itor);
if (::testing::Test::HasFailure()) {
return false;
}
MI_ATOMIC *miAtomic = nullptr;
bool atomicFound = false;
// Walk the MI_ATOMIC commands after the walker until one targets the token allocation.
while (itor != cmdList.end()) {
itor = find<MI_ATOMIC *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
if (::testing::Test::HasFailure()) {
return false;
}
miAtomic = genCmdCast<MI_ATOMIC *>(*itor);
if (syncAllocGpuVa == NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic)) {
atomicFound = true;
break;
}
itor++;
}
EXPECT_TRUE(atomicFound);
if (::testing::Test::HasFailure()) {
return false;
}
// First atomic: 8B decrement with no inline data and zeroed operands.
EXPECT_EQ(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_0, miAtomic->getDwordLength());
EXPECT_EQ(0u, miAtomic->getInlineData());
EXPECT_EQ(0u, miAtomic->getOperand1DataDword0());
EXPECT_EQ(0u, miAtomic->getOperand1DataDword1());
EXPECT_EQ(0u, miAtomic->getOperand2DataDword0());
EXPECT_EQ(0u, miAtomic->getOperand2DataDword1());
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_DECREMENT, miAtomic->getAtomicOpcode());
EXPECT_EQ(MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD, miAtomic->getDataSize());
if (::testing::Test::HasFailure()) {
return false;
}
// Second atomic is read from the memory directly following the first one
// (pointer advanced by sizeof(MI_ATOMIC)): 8B compare-write with inline data,
// operand 1 equal to the queue id token and operand 2 equal to zero.
miAtomic++;
EXPECT_EQ(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1, miAtomic->getDwordLength());
EXPECT_EQ(1u, miAtomic->getInlineData());
EXPECT_EQ(getLowPart(queueIdToken), miAtomic->getOperand1DataDword0());
EXPECT_EQ(getHighPart(queueIdToken), miAtomic->getOperand1DataDword1());
EXPECT_EQ(0u, miAtomic->getOperand2DataDword0());
EXPECT_EQ(0u, miAtomic->getOperand2DataDword1());
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_CMP_WR, miAtomic->getAtomicOpcode());
EXPECT_EQ(MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD, miAtomic->getDataSize());
EXPECT_EQ(syncAllocGpuVa, NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic));
return !::testing::Test::HasFailure();
};
// first run without dependency
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(verifyTokenCleanup());
// Second run: move the offset so only the commands of the next append are parsed.
offset = cmdStream->getUsed();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(verifyTokenCleanup());
}
// Checks that with SynchronizedDispatchMode::limited no MI_ATOMIC command found from the
// walker onwards targets the sync dispatch token allocation, i.e. appendLaunchKernel does
// not program the token cleanup atomics in this mode.
HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppendingThenDontProgramTokenCleanup, IsAtLeastSkl) {
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
auto immCmdList = createMultiTileImmCmdList<gfxCoreFamily>();
immCmdList->synchronizedDispatchMode = NEO::SynchronizedDispatchMode::limited;
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
// Only commands written at or after this offset are parsed by the verifier below.
size_t offset = cmdStream->getUsed();
uint64_t syncAllocGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
// Parses the commands appended since `offset` and expects every MI_ATOMIC from the
// walker onwards to use an address different from the token allocation.
// Returns false if the test has recorded any failure, true otherwise.
auto verifyTokenCleanup = [&]() {
GenCmdList cmdList;
EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
if (::testing::Test::HasFailure()) {
return false;
}
auto itor = find<typename FamilyType::DefaultWalkerType *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), itor);
if (::testing::Test::HasFailure()) {
return false;
}
// Collect all MI_ATOMIC commands starting at the walker; an empty result passes trivially.
auto atomics = findAll<MI_ATOMIC *>(itor, cmdList.end());
for (auto &atomic : atomics) {
auto miAtomic = genCmdCast<MI_ATOMIC *>(*atomic);
EXPECT_NE(nullptr, miAtomic);
EXPECT_NE(syncAllocGpuVa, NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic));
}
return !::testing::Test::HasFailure();
};
// first run without dependency
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(verifyTokenCleanup());
// Second run: move the offset so only the commands of the next append are parsed.
offset = cmdStream->getUsed();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(verifyTokenCleanup());
}
} // namespace ult
} // namespace L0