From 8f2af28b11ddc06af8a104c430ce622ff77f5e12 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 10 Jan 2023 01:25:57 +0000 Subject: [PATCH] Fix issues in signal all event packets 7/n This fix is a refactor that improves few parts of the code - code is easier to analyze, read and maintain - dispatching process and common code is unified and reused - signal of all event packets is incorporated in shared code - number of post sync hw commands is optimized thanks to multi-tile post sync capabilities Related-To: NEO-7490 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdlist/cmdlist_hw.h | 14 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 296 ++++++---------- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 14 +- .../core/test/unit_tests/mocks/mock_cmdlist.h | 3 +- .../test_cmdlist_append_event_reset.cpp | 13 +- .../cmdlist/test_cmdlist_xehp_and_later.cpp | 319 ++++++++---------- 6 files changed, 281 insertions(+), 378 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index be17e7797c..e60251b0a7 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -53,6 +53,12 @@ struct CmdListFillKernelArguments { uint32_t patternSizeInEls = 0; }; +struct CmdListEventOperation { + size_t operationOffset = 0; + uint32_t operationCount = 0; + bool workPartitionOperation = false; +}; + struct EventPool; struct Event; @@ -294,8 +300,12 @@ struct CommandListCoreFamily : CommandListImp { compactL3FlushEvent(dcFlush); } void allocateKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread); - void setRemainingEventPackets(Event *event, uint32_t value); void waitOnRemainingEventPackets(Event *event); + CmdListEventOperation estimateEventPostSync(Event *event, uint32_t operations); + void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition); + void dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition); + void dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, uint32_t value); + void dispatchEventPostSyncOperation(Event *event, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl); size_t cmdListCurrentStartOffset = 0; bool containsAnyKernel = false; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 473b8ce0b8..7ab37a14a3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -380,10 +380,6 @@ template ze_result_t CommandListCoreFamily::appendEventReset(ze_event_handle_t hEvent) { auto event = Event::fromHandle(hEvent); - uint64_t baseAddr = event->getGpuAddress(this->device); - uint32_t packetsToReset = event->getPacketsInUse(); - bool appendPipeControlWithPostSync = false; - NEO::Device *neoDevice = device->getNEODevice(); uint32_t callId = 0; if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -395,65 +391,17 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount; } - if (event->isUsingContextEndOffset()) { - baseAddr += event->getContextEndOffset(); - } - - if (event->isEventTimestampFlagSet()) { - packetsToReset = event->getMaxPacketsCount(); - } event->resetPackets(false); 
event->disableHostCaching(this->cmdListType == CommandList::CommandListType::TYPE_REGULAR); commandContainer.addToResidencyContainer(&event->getAllocation(this->device)); - const auto &hwInfo = this->device->getHwInfo(); - if (isCopyOnly()) { - NEO::MiFlushArgs args; - args.commandWithPostSync = true; - for (uint32_t i = 0u; i < packetsToReset; i++) { - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), - baseAddr, - Event::STATE_CLEARED, args, hwInfo); - baseAddr += event->getSinglePacketSize(); - } - if ((this->signalAllEventPackets) && (packetsToReset < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_CLEARED); - } - } else { - bool applyScope = event->signalScope; - uint32_t packetsToResetUsingSdi = packetsToReset; - if (applyScope || event->isEventTimestampFlagSet()) { - UNRECOVERABLE_IF(packetsToReset == 0); - packetsToResetUsingSdi = packetsToReset - 1; - appendPipeControlWithPostSync = true; - } - for (uint32_t i = 0u; i < packetsToResetUsingSdi; i++) { - NEO::EncodeStoreMemory::programStoreDataImm( - *commandContainer.getCommandStream(), - baseAddr, - Event::STATE_CLEARED, - 0u, - false, - false); - baseAddr += event->getSinglePacketSize(); - } + // default state of event is single packet, handle case when reset is used 1st, launchkernel 2nd - just reset all packets then, use max + bool useMaxPackets = event->isEventTimestampFlagSet() || (event->getPacketsInUse() < this->partitionCount); - if ((this->signalAllEventPackets) && (packetsToReset < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_CLEARED); - } - - if (appendPipeControlWithPostSync) { - NEO::PipeControlArgs args; - args.dcFlushEnable = getDcFlushRequired(!!event->signalScope); - NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( - *commandContainer.getCommandStream(), - NEO::PostSyncMode::ImmediateData, - baseAddr, - Event::STATE_CLEARED, - hwInfo, - args); - } + bool appendPipeControlWithPostSync = (!isCopyOnly()) && (!!event->signalScope || event->isEventTimestampFlagSet()); + dispatchEventPostSyncOperation(event, Event::STATE_CLEARED, false, useMaxPackets, appendPipeControlWithPostSync); + if (!isCopyOnly()) { if (this->partitionCount > 1) { appendMultiTileBarrier(*neoDevice); } @@ -1847,38 +1795,9 @@ void CommandListCoreFamily::appendSignalEventPostWalker(Event *ev } else { event->resetKernelCountAndPacketUsedCount(); commandContainer.addToResidencyContainer(&event->getAllocation(this->device)); - uint64_t baseAddr = event->getGpuAddress(this->device); - if (event->isUsingContextEndOffset()) { - baseAddr += event->getContextEndOffset(); - } - const auto &hwInfo = this->device->getHwInfo(); - if (isCopyOnly()) { - NEO::MiFlushArgs args; - args.commandWithPostSync = true; - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), baseAddr, Event::STATE_SIGNALED, - args, hwInfo); - if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); - } - } else { - NEO::PipeControlArgs args; - args.dcFlushEnable = getDcFlushRequired(!!event->signalScope); - if (this->partitionCount > 1) { - args.workloadPartitionOffset = true; - event->setPacketsInUse(this->partitionCount); - } - if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); - } - NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( - 
*commandContainer.getCommandStream(), - NEO::PostSyncMode::ImmediateData, - baseAddr, - Event::STATE_SIGNALED, - hwInfo, - args); - } + event->setPacketsInUse(this->partitionCount); + dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, false, false, !isCopyOnly()); } } @@ -1895,9 +1814,7 @@ void CommandListCoreFamily::appendEventForProfilingCopyCommand(Ev NEO::MiFlushArgs args; const auto &hwInfo = this->device->getHwInfo(); NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args, hwInfo); - if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); - } + dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, true, false, false); } appendWriteKernelTimestamp(event, beforeWalker, false, false); } @@ -2017,7 +1934,6 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han event->resetKernelCountAndPacketUsedCount(); commandContainer.addToResidencyContainer(&event->getAllocation(this->device)); - uint64_t baseAddr = event->getGpuAddress(this->device); NEO::Device *neoDevice = device->getNEODevice(); uint32_t callId = 0; if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -2028,53 +1944,10 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han ++neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount); callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount; } - size_t eventSignalOffset = 0; - if (event->isUsingContextEndOffset()) { - eventSignalOffset = event->getContextEndOffset(); - } - - const auto &hwInfo = this->device->getHwInfo(); - if (isCopyOnly()) { - NEO::MiFlushArgs args; - args.commandWithPostSync = true; - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), ptrOffset(baseAddr, eventSignalOffset), - Event::STATE_SIGNALED, args, hwInfo); - - if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); - } - } else { - NEO::PipeControlArgs args; - bool applyScope = !!event->signalScope; - args.dcFlushEnable = getDcFlushRequired(applyScope); - if (this->partitionCount > 1) { - event->setPacketsInUse(this->partitionCount); - args.workloadPartitionOffset = true; - } - - if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); - } - - if (applyScope || event->isEventTimestampFlagSet()) { - NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( - *commandContainer.getCommandStream(), - NEO::PostSyncMode::ImmediateData, - ptrOffset(baseAddr, eventSignalOffset), - Event::STATE_SIGNALED, - hwInfo, - args); - } else { - NEO::EncodeStoreMemory::programStoreDataImm( - *commandContainer.getCommandStream(), - ptrOffset(baseAddr, eventSignalOffset), - Event::STATE_SIGNALED, - 0u, - false, - args.workloadPartitionOffset); - } - } + event->setPacketsInUse(this->partitionCount); + bool appendPipeControlWithPostSync = (!isCopyOnly()) && (!!event->signalScope || event->isEventTimestampFlagSet()); + dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, false, false, appendPipeControlWithPostSync); if (NEO::DebugManager.flags.EnableSWTags.get()) { neoDevice->getRootDeviceEnvironment().tagsManager->insertTag( @@ -2232,9 +2105,7 @@ void CommandListCoreFamily::appendEventForProfiling(Event *event, bool workloadPartition = setupTimestampEventForMultiTile(event); 
appendWriteKernelTimestamp(event, beforeWalker, true, workloadPartition); } else { - if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); - } + dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, true, false, false); const auto &hwInfo = this->device->getHwInfo(); NEO::PipeControlArgs args; @@ -2843,56 +2714,6 @@ void CommandListCoreFamily::allocateKernelPrivateMemoryIfNeeded(K } } -template -void CommandListCoreFamily::setRemainingEventPackets(Event *event, uint32_t value) { - uint32_t packetUsed = event->getPacketsInUse(); - uint32_t packetsRemaining = event->getMaxPacketsCount() - packetUsed; - if (packetsRemaining == 0) { - return; - } - - uint64_t gpuAddress = event->getGpuAddress(this->device); - size_t packetSize = event->getSinglePacketSize(); - gpuAddress += packetSize * packetUsed; - if (event->isUsingContextEndOffset()) { - gpuAddress += event->getContextEndOffset(); - } - - uint32_t operationsRemaining = packetsRemaining; - size_t operationOffset = packetSize; - bool partitionEnabled = false; - - if ((this->partitionCount > 1) && (packetsRemaining % this->partitionCount == 0)) { - operationsRemaining = operationsRemaining / this->partitionCount; - operationOffset = operationOffset * this->partitionCount; - partitionEnabled = true; - } - - for (uint32_t i = 0; i < operationsRemaining; i++) { - if (isCopyOnly()) { - const auto &hwInfo = this->device->getHwInfo(); - NEO::MiFlushArgs args; - args.commandWithPostSync = true; - NEO::EncodeMiFlushDW::programMiFlushDw( - *commandContainer.getCommandStream(), - gpuAddress, - value, - args, - hwInfo); - } else { - NEO::EncodeStoreMemory::programStoreDataImm( - *commandContainer.getCommandStream(), - gpuAddress, - value, - 0u, - false, - partitionEnabled); - } - - gpuAddress += operationOffset; - } -} - template void CommandListCoreFamily::waitOnRemainingEventPackets(Event *event) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; @@ -2919,4 +2740,97 @@ void CommandListCoreFamily::waitOnRemainingEventPackets(Event *ev } } +template +CmdListEventOperation CommandListCoreFamily::estimateEventPostSync(Event *event, uint32_t operations) { + CmdListEventOperation ret; + + UNRECOVERABLE_IF(operations & (this->partitionCount - 1)); + + ret.operationCount = operations / this->partitionCount; + ret.operationOffset = event->getSinglePacketSize() * this->partitionCount; + ret.workPartitionOperation = this->partitionCount > 1; + + return ret; +} + +template +void CommandListCoreFamily::dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition) { + const auto &hwInfo = this->device->getHwInfo(); + + NEO::MiFlushArgs miFlushArgs; + miFlushArgs.commandWithPostSync = true; + NEO::EncodeMiFlushDW::programMiFlushDw( + *commandContainer.getCommandStream(), + gpuAddress, + value, + miFlushArgs, + hwInfo); +} + +template +void CommandListCoreFamily::dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition) { + NEO::EncodeStoreMemory::programStoreDataImm( + *commandContainer.getCommandStream(), + gpuAddress, + value, + 0u, + false, + workloadPartition); +} + +template +void CommandListCoreFamily::dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, uint32_t value) { + decltype(&CommandListCoreFamily::dispatchPostSyncCompute) dispatchFunction = &CommandListCoreFamily::dispatchPostSyncCompute; + if (isCopyOnly()) { + dispatchFunction 
= &CommandListCoreFamily::dispatchPostSyncCopy; + } + + for (uint32_t i = 0; i < eventOperations.operationCount; i++) { + (this->*dispatchFunction)(gpuAddress, value, eventOperations.workPartitionOperation); + + gpuAddress += eventOperations.operationOffset; + } +} + +template +void CommandListCoreFamily::dispatchEventPostSyncOperation(Event *event, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl) { + uint32_t packets = event->getPacketsInUse(); + if (this->signalAllEventPackets || useMax) { + packets = event->getMaxPacketsCount(); + } + auto eventPostSync = estimateEventPostSync(event, packets); + + uint64_t gpuAddress = event->getGpuAddress(this->device); + if (event->isUsingContextEndOffset()) { + gpuAddress += event->getContextEndOffset(); + } + if (omitFirstOperation) { + gpuAddress += eventPostSync.operationOffset; + eventPostSync.operationCount--; + } + if (useLastPipeControl) { + eventPostSync.operationCount--; + } + + dispatchPostSyncCommands(eventPostSync, gpuAddress, value); + + if (useLastPipeControl) { + const auto &hwInfo = this->device->getHwInfo(); + + NEO::PipeControlArgs pipeControlArgs; + pipeControlArgs.dcFlushEnable = getDcFlushRequired(!!event->signalScope); + pipeControlArgs.workloadPartitionOffset = eventPostSync.workPartitionOperation; + + gpuAddress += eventPostSync.operationCount * eventPostSync.operationOffset; + + NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + *commandContainer.getCommandStream(), + NEO::PostSyncMode::ImmediateData, + gpuAddress, + value, + hwInfo, + pipeControlArgs); + } +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index cdb8781f6a..aa9b9b96bd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -294,8 +294,16 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K if (l3FlushEnable) { programEventL3Flush(event, this->device, partitionCount, commandContainer); } - if (this->signalAllEventPackets) { - setRemainingEventPackets(event, Event::STATE_SIGNALED); + if (this->signalAllEventPackets && event->getPacketsInUse() < event->getMaxPacketsCount()) { + uint32_t packets = event->getMaxPacketsCount() - event->getPacketsInUse(); + CmdListEventOperation remainingPacketsOperation = estimateEventPostSync(event, packets); + + uint64_t eventAddress = event->getGpuAddress(device) + event->getSinglePacketSize() * event->getPacketsInUse(); + if (event->isUsingContextEndOffset()) { + eventAddress += event->getContextEndOffset(); + } + + dispatchPostSyncCommands(remainingPacketsOperation, eventAddress, Event::STATE_SIGNALED); } } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 701556f20d..3738035bdd 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -71,7 +71,6 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::pipeControlMultiKernelEventSync; using BaseClass::pipelineSelectStateTracking; using BaseClass::requiredStreamState; - using 
BaseClass::setRemainingEventPackets; using BaseClass::setupTimestampEventForMultiTile; using BaseClass::signalAllEventPackets; using BaseClass::stateComputeModeTracking; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp index 1466ea910e..b56e36abda 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -407,7 +407,6 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent, auto &hwInfo = device->getNEODevice()->getHardwareInfo(); size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false) + - ((packets - 1) * sizeof(MI_STORE_DATA_IMM)) + commandList->estimateBufferSizeMultiTileBarrier(hwInfo); size_t usedSize = cmdStream->getUsed(); EXPECT_EQ(expectedSize, usedSize); @@ -418,10 +417,9 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent, cmdStream->getCpuBase(), usedSize)); - auto itorSdi = find(cmdList.begin(), cmdList.end()); - auto cmd = genCmdCast(*itorSdi); - EXPECT_EQ(gpuAddress, cmd->getAddress()); - gpuAddress += event->getSinglePacketSize(); + auto itorSdi = findAll(cmdList.begin(), cmdList.end()); + // multi tile barrier self-cleanup commands + ASSERT_EQ(2u, itorSdi.size()); auto pipeControlList = findAll(cmdList.begin(), cmdList.end()); ASSERT_NE(0u, pipeControlList.size()); @@ -434,8 +432,9 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent, EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); EXPECT_EQ(MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo), cmd->getDcFlushEnable()); + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); postSyncFound++; - gpuAddress += event->getSinglePacketSize(); + gpuAddress += event->getSinglePacketSize() * commandList->partitionCount; postSyncPipeControlItor = it; } } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp index c1d4e3f523..88be34f1e0 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -553,76 +553,6 @@ HWTEST2_F(CommandListAppendLaunchKernelMultiTileCompactL3FlushEnabledTest, testAppendLaunchKernelAndL3Flush(input, arg); } -HWTEST2_F(CommandListTests, GivenCopyCommandListWhenSettingRemainingEventPacketsThenExpectMiDwordFlushCommandsProgrammingPackets, IsAtLeastXeHpCore) { - using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; - - DebugManagerStateRestore restorer; - - NEO::DebugManager.flags.UseDynamicEventPacketsCount.set(1); - NEO::DebugManager.flags.SignalAllEventPackets.set(1); - NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.set(0); - NEO::DebugManager.flags.CompactL3FlushEventPacket.set(0); - - auto commandList = std::make_unique>>(); - auto result = commandList->initialize(device, NEO::EngineGroupType::Copy, 0u); - ASSERT_EQ(ZE_RESULT_SUCCESS, result); - - auto 
cmdStream = commandList->commandContainer.getCommandStream(); - - ze_event_pool_desc_t eventPoolDesc = {}; - eventPoolDesc.count = 1; - eventPoolDesc.flags = 0; - - ze_event_desc_t eventDesc = {}; - eventDesc.index = 0; - eventDesc.signal = 0; - - auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); - ASSERT_NE(nullptr, event.get()); - - uint32_t packetUsed = event->getPacketsInUse(); - uint32_t remainingPackets = event->getMaxPacketsCount() - packetUsed; - - size_t sizeBefore = cmdStream->getUsed(); - commandList->setRemainingEventPackets(event.get(), Event::STATE_SIGNALED); - size_t sizeAfter = cmdStream->getUsed(); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(cmdStream->getCpuBase(), sizeBefore), - (sizeAfter - sizeBefore))); - - uint32_t expectedMiFlushCount = remainingPackets; - if (NEO::EncodeMiFlushDW::getMiFlushDwWaSize() > 0) { - expectedMiFlushCount *= 2; - } - - auto miFlushList = findAll(cmdList.begin(), cmdList.end()); - EXPECT_EQ(expectedMiFlushCount, static_cast(miFlushList.size())); - - uint64_t gpuAddress = event->getGpuAddress(device); - gpuAddress += (packetUsed * event->getSinglePacketSize()); - if (event->isUsingContextEndOffset()) { - gpuAddress += event->getContextEndOffset(); - } - - for (uint32_t i = 0; i < expectedMiFlushCount; i++) { - if ((expectedMiFlushCount == 2 * remainingPackets) && (i % 2 == 0)) { - continue; - } - auto cmd = genCmdCast(*miFlushList[i]); - EXPECT_EQ(gpuAddress, cmd->getDestinationAddress()); - EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); - EXPECT_EQ(MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD, cmd->getPostSyncOperation()); - - gpuAddress += event->getSinglePacketSize(); - } -} - template struct CommandListSignalAllEventPacketFixture : public ModuleFixture { void setUp() { @@ -719,7 +649,8 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { gpuAddress += event->getContextEndOffset(); } - for (uint32_t i = extraCleanupStoreDataImm; i < itorStoreDataImm.size(); i++) { + uint32_t startIndex = extraCleanupStoreDataImm; + for (uint32_t i = startIndex; i < remainingPackets + extraCleanupStoreDataImm; i++) { auto cmd = genCmdCast(*itorStoreDataImm[i]); EXPECT_EQ(gpuAddress, cmd->getAddress()); EXPECT_FALSE(cmd->getStoreQword()); @@ -739,6 +670,8 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; auto commandList = std::make_unique>>(); auto engineType = copyOnly == 1 ? 
NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute; @@ -802,6 +735,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { } else { auto itorStoreDataImm = findAll(cmdList.begin(), cmdList.end()); + auto itorPipeControl = findAll(cmdList.begin(), cmdList.end()); uint64_t gpuAddress = event->getGpuAddress(device); if (event->isUsingContextEndOffset()) { @@ -825,29 +759,34 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { } else { EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); } + } else { + uint32_t postSyncPipeControls = 0; + for (auto it : itorPipeControl) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + if constexpr (multiTile == 1) { + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + } else { + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); + } + } + } + EXPECT_EQ(1u, postSyncPipeControls); } } else { - uint32_t packetUsed = event->getPacketsInUse(); - uint32_t remainingPackets = event->getMaxPacketsCount() - packetUsed; - EXPECT_EQ(0u, remainingPackets % commandList->partitionCount); - remainingPackets /= commandList->partitionCount; - ASSERT_EQ(remainingPackets + extraSignalStoreDataImm, static_cast(itorStoreDataImm.size())); - - if (extraSignalStoreDataImm == 1) { - auto cmd = genCmdCast(*itorStoreDataImm[itorStoreDataImm.size() - 1]); - EXPECT_EQ(gpuAddress, cmd->getAddress()); - EXPECT_FALSE(cmd->getStoreQword()); - EXPECT_EQ(Event::STATE_SIGNALED, cmd->getDataDword0()); - if constexpr (multiTile == 1) { - EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); - } else { - EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); - } + uint32_t packets = event->getMaxPacketsCount(); + EXPECT_EQ(0u, packets % commandList->partitionCount); + packets /= commandList->partitionCount; + if (extraSignalStoreDataImm == 0) { + packets--; } - gpuAddress += (packetUsed * event->getSinglePacketSize()); + ASSERT_EQ(packets, static_cast(itorStoreDataImm.size())); - for (uint32_t i = 0; i < itorStoreDataImm.size() - extraSignalStoreDataImm; i++) { + for (uint32_t i = 0; i < packets; i++) { auto cmd = genCmdCast(*itorStoreDataImm[i]); EXPECT_EQ(gpuAddress, cmd->getAddress()); EXPECT_FALSE(cmd->getStoreQword()); @@ -859,6 +798,24 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { } gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount); } + + if (extraSignalStoreDataImm == 0) { + uint32_t postSyncPipeControls = 0; + for (auto it : itorPipeControl) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + if constexpr (multiTile == 1) { + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + } else { + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); + } + } + } + EXPECT_EQ(1u, postSyncPipeControls); + } } } } @@ -938,6 +895,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; using PIPE_CONTROL = typename 
FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; auto commandList = std::make_unique>>(); auto engineType = copyOnly == 1 ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute; @@ -971,6 +929,11 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { ptrOffset(cmdStream->getCpuBase(), sizeBefore), (sizeAfter - sizeBefore))); + uint64_t gpuAddress = event->getGpuAddress(device); + if (event->isUsingContextEndOffset()) { + gpuAddress += event->getContextEndOffset(); + } + if constexpr (copyOnly == 1) { auto itorFlushDw = findAll(cmdList.begin(), cmdList.end()); @@ -983,11 +946,6 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { expectedFlushDw *= flushCmdWaFactor; ASSERT_EQ(expectedFlushDw, itorFlushDw.size()); - uint64_t gpuAddress = event->getGpuAddress(device); - if (event->isUsingContextEndOffset()) { - gpuAddress += event->getContextEndOffset(); - } - uint32_t startingSignalCmd = 0; if (eventPoolFlags != 0) { auto cmd = genCmdCast(*itorFlushDw[(flushCmdWaFactor - 1)]); @@ -1013,23 +971,46 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { } else { auto itorStoreDataImm = findAll(cmdList.begin(), cmdList.end()); + auto itorPipeControl = findAll(cmdList.begin(), cmdList.end()); + + uint32_t expectedPostSyncPipeControls = 0; + if (eventPoolFlags == 0) { + expectedPostSyncPipeControls = 1; + } if constexpr (limitEventPacketes == 1) { constexpr uint32_t expectedStoreDataImm = 0; ASSERT_EQ(expectedStoreDataImm, itorStoreDataImm.size()); - } else { - uint32_t packetUsed = event->getPacketsInUse(); - uint32_t remainingPackets = event->getMaxPacketsCount() - packetUsed; - remainingPackets /= commandList->partitionCount; - ASSERT_EQ(remainingPackets, static_cast(itorStoreDataImm.size())); - uint64_t gpuAddress = event->getGpuAddress(device); - gpuAddress += (packetUsed * event->getSinglePacketSize()); - if (event->isUsingContextEndOffset()) { - gpuAddress += event->getContextEndOffset(); + uint32_t postSyncPipeControls = 0; + for (auto it : itorPipeControl) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + if constexpr (multiTile == 1) { + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + } else { + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); + } + } + } + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); + + } else { + uint32_t packets = event->getMaxPacketsCount(); + EXPECT_EQ(0u, packets % commandList->partitionCount); + packets /= commandList->partitionCount; + packets--; + + ASSERT_EQ(packets, static_cast(itorStoreDataImm.size())); + + if (eventPoolFlags != 0) { + gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount); } - for (uint32_t i = 0; i < remainingPackets; i++) { + for (uint32_t i = 0; i < packets; i++) { auto cmd = genCmdCast(*itorStoreDataImm[i]); EXPECT_EQ(gpuAddress, cmd->getAddress()); EXPECT_FALSE(cmd->getStoreQword()); @@ -1041,12 +1022,22 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { } gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount); } - if (remainingPackets > 0) { - auto lastIterator = itorStoreDataImm[itorStoreDataImm.size() - 1]; - ++lastIterator; - auto cmd = 
genCmdCast(*lastIterator); - EXPECT_NE(nullptr, cmd); + + uint32_t postSyncPipeControls = 0; + for (auto it : itorPipeControl) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData()); + if constexpr (multiTile == 1) { + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + } else { + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); + } + } } + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); } } } @@ -1116,6 +1107,8 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { using FamilyType = typename NEO::GfxFamilyMapper::GfxFamily; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; auto commandList = std::make_unique>>(); auto engineType = copyOnly == 1 ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute; @@ -1146,6 +1139,11 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { size_t sizeAfter = cmdStream->getUsed(); EXPECT_EQ(ZE_RESULT_SUCCESS, result); + uint64_t gpuAddress = event->getGpuAddress(device); + if (event->isUsingContextEndOffset()) { + gpuAddress += event->getContextEndOffset(); + } + GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, @@ -1163,11 +1161,6 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { uint32_t expectedFlushDw = event->getMaxPacketsCount() * flushCmdWaFactor; ASSERT_EQ(expectedFlushDw, itorFlushDw.size()); - uint64_t gpuAddress = event->getGpuAddress(device); - if (event->isUsingContextEndOffset()) { - gpuAddress += event->getContextEndOffset(); - } - for (uint32_t i = 0; i < expectedFlushDw; i++) { auto cmd = genCmdCast(*itorFlushDw[i]); if (flushCmdWaFactor == 2) { @@ -1183,6 +1176,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { } else { auto itorStoreDataImm = findAll(cmdList.begin(), cmdList.end()); + auto itorPipeControl = findAll(cmdList.begin(), cmdList.end()); uint32_t extraCleanupStoreDataImm = 0; if constexpr (multiTile == 1) { @@ -1190,70 +1184,49 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture { extraCleanupStoreDataImm = 2; } - if constexpr (limitEventPacketes == 1) { // single packet for single tile, two packets for two tiles - uint32_t expectedStoreDataImm = 0; // single packet will be reset by PC or SDI - assume here PC is used for timestamp event + uint32_t expectedStoreDataImm = event->getMaxPacketsCount() / commandList->partitionCount; + if constexpr (limitEventPacketes == 1) { + // single packet will be reset by PC or SDI + expectedStoreDataImm = 1; + } + + uint32_t expectedPostSyncPipeControls = 0; + // last packet is reset by PIPE_CONTROL w/ post sync + if (eventPoolFlags == ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) { + expectedStoreDataImm--; + expectedPostSyncPipeControls = 1; + } + + ASSERT_EQ(expectedStoreDataImm + extraCleanupStoreDataImm, static_cast(itorStoreDataImm.size())); + + for (uint32_t i = 0; i < expectedStoreDataImm; i++) { + auto cmd = genCmdCast(*itorStoreDataImm[i]); + EXPECT_EQ(gpuAddress, cmd->getAddress()); + EXPECT_FALSE(cmd->getStoreQword()); + EXPECT_EQ(Event::STATE_CLEARED, 
cmd->getDataDword0()); if constexpr (multiTile == 1) { - expectedStoreDataImm = 1; // single SDI to reset second packet - } - if (eventPoolFlags == 0) { - expectedStoreDataImm++; // but for immediate events, SDI is used instead PC, then add 1 here - } - ASSERT_EQ(expectedStoreDataImm + extraCleanupStoreDataImm, itorStoreDataImm.size()); - } else { - // TS events reset uses getMaxPacketsCount(), no need to reset not used packets - if (eventPoolFlags == ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) { - uint64_t gpuAddress = event->getGpuAddress(device); - gpuAddress += event->getContextEndOffset(); - // last packet is reset by PIPE_CONTROL w/ post sync - uint32_t expectedStoreDataImm = event->getMaxPacketsCount() - 1; - - ASSERT_EQ(expectedStoreDataImm + extraCleanupStoreDataImm, static_cast(itorStoreDataImm.size())); - for (uint32_t i = 0; i < expectedStoreDataImm; i++) { - auto cmd = genCmdCast(*itorStoreDataImm[i]); - EXPECT_EQ(gpuAddress, cmd->getAddress()); - EXPECT_FALSE(cmd->getStoreQword()); - EXPECT_EQ(Event::STATE_CLEARED, cmd->getDataDword0()); - gpuAddress += event->getSinglePacketSize(); - } + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); } else { - uint32_t packetUsed = event->getPacketsInUse(); - uint32_t remainingResetSdiCommands = event->getMaxPacketsCount() - packetUsed; + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); + } - uint32_t packetOffsetFactor = 1; - uint32_t usePacketSignalStoreDataImm = 1; // single SDI to reset single packet in single tile - bool usePartitioningWrite = false; - if (this->alignEventPacketsForReset) { - remainingResetSdiCommands /= commandList->partitionCount; - packetOffsetFactor = commandList->partitionCount; - - if constexpr (multiTile == 1) { - usePacketSignalStoreDataImm++; // and two SDI to reset two packets in multi tile - usePartitioningWrite = true; // only when number of not used packets is aligned to partition count, multi-tile reset can be split to both tiles - } - } - - ASSERT_EQ(remainingResetSdiCommands + usePacketSignalStoreDataImm + extraCleanupStoreDataImm, static_cast(itorStoreDataImm.size())); - - uint64_t gpuAddress = event->getGpuAddress(device); - gpuAddress += (packetUsed * event->getSinglePacketSize()); - if (event->isUsingContextEndOffset()) { - gpuAddress += event->getContextEndOffset(); - } - - for (uint32_t i = usePacketSignalStoreDataImm; i < itorStoreDataImm.size() - extraCleanupStoreDataImm; i++) { - auto cmd = genCmdCast(*itorStoreDataImm[i]); - EXPECT_EQ(gpuAddress, cmd->getAddress()); - EXPECT_FALSE(cmd->getStoreQword()); - EXPECT_EQ(Event::STATE_CLEARED, cmd->getDataDword0()); - if (usePartitioningWrite) { - EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); - } else { - EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); - } - gpuAddress += (event->getSinglePacketSize() * packetOffsetFactor); + gpuAddress += event->getSinglePacketSize() * commandList->partitionCount; + } + uint32_t postSyncPipeControls = 0; + for (auto it : itorPipeControl) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + postSyncPipeControls++; + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(Event::STATE_CLEARED, cmd->getImmediateData()); + if constexpr (multiTile == 1) { + EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable()); + } else { + EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable()); } } } + EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls); } }
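
Editor's note: below is a minimal, self-contained sketch of the consolidated post-sync dispatch flow that this patch introduces via estimateEventPostSync, dispatchPostSyncCommands, and dispatchEventPostSyncOperation. It is not NEO source; every type and function here (DemoEvent, DemoCommandList, the emit* helpers) is a hypothetical stand-in used only to illustrate how one partitioned write can cover several packets and how the last packet may be reserved for a barrier with post sync.

// Simplified illustration (not the NEO implementation); names are hypothetical.
#include <cstdint>
#include <cstdio>

// Stand-in for the per-event bookkeeping the command list consults.
struct DemoEvent {
    uint64_t gpuBaseAddress = 0x1000;
    size_t singlePacketSize = 16;
    uint32_t maxPacketsCount = 4;
};

// Mirrors the idea of CmdListEventOperation: how many post-sync writes are
// emitted and how far apart they land when multi-tile partitioning is active.
struct EventOperation {
    size_t operationOffset = 0;
    uint32_t operationCount = 0;
    bool workPartitionOperation = false;
};

class DemoCommandList {
  public:
    DemoCommandList(uint32_t partitions, bool copyOnly)
        : partitionCount(partitions), copyEngine(copyOnly) {}

    // One post-sync write per partition group instead of one per packet:
    // with N tiles, a single partitioned write covers N packets at once.
    EventOperation estimate(const DemoEvent &event, uint32_t packets) const {
        EventOperation op;
        op.operationCount = packets / partitionCount;
        op.operationOffset = event.singlePacketSize * partitionCount;
        op.workPartitionOperation = partitionCount > 1;
        return op;
    }

    // Unified dispatcher: the same loop serves the signal and reset paths;
    // only the low-level command differs between copy and compute engines.
    void dispatch(const DemoEvent &event, uint32_t value,
                  bool omitFirstOperation, bool reserveLastForBarrier) {
        uint32_t packets = event.maxPacketsCount; // signal-all-packets mode
        EventOperation op = estimate(event, packets);
        uint64_t address = event.gpuBaseAddress;

        if (omitFirstOperation) { // first packet already written elsewhere
            address += op.operationOffset;
            op.operationCount--;
        }
        if (reserveLastForBarrier) { // last packet handled by barrier post sync
            op.operationCount--;
        }
        for (uint32_t i = 0; i < op.operationCount; i++) {
            if (copyEngine) {
                emitFlushWithPostSync(address, value);
            } else {
                emitStoreDataImm(address, value, op.workPartitionOperation);
            }
            address += op.operationOffset;
        }
        if (reserveLastForBarrier) {
            emitBarrierWithPostSync(address, value, op.workPartitionOperation);
        }
    }

  private:
    void emitFlushWithPostSync(uint64_t address, uint32_t value) {
        std::printf("MI_FLUSH_DW    addr=0x%llx value=%u\n",
                    (unsigned long long)address, value);
    }
    void emitStoreDataImm(uint64_t address, uint32_t value, bool partitioned) {
        std::printf("MI_STORE_DATA  addr=0x%llx value=%u partitioned=%d\n",
                    (unsigned long long)address, value, partitioned);
    }
    void emitBarrierWithPostSync(uint64_t address, uint32_t value, bool partitioned) {
        std::printf("PIPE_CONTROL   addr=0x%llx value=%u partitioned=%d\n",
                    (unsigned long long)address, value, partitioned);
    }

    uint32_t partitionCount;
    bool copyEngine;
};

int main() {
    DemoEvent event;
    // Two-tile compute list: four packets collapse into two partitioned
    // writes, and the final write is carried by the barrier's post sync.
    DemoCommandList computeList(2, false);
    computeList.dispatch(event, 1u, false, true);
    return 0;
}

In the actual patch the copy/compute choice is made once in dispatchPostSyncCommands through a member-function pointer (dispatchPostSyncCopy vs. dispatchPostSyncCompute), so the signal, reset, and profiling call sites all share the same loop shown above.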