Fix issues in signal all event packets 7/n

This fix is a refactor that improves a few parts of the code
- code is easier to analyze, read and maintain
- dispatching process and common code is unified and reused
- signal of all event packets is incorporated in shared code
- number of post sync hw commands is optimized thanks to multi-tile
post sync capabilities

Related-To: NEO-7490

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2023-01-10 01:25:57 +00:00
committed by Compute-Runtime-Automation
parent 614928ed45
commit 8f2af28b11
6 changed files with 281 additions and 378 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -53,6 +53,12 @@ struct CmdListFillKernelArguments {
uint32_t patternSizeInEls = 0;
};
// Describes a batch of event post-sync commands to be programmed:
// - operationOffset: address stride (bytes) between consecutive post-sync writes
// - operationCount: number of post-sync commands to emit
// - workPartitionOperation: whether the workload-partition offset is applied
//   (multi-tile post sync, one write per tile)
struct CmdListEventOperation {
size_t operationOffset = 0;
uint32_t operationCount = 0;
bool workPartitionOperation = false;
};
struct EventPool;
struct Event;
@@ -294,8 +300,12 @@ struct CommandListCoreFamily : CommandListImp {
compactL3FlushEvent(dcFlush);
}
void allocateKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread);
void setRemainingEventPackets(Event *event, uint32_t value);
void waitOnRemainingEventPackets(Event *event);
CmdListEventOperation estimateEventPostSync(Event *event, uint32_t operations);
void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition);
void dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition);
void dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, uint32_t value);
void dispatchEventPostSyncOperation(Event *event, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl);
size_t cmdListCurrentStartOffset = 0;
bool containsAnyKernel = false;

View File

@@ -380,10 +380,6 @@ template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_handle_t hEvent) {
auto event = Event::fromHandle(hEvent);
uint64_t baseAddr = event->getGpuAddress(this->device);
uint32_t packetsToReset = event->getPacketsInUse();
bool appendPipeControlWithPostSync = false;
NEO::Device *neoDevice = device->getNEODevice();
uint32_t callId = 0;
if (NEO::DebugManager.flags.EnableSWTags.get()) {
@@ -395,65 +391,17 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount;
}
if (event->isUsingContextEndOffset()) {
baseAddr += event->getContextEndOffset();
}
if (event->isEventTimestampFlagSet()) {
packetsToReset = event->getMaxPacketsCount();
}
event->resetPackets(false);
event->disableHostCaching(this->cmdListType == CommandList::CommandListType::TYPE_REGULAR);
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
const auto &hwInfo = this->device->getHwInfo();
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
for (uint32_t i = 0u; i < packetsToReset; i++) {
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(),
baseAddr,
Event::STATE_CLEARED, args, hwInfo);
baseAddr += event->getSinglePacketSize();
}
if ((this->signalAllEventPackets) && (packetsToReset < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_CLEARED);
}
} else {
bool applyScope = event->signalScope;
uint32_t packetsToResetUsingSdi = packetsToReset;
if (applyScope || event->isEventTimestampFlagSet()) {
UNRECOVERABLE_IF(packetsToReset == 0);
packetsToResetUsingSdi = packetsToReset - 1;
appendPipeControlWithPostSync = true;
}
for (uint32_t i = 0u; i < packetsToResetUsingSdi; i++) {
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
baseAddr,
Event::STATE_CLEARED,
0u,
false,
false);
baseAddr += event->getSinglePacketSize();
}
// default state of event is single packet, handle case when reset is used 1st, launchkernel 2nd - just reset all packets then, use max
bool useMaxPackets = event->isEventTimestampFlagSet() || (event->getPacketsInUse() < this->partitionCount);
if ((this->signalAllEventPackets) && (packetsToReset < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_CLEARED);
}
if (appendPipeControlWithPostSync) {
NEO::PipeControlArgs args;
args.dcFlushEnable = getDcFlushRequired(!!event->signalScope);
NEO::MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
*commandContainer.getCommandStream(),
NEO::PostSyncMode::ImmediateData,
baseAddr,
Event::STATE_CLEARED,
hwInfo,
args);
}
bool appendPipeControlWithPostSync = (!isCopyOnly()) && (!!event->signalScope || event->isEventTimestampFlagSet());
dispatchEventPostSyncOperation(event, Event::STATE_CLEARED, false, useMaxPackets, appendPipeControlWithPostSync);
if (!isCopyOnly()) {
if (this->partitionCount > 1) {
appendMultiTileBarrier(*neoDevice);
}
@@ -1847,38 +1795,9 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(Event *ev
} else {
event->resetKernelCountAndPacketUsedCount();
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
uint64_t baseAddr = event->getGpuAddress(this->device);
if (event->isUsingContextEndOffset()) {
baseAddr += event->getContextEndOffset();
}
const auto &hwInfo = this->device->getHwInfo();
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), baseAddr, Event::STATE_SIGNALED,
args, hwInfo);
if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
}
} else {
NEO::PipeControlArgs args;
args.dcFlushEnable = getDcFlushRequired(!!event->signalScope);
if (this->partitionCount > 1) {
args.workloadPartitionOffset = true;
event->setPacketsInUse(this->partitionCount);
}
if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
}
NEO::MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
*commandContainer.getCommandStream(),
NEO::PostSyncMode::ImmediateData,
baseAddr,
Event::STATE_SIGNALED,
hwInfo,
args);
}
event->setPacketsInUse(this->partitionCount);
dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, false, false, !isCopyOnly());
}
}
@@ -1895,9 +1814,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingCopyCommand(Ev
NEO::MiFlushArgs args;
const auto &hwInfo = this->device->getHwInfo();
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args, hwInfo);
if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
}
dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, true, false, false);
}
appendWriteKernelTimestamp(event, beforeWalker, false, false);
}
@@ -2017,7 +1934,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
event->resetKernelCountAndPacketUsedCount();
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
uint64_t baseAddr = event->getGpuAddress(this->device);
NEO::Device *neoDevice = device->getNEODevice();
uint32_t callId = 0;
if (NEO::DebugManager.flags.EnableSWTags.get()) {
@@ -2028,53 +1944,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
++neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount);
callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount;
}
size_t eventSignalOffset = 0;
if (event->isUsingContextEndOffset()) {
eventSignalOffset = event->getContextEndOffset();
}
const auto &hwInfo = this->device->getHwInfo();
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED, args, hwInfo);
if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
}
} else {
NEO::PipeControlArgs args;
bool applyScope = !!event->signalScope;
args.dcFlushEnable = getDcFlushRequired(applyScope);
if (this->partitionCount > 1) {
event->setPacketsInUse(this->partitionCount);
args.workloadPartitionOffset = true;
}
if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
}
if (applyScope || event->isEventTimestampFlagSet()) {
NEO::MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
*commandContainer.getCommandStream(),
NEO::PostSyncMode::ImmediateData,
ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED,
hwInfo,
args);
} else {
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED,
0u,
false,
args.workloadPartitionOffset);
}
}
event->setPacketsInUse(this->partitionCount);
bool appendPipeControlWithPostSync = (!isCopyOnly()) && (!!event->signalScope || event->isEventTimestampFlagSet());
dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, false, false, appendPipeControlWithPostSync);
if (NEO::DebugManager.flags.EnableSWTags.get()) {
neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::CallNameEndTag>(
@@ -2232,9 +2105,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event,
bool workloadPartition = setupTimestampEventForMultiTile(event);
appendWriteKernelTimestamp(event, beforeWalker, true, workloadPartition);
} else {
if (this->signalAllEventPackets && (event->getPacketsInUse() < event->getMaxPacketsCount())) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
}
dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, true, false, false);
const auto &hwInfo = this->device->getHwInfo();
NEO::PipeControlArgs args;
@@ -2843,56 +2714,6 @@ void CommandListCoreFamily<gfxCoreFamily>::allocateKernelPrivateMemoryIfNeeded(K
}
}
// Signals the packets of 'event' that were not written by the main operation,
// so that all of the event's packets end up holding 'value'. Starts at the
// first unused packet and writes one post-sync command per packet (or per
// partitioned group of packets when multi-tile partitioning divides evenly).
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::setRemainingEventPackets(Event *event, uint32_t value) {
uint32_t packetUsed = event->getPacketsInUse();
uint32_t packetsRemaining = event->getMaxPacketsCount() - packetUsed;
if (packetsRemaining == 0) {
// All packets already covered by the main operation - nothing to signal.
return;
}
// Address of the first packet not written by the main operation.
uint64_t gpuAddress = event->getGpuAddress(this->device);
size_t packetSize = event->getSinglePacketSize();
gpuAddress += packetSize * packetUsed;
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
uint32_t operationsRemaining = packetsRemaining;
size_t operationOffset = packetSize;
bool partitionEnabled = false;
// When multi-tile partitioning divides the remaining packets evenly, one
// partitioned write covers partitionCount packets, so fewer commands are
// needed and the stride grows accordingly.
if ((this->partitionCount > 1) && (packetsRemaining % this->partitionCount == 0)) {
operationsRemaining = operationsRemaining / this->partitionCount;
operationOffset = operationOffset * this->partitionCount;
partitionEnabled = true;
}
for (uint32_t i = 0; i < operationsRemaining; i++) {
if (isCopyOnly()) {
// Copy engine: MI_FLUSH_DW with immediate-data post sync.
const auto &hwInfo = this->device->getHwInfo();
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(
*commandContainer.getCommandStream(),
gpuAddress,
value,
args,
hwInfo);
} else {
// Compute engine: MI_STORE_DATA_IMM, optionally partition-offset enabled.
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
gpuAddress,
value,
0u,
false,
partitionEnabled);
}
gpuAddress += operationOffset;
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::waitOnRemainingEventPackets(Event *event) {
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
@@ -2919,4 +2740,97 @@ void CommandListCoreFamily<gfxCoreFamily>::waitOnRemainingEventPackets(Event *ev
}
}
// Computes how many post-sync commands are needed to cover 'operations'
// packets of 'event' and the address stride between them, given the current
// multi-tile partition count. One partitioned command covers partitionCount
// packets, hence count is divided and offset multiplied by partitionCount.
template <GFXCORE_FAMILY gfxCoreFamily>
CmdListEventOperation CommandListCoreFamily<gfxCoreFamily>::estimateEventPostSync(Event *event, uint32_t operations) {
CmdListEventOperation ret;
// Bitmask check requires 'operations' to be a multiple of partitionCount
// (NOTE: this form assumes partitionCount is a power of two - confirm).
UNRECOVERABLE_IF(operations & (this->partitionCount - 1));
ret.operationCount = operations / this->partitionCount;
ret.operationOffset = event->getSinglePacketSize() * this->partitionCount;
ret.workPartitionOperation = this->partitionCount > 1;
return ret;
}
// Emits a single copy-engine post-sync write: MI_FLUSH_DW storing 'value'
// at 'gpuAddress'. The workloadPartition argument is unused here (copy
// engines have no partitioned post sync); it exists so the signature matches
// dispatchPostSyncCompute for uniform dispatch via member-function pointer.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition) {
const auto &hwInfo = this->device->getHwInfo();
NEO::MiFlushArgs miFlushArgs;
miFlushArgs.commandWithPostSync = true;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(
*commandContainer.getCommandStream(),
gpuAddress,
value,
miFlushArgs,
hwInfo);
}
// Emits a single compute-engine post-sync write: MI_STORE_DATA_IMM storing
// 'value' (dword) at 'gpuAddress'. When workloadPartition is true the command
// applies the per-tile workload partition offset so each tile writes its own
// packet.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition) {
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
gpuAddress,
value,
0u,
false,
workloadPartition);
}
// Emits eventOperations.operationCount post-sync writes of 'value', starting
// at 'gpuAddress' and advancing by operationOffset each time. Selects the
// copy-engine or compute-engine encoder once up front and dispatches through
// a member-function pointer so the loop body is engine-agnostic.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, uint32_t value) {
decltype(&CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute) dispatchFunction = &CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute;
if (isCopyOnly()) {
dispatchFunction = &CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCopy;
}
for (uint32_t i = 0; i < eventOperations.operationCount; i++) {
(this->*dispatchFunction)(gpuAddress, value, eventOperations.workPartitionOperation);
gpuAddress += eventOperations.operationOffset;
}
}
// Shared entry point for writing 'value' into an event's packets.
// - Covers packetsInUse, or all packets when signalAllEventPackets/useMax.
// - omitFirstOperation: skip the first packet (already written by another
//   command, e.g. a preceding MI_FLUSH_DW), start at the second.
// - useLastPipeControl: write the final packet with a PIPE_CONTROL immediate
//   post sync (with dcFlush per event scope) instead of a plain store.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchEventPostSyncOperation(Event *event, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl) {
uint32_t packets = event->getPacketsInUse();
if (this->signalAllEventPackets || useMax) {
packets = event->getMaxPacketsCount();
}
auto eventPostSync = estimateEventPostSync(event, packets);
uint64_t gpuAddress = event->getGpuAddress(this->device);
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
if (omitFirstOperation) {
// First packet handled elsewhere - advance past it and emit one fewer.
gpuAddress += eventPostSync.operationOffset;
eventPostSync.operationCount--;
}
if (useLastPipeControl) {
// Reserve the last packet for the PIPE_CONTROL below.
eventPostSync.operationCount--;
}
dispatchPostSyncCommands(eventPostSync, gpuAddress, value);
if (useLastPipeControl) {
const auto &hwInfo = this->device->getHwInfo();
NEO::PipeControlArgs pipeControlArgs;
pipeControlArgs.dcFlushEnable = getDcFlushRequired(!!event->signalScope);
pipeControlArgs.workloadPartitionOffset = eventPostSync.workPartitionOperation;
// Address of the last (reserved) packet, past all store commands emitted above.
gpuAddress += eventPostSync.operationCount * eventPostSync.operationOffset;
NEO::MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
*commandContainer.getCommandStream(),
NEO::PostSyncMode::ImmediateData,
gpuAddress,
value,
hwInfo,
pipeControlArgs);
}
}
} // namespace L0

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -294,8 +294,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (l3FlushEnable) {
programEventL3Flush<gfxCoreFamily>(event, this->device, partitionCount, commandContainer);
}
if (this->signalAllEventPackets) {
setRemainingEventPackets(event, Event::STATE_SIGNALED);
if (this->signalAllEventPackets && event->getPacketsInUse() < event->getMaxPacketsCount()) {
uint32_t packets = event->getMaxPacketsCount() - event->getPacketsInUse();
CmdListEventOperation remainingPacketsOperation = estimateEventPostSync(event, packets);
uint64_t eventAddress = event->getGpuAddress(device) + event->getSinglePacketSize() * event->getPacketsInUse();
if (event->isUsingContextEndOffset()) {
eventAddress += event->getContextEndOffset();
}
dispatchPostSyncCommands(remainingPacketsOperation, eventAddress, Event::STATE_SIGNALED);
}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -71,7 +71,6 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::pipeControlMultiKernelEventSync;
using BaseClass::pipelineSelectStateTracking;
using BaseClass::requiredStreamState;
using BaseClass::setRemainingEventPackets;
using BaseClass::setupTimestampEventForMultiTile;
using BaseClass::signalAllEventPackets;
using BaseClass::stateComputeModeTracking;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -407,7 +407,6 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
auto &hwInfo = device->getNEODevice()->getHardwareInfo();
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(hwInfo, false) +
((packets - 1) * sizeof(MI_STORE_DATA_IMM)) +
commandList->estimateBufferSizeMultiTileBarrier(hwInfo);
size_t usedSize = cmdStream->getUsed();
EXPECT_EQ(expectedSize, usedSize);
@@ -418,10 +417,9 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
cmdStream->getCpuBase(),
usedSize));
auto itorSdi = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorSdi);
EXPECT_EQ(gpuAddress, cmd->getAddress());
gpuAddress += event->getSinglePacketSize();
auto itorSdi = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
// multi tile barrier self-cleanup commands
ASSERT_EQ(2u, itorSdi.size());
auto pipeControlList = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
ASSERT_NE(0u, pipeControlList.size());
@@ -434,8 +432,9 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo), cmd->getDcFlushEnable());
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
postSyncFound++;
gpuAddress += event->getSinglePacketSize();
gpuAddress += event->getSinglePacketSize() * commandList->partitionCount;
postSyncPipeControlItor = it;
}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -553,76 +553,6 @@ HWTEST2_F(CommandListAppendLaunchKernelMultiTileCompactL3FlushEnabledTest,
testAppendLaunchKernelAndL3Flush<gfxCoreFamily>(input, arg);
}
// Verifies that on a copy-only command list setRemainingEventPackets programs
// one MI_FLUSH_DW per remaining (unused) event packet, each writing
// STATE_SIGNALED to consecutive packet addresses; accounts for the MI_FLUSH_DW
// workaround that doubles the command count on some platforms.
HWTEST2_F(CommandListTests, GivenCopyCommandListWhenSettingRemainingEventPacketsThenExpectMiDwordFlushCommandsProgrammingPackets, IsAtLeastXeHpCore) {
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
DebugManagerStateRestore restorer;
NEO::DebugManager.flags.UseDynamicEventPacketsCount.set(1);
NEO::DebugManager.flags.SignalAllEventPackets.set(1);
NEO::DebugManager.flags.UsePipeControlMultiKernelEventSync.set(0);
NEO::DebugManager.flags.CompactL3FlushEventPacket.set(0);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
auto result = commandList->initialize(device, NEO::EngineGroupType::Copy, 0u);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto cmdStream = commandList->commandContainer.getCommandStream();
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = 0;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = 0;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
ASSERT_NE(nullptr, event.get());
uint32_t packetUsed = event->getPacketsInUse();
uint32_t remainingPackets = event->getMaxPacketsCount() - packetUsed;
// Capture only the commands emitted by setRemainingEventPackets.
size_t sizeBefore = cmdStream->getUsed();
commandList->setRemainingEventPackets(event.get(), Event::STATE_SIGNALED);
size_t sizeAfter = cmdStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdStream->getCpuBase(), sizeBefore),
(sizeAfter - sizeBefore)));
uint32_t expectedMiFlushCount = remainingPackets;
// Platforms with a MI_FLUSH_DW workaround emit a dummy flush before each
// real one, doubling the command count.
if (NEO::EncodeMiFlushDW<FamilyType>::getMiFlushDwWaSize() > 0) {
expectedMiFlushCount *= 2;
}
auto miFlushList = findAll<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(expectedMiFlushCount, static_cast<uint32_t>(miFlushList.size()));
// First remaining packet address: base + used packets (+ context-end offset).
uint64_t gpuAddress = event->getGpuAddress(device);
gpuAddress += (packetUsed * event->getSinglePacketSize());
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
for (uint32_t i = 0; i < expectedMiFlushCount; i++) {
// In the workaround case, even-indexed commands are the dummy flushes - skip.
if ((expectedMiFlushCount == 2 * remainingPackets) && (i % 2 == 0)) {
continue;
}
auto cmd = genCmdCast<MI_FLUSH_DW *>(*miFlushList[i]);
EXPECT_EQ(gpuAddress, cmd->getDestinationAddress());
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
EXPECT_EQ(MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD, cmd->getPostSyncOperation());
gpuAddress += event->getSinglePacketSize();
}
}
template <uint32_t multiTile, uint32_t limitEventPacketes, uint32_t copyOnly>
struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
void setUp() {
@@ -719,7 +649,8 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
gpuAddress += event->getContextEndOffset();
}
for (uint32_t i = extraCleanupStoreDataImm; i < itorStoreDataImm.size(); i++) {
uint32_t startIndex = extraCleanupStoreDataImm;
for (uint32_t i = startIndex; i < remainingPackets + extraCleanupStoreDataImm; i++) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[i]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
@@ -739,6 +670,8 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
auto engineType = copyOnly == 1 ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute;
@@ -802,6 +735,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
} else {
auto itorStoreDataImm = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
auto itorPipeControl = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint64_t gpuAddress = event->getGpuAddress(device);
if (event->isUsingContextEndOffset()) {
@@ -825,29 +759,34 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
} else {
uint32_t postSyncPipeControls = 0;
for (auto it : itorPipeControl) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
if constexpr (multiTile == 1) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
}
}
EXPECT_EQ(1u, postSyncPipeControls);
}
} else {
uint32_t packetUsed = event->getPacketsInUse();
uint32_t remainingPackets = event->getMaxPacketsCount() - packetUsed;
EXPECT_EQ(0u, remainingPackets % commandList->partitionCount);
remainingPackets /= commandList->partitionCount;
ASSERT_EQ(remainingPackets + extraSignalStoreDataImm, static_cast<uint32_t>(itorStoreDataImm.size()));
if (extraSignalStoreDataImm == 1) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[itorStoreDataImm.size() - 1]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getDataDword0());
if constexpr (multiTile == 1) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
uint32_t packets = event->getMaxPacketsCount();
EXPECT_EQ(0u, packets % commandList->partitionCount);
packets /= commandList->partitionCount;
if (extraSignalStoreDataImm == 0) {
packets--;
}
gpuAddress += (packetUsed * event->getSinglePacketSize());
ASSERT_EQ(packets, static_cast<uint32_t>(itorStoreDataImm.size()));
for (uint32_t i = 0; i < itorStoreDataImm.size() - extraSignalStoreDataImm; i++) {
for (uint32_t i = 0; i < packets; i++) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[i]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
@@ -859,6 +798,24 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
}
gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount);
}
if (extraSignalStoreDataImm == 0) {
uint32_t postSyncPipeControls = 0;
for (auto it : itorPipeControl) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
if constexpr (multiTile == 1) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
}
}
EXPECT_EQ(1u, postSyncPipeControls);
}
}
}
}
@@ -938,6 +895,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
auto engineType = copyOnly == 1 ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute;
@@ -971,6 +929,11 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
ptrOffset(cmdStream->getCpuBase(), sizeBefore),
(sizeAfter - sizeBefore)));
uint64_t gpuAddress = event->getGpuAddress(device);
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
if constexpr (copyOnly == 1) {
auto itorFlushDw = findAll<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
@@ -983,11 +946,6 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
expectedFlushDw *= flushCmdWaFactor;
ASSERT_EQ(expectedFlushDw, itorFlushDw.size());
uint64_t gpuAddress = event->getGpuAddress(device);
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
uint32_t startingSignalCmd = 0;
if (eventPoolFlags != 0) {
auto cmd = genCmdCast<MI_FLUSH_DW *>(*itorFlushDw[(flushCmdWaFactor - 1)]);
@@ -1013,23 +971,46 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
} else {
auto itorStoreDataImm = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
auto itorPipeControl = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint32_t expectedPostSyncPipeControls = 0;
if (eventPoolFlags == 0) {
expectedPostSyncPipeControls = 1;
}
if constexpr (limitEventPacketes == 1) {
constexpr uint32_t expectedStoreDataImm = 0;
ASSERT_EQ(expectedStoreDataImm, itorStoreDataImm.size());
} else {
uint32_t packetUsed = event->getPacketsInUse();
uint32_t remainingPackets = event->getMaxPacketsCount() - packetUsed;
remainingPackets /= commandList->partitionCount;
ASSERT_EQ(remainingPackets, static_cast<uint32_t>(itorStoreDataImm.size()));
uint64_t gpuAddress = event->getGpuAddress(device);
gpuAddress += (packetUsed * event->getSinglePacketSize());
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
uint32_t postSyncPipeControls = 0;
for (auto it : itorPipeControl) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
if constexpr (multiTile == 1) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
}
}
EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls);
} else {
uint32_t packets = event->getMaxPacketsCount();
EXPECT_EQ(0u, packets % commandList->partitionCount);
packets /= commandList->partitionCount;
packets--;
ASSERT_EQ(packets, static_cast<uint32_t>(itorStoreDataImm.size()));
if (eventPoolFlags != 0) {
gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount);
}
for (uint32_t i = 0; i < remainingPackets; i++) {
for (uint32_t i = 0; i < packets; i++) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[i]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
@@ -1041,12 +1022,22 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
}
gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount);
}
if (remainingPackets > 0) {
auto lastIterator = itorStoreDataImm[itorStoreDataImm.size() - 1];
++lastIterator;
auto cmd = genCmdCast<PIPE_CONTROL *>(*lastIterator);
EXPECT_NE(nullptr, cmd);
uint32_t postSyncPipeControls = 0;
for (auto it : itorPipeControl) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_SIGNALED, cmd->getImmediateData());
if constexpr (multiTile == 1) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
}
}
EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls);
}
}
}
@@ -1116,6 +1107,8 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
auto engineType = copyOnly == 1 ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute;
@@ -1146,6 +1139,11 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
size_t sizeAfter = cmdStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
uint64_t gpuAddress = event->getGpuAddress(device);
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
@@ -1163,11 +1161,6 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
uint32_t expectedFlushDw = event->getMaxPacketsCount() * flushCmdWaFactor;
ASSERT_EQ(expectedFlushDw, itorFlushDw.size());
uint64_t gpuAddress = event->getGpuAddress(device);
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
for (uint32_t i = 0; i < expectedFlushDw; i++) {
auto cmd = genCmdCast<MI_FLUSH_DW *>(*itorFlushDw[i]);
if (flushCmdWaFactor == 2) {
@@ -1183,6 +1176,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
} else {
auto itorStoreDataImm = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
auto itorPipeControl = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint32_t extraCleanupStoreDataImm = 0;
if constexpr (multiTile == 1) {
@@ -1190,70 +1184,49 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
extraCleanupStoreDataImm = 2;
}
if constexpr (limitEventPacketes == 1) { // single packet for single tile, two packets for two tiles
uint32_t expectedStoreDataImm = 0; // single packet will be reset by PC or SDI - assume here PC is used for timestamp event
uint32_t expectedStoreDataImm = event->getMaxPacketsCount() / commandList->partitionCount;
if constexpr (limitEventPacketes == 1) {
// single packet will be reset by PC or SDI
expectedStoreDataImm = 1;
}
uint32_t expectedPostSyncPipeControls = 0;
// last packet is reset by PIPE_CONTROL w/ post sync
if (eventPoolFlags == ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) {
expectedStoreDataImm--;
expectedPostSyncPipeControls = 1;
}
ASSERT_EQ(expectedStoreDataImm + extraCleanupStoreDataImm, static_cast<uint32_t>(itorStoreDataImm.size()));
for (uint32_t i = 0; i < expectedStoreDataImm; i++) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[i]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
EXPECT_EQ(Event::STATE_CLEARED, cmd->getDataDword0());
if constexpr (multiTile == 1) {
expectedStoreDataImm = 1; // single SDI to reset second packet
}
if (eventPoolFlags == 0) {
expectedStoreDataImm++; // but for immediate events, SDI is used instead PC, then add 1 here
}
ASSERT_EQ(expectedStoreDataImm + extraCleanupStoreDataImm, itorStoreDataImm.size());
} else {
// TS events reset uses getMaxPacketsCount(), no need to reset not used packets
if (eventPoolFlags == ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP) {
uint64_t gpuAddress = event->getGpuAddress(device);
gpuAddress += event->getContextEndOffset();
// last packet is reset by PIPE_CONTROL w/ post sync
uint32_t expectedStoreDataImm = event->getMaxPacketsCount() - 1;
ASSERT_EQ(expectedStoreDataImm + extraCleanupStoreDataImm, static_cast<uint32_t>(itorStoreDataImm.size()));
for (uint32_t i = 0; i < expectedStoreDataImm; i++) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[i]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
EXPECT_EQ(Event::STATE_CLEARED, cmd->getDataDword0());
gpuAddress += event->getSinglePacketSize();
}
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
uint32_t packetUsed = event->getPacketsInUse();
uint32_t remainingResetSdiCommands = event->getMaxPacketsCount() - packetUsed;
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
uint32_t packetOffsetFactor = 1;
uint32_t usePacketSignalStoreDataImm = 1; // single SDI to reset single packet in single tile
bool usePartitioningWrite = false;
if (this->alignEventPacketsForReset) {
remainingResetSdiCommands /= commandList->partitionCount;
packetOffsetFactor = commandList->partitionCount;
if constexpr (multiTile == 1) {
usePacketSignalStoreDataImm++; // and two SDI to reset two packets in multi tile
usePartitioningWrite = true; // only when number of not used packets is aligned to partition count, multi-tile reset can be split to both tiles
}
}
ASSERT_EQ(remainingResetSdiCommands + usePacketSignalStoreDataImm + extraCleanupStoreDataImm, static_cast<uint32_t>(itorStoreDataImm.size()));
uint64_t gpuAddress = event->getGpuAddress(device);
gpuAddress += (packetUsed * event->getSinglePacketSize());
if (event->isUsingContextEndOffset()) {
gpuAddress += event->getContextEndOffset();
}
for (uint32_t i = usePacketSignalStoreDataImm; i < itorStoreDataImm.size() - extraCleanupStoreDataImm; i++) {
auto cmd = genCmdCast<MI_STORE_DATA_IMM *>(*itorStoreDataImm[i]);
EXPECT_EQ(gpuAddress, cmd->getAddress());
EXPECT_FALSE(cmd->getStoreQword());
EXPECT_EQ(Event::STATE_CLEARED, cmd->getDataDword0());
if (usePartitioningWrite) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
gpuAddress += (event->getSinglePacketSize() * packetOffsetFactor);
gpuAddress += event->getSinglePacketSize() * commandList->partitionCount;
}
uint32_t postSyncPipeControls = 0;
for (auto it : itorPipeControl) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
EXPECT_EQ(Event::STATE_CLEARED, cmd->getImmediateData());
if constexpr (multiTile == 1) {
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
} else {
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
}
}
}
EXPECT_EQ(expectedPostSyncPipeControls, postSyncPipeControls);
}
}