Reduce the number of DC flushes to a single flush after all dispatched builtin kernels

Related-To: NEO-6871

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-05-17 16:32:57 +00:00
committed by Compute-Runtime-Automation
parent 1b8ceb2bbc
commit 955ef84306
4 changed files with 233 additions and 117 deletions

View File

@@ -161,7 +161,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
auto event = Event::fromHandle(hEvent);
eventAlloc = &event->getAllocation(this->device);
commandContainer.addToResidencyContainer(eventAlloc);
l3FlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(!!event->signalScope, hwInfo);
bool flushRequired = !!event->signalScope &&
!launchParams.isKernelSplitOperation;
l3FlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(flushRequired, hwInfo);
isTimestampEvent = event->isUsingContextEndOffset();
eventAddress = event->getPacketAddress(this->device);
}
@@ -362,9 +364,18 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_
appendSignalEventPostWalker(hEvent, false);
}
} else {
if (hEvent && beforeWalker) {
if (hEvent) {
auto event = Event::fromHandle(hEvent);
event->zeroKernelCount();
if (beforeWalker) {
event->zeroKernelCount();
} else {
const auto &hwInfo = this->device->getHwInfo();
if (NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(!!event->signalScope, hwInfo)) {
NEO::PipeControlArgs args;
args.dcFlushEnable = true;
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
}
}
}
}
}

View File

@@ -177,6 +177,13 @@ struct SingleRootMultiSubDeviceFixture : public MultiDeviceFixture {
NEO::Device *neoDevice = nullptr;
};
struct ImplicitScalingRootDevice : public SingleRootMultiSubDeviceFixture {
void SetUp() {
DebugManager.flags.EnableImplicitScaling.set(1);
SingleRootMultiSubDeviceFixture::SetUp();
}
};
struct ContextFixture : DeviceFixture {
void SetUp();
void TearDown();

View File

@@ -97,8 +97,20 @@ class AppendFillFixture : public DeviceFixture {
uint8_t *immediateDstPtr = nullptr;
};
struct MultiTileAppendFillFixture : public AppendFillFixture {
void SetUp() {
DebugManager.flags.CreateMultipleSubDevices.set(2);
DebugManager.flags.EnableImplicitScaling.set(1);
AppendFillFixture::SetUp();
}
DebugManagerStateRestore restorer;
};
// Test suite alias wrapping AppendFillFixture in Test<>.
using AppendFillTest = Test<AppendFillFixture>;
// Test suite alias wrapping MultiTileAppendFillFixture in Test<>.
using MultiTileAppendFillTest = Test<MultiTileAppendFillFixture>;
HWTEST2_F(AppendFillTest,
givenCallToAppendMemoryFillWithImmediateValueThenSuccessIsReturned, IsAtLeastSkl) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -343,19 +355,23 @@ HWTEST2_F(AppendFillTest,
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern,
sizeof(immediatePattern),
immediateAllocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(2u, event->getPacketsInUse());
EXPECT_EQ(2u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0),
commandList->commandContainer.getCommandStream()->getUsed()));
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
@@ -394,17 +410,21 @@ HWTEST2_F(AppendFillTest,
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
auto &commandContainer = commandList->commandContainer;
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(2u, event->getPacketsInUse());
EXPECT_EQ(2u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0),
commandList->commandContainer.getCommandStream()->getUsed()));
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, itorWalkers.size());
@@ -420,5 +440,85 @@ HWTEST2_F(AppendFillTest,
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
// Checks appendMemoryFill on a command list that reports partitionCount == 2,
// signalling a kernel-timestamp event whose signal scope is HOST:
//  - exactly two COMPUTE_WALKER commands are emitted, each with a timestamp
//    post-sync write targeting its own event address,
//  - no PIPE_CONTROL at or after the second walker performs an immediate-data
//    post-sync write,
//  - the number of PIPE_CONTROLs with DC flush enabled at or after the second
//    walker matches the platform-dependent expectation computed below.
HWTEST2_F(MultiTileAppendFillTest,
givenMultiTileCmdListCallToAppendMemoryFillWhenSignalScopeTimestampEventUsesComputeWalkerPostSyncThenSeparateKernelsUsesPostSyncProfilingAndSingleDcFlushWhenRequired, IsAtLeastXeHpCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
// Single-entry event pool created with the kernel-timestamp flag.
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
// Event at index 0 with host signal scope.
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
// Expected post-sync destinations: the second kernel is expected two packet
// sizes past the first (4 packets in use over 2 kernels are asserted below).
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
// Precondition for the multi-tile expectations that follow.
EXPECT_EQ(2u, commandList->partitionCount);
auto &commandContainer = commandList->commandContainer;
// Record stream usage around the call so only commands emitted by
// appendMemoryFill are parsed.
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
// NOTE(review): dstPtr, pattern, patternSize and allocSize are not declared in
// this test; presumably they are members of the fixture - confirm.
result = commandList->appendMemoryFill(dstPtr, pattern, patternSize, allocSize, event->toHandle(), 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(4u, event->getPacketsInUse());
EXPECT_EQ(2u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
// ASSERT (not EXPECT) because both walkers are indexed right after.
ASSERT_EQ(2u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
// Each walker must write a timestamp to its own event address.
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
// Only PIPE_CONTROLs from the second walker to the end of the parsed range
// are inspected.
auto itorPipeControls = findAll<PIPE_CONTROL *>(secondWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
// The expectation depends on whether getDcFlushEnable(true, hwInfo) reports
// DC flush as enabled for this device.
uint32_t expectedDcFlush =
NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, device->getHwInfo())
? 2 // 1st dc flush after cross-tile sync, 2nd dc flush for signal scope event
: 0;
EXPECT_EQ(0u, postSyncPipeControls);
EXPECT_EQ(expectedDcFlush, dcFlushFound);
}
} // namespace ult
} // namespace L0

View File

@@ -280,6 +280,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsAndPatchingDisallowe
}
// Test suite alias wrapping DeviceFixture in Test<>.
using AppendMemoryCopyXeHpAndLater = Test<DeviceFixture>;
// Test suite alias wrapping the ImplicitScalingRootDevice fixture in Test<>.
using MultiTileAppendMemoryCopyXeHpAndLater = Test<ImplicitScalingRootDevice>;
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
givenCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernels,
@@ -341,7 +342,7 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
}
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLater,
givenMultiTileCommandListWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernels,
IsAtLeastXeHpCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -350,9 +351,9 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
MockAppendMemoryCopy<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.partitionCount = 2;
commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
EXPECT_EQ(2u, commandList.partitionCount);
void *srcPtr = reinterpret_cast<void *>(0x1231);
void *dstPtr = reinterpret_cast<void *>(0x200002345);
@@ -403,7 +404,7 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
}
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
givenCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushWaHandled,
givenCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateKernelsAndL3FlushAddedOnce,
isXeHpOrXeHpgCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
@@ -431,11 +432,95 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + event->getSinglePacketSize();
uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
EXPECT_EQ(3u, event->getPacketsInUse());
EXPECT_EQ(3u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
commandList.commandContainer.getCommandStream()->getUsed()));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(3u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
auto thirdWalker = itorWalkers[2];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*thirdWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto itorPipeControls = findAll<PIPE_CONTROL *>(firstWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
postSyncPipeControls++;
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
EXPECT_EQ(0u, postSyncPipeControls);
EXPECT_EQ(1u, dcFlushFound);
}
HWTEST2_F(MultiTileAppendMemoryCopyXeHpAndLater,
givenMultiTileCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernelsAndL3FlusAddedForScopedEvent,
isXeHpOrXeHpgCore) {
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
EXPECT_EQ(2u, commandList.partitionCount);
auto &commandContainer = commandList.commandContainer;
void *srcPtr = reinterpret_cast<void *>(0x1231);
void *dstPtr = reinterpret_cast<void *>(0x200002345);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 2 * event->getSinglePacketSize();
uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize();
size_t usedBefore = commandContainer.getCommandStream()->getUsed();
commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
size_t usedAfter = commandContainer.getCommandStream()->getUsed();
EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
EXPECT_EQ(6u, event->getPacketsInUse());
@@ -443,8 +528,9 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
commandList.commandContainer.getCommandStream()->getUsed()));
cmdList,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(3u, itorWalkers.size());
@@ -464,113 +550,29 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto itorPipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint64_t eventGpuAddress = firstKernelEventAddress + event->getSinglePacketSize();
if (event->isUsingContextEndOffset()) {
eventGpuAddress += event->getContextEndOffset();
}
auto itorPipeControls = findAll<PIPE_CONTROL *>(thirdWalker, cmdList.end());
uint32_t postSyncPipeControls = 0;
uint32_t dcFlushFound = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_EQ(cmd->getImmediateData(), Event::STATE_SIGNALED);
EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
EXPECT_FALSE(cmd->getWorkloadPartitionIdOffsetEnable());
EXPECT_TRUE(cmd->getDcFlushEnable());
EXPECT_EQ(eventGpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
postSyncPipeControls++;
eventGpuAddress += (2 * event->getSinglePacketSize());
}
if (cmd->getDcFlushEnable()) {
dcFlushFound++;
}
}
EXPECT_EQ(3u, postSyncPipeControls);
}
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
givenMultiTileCommandListAndEventWithSignalScopeWhenTimestampProvidedByComputeWalkerPostSyncPassedToMemoryCopyThenAppendProfilingCalledForThreeSeparateMultiTileKernelsAndL3FlushWaHandled,
isXeHpOrXeHpgCore) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.partitionCount = 2;
commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1231);
void *dstPtr = reinterpret_cast<void *>(0x200002345);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ze_result_t result = ZE_RESULT_SUCCESS;
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
uint64_t firstKernelEventAddress = event->getGpuAddress(device);
uint64_t secondKernelEventAddress = event->getGpuAddress(device) + 4 * event->getSinglePacketSize();
uint64_t thirdKernelEventAddress = event->getGpuAddress(device) + 8 * event->getSinglePacketSize();
commandList.appendMemoryCopy(dstPtr, srcPtr, 0x100002345, event->toHandle(), 0, nullptr);
EXPECT_EQ(3u, commandList.appendMemoryCopyKernelWithGACalled);
EXPECT_EQ(0u, commandList.appendMemoryCopyBlitCalled);
EXPECT_EQ(12u, event->getPacketsInUse());
EXPECT_EQ(3u, event->getKernelCount());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0),
commandList.commandContainer.getCommandStream()->getUsed()));
auto itorWalkers = findAll<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(3u, itorWalkers.size());
auto firstWalker = itorWalkers[0];
auto secondWalker = itorWalkers[1];
auto thirdWalker = itorWalkers[2];
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*firstWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(firstKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*secondWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(secondKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
walkerCmd = genCmdCast<COMPUTE_WALKER *>(*thirdWalker);
EXPECT_EQ(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
EXPECT_EQ(thirdKernelEventAddress, walkerCmd->getPostSync().getDestinationAddress());
auto itorPipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint64_t eventGpuAddress = firstKernelEventAddress + 2 * event->getSinglePacketSize();
if (event->isUsingContextEndOffset()) {
eventGpuAddress += event->getContextEndOffset();
}
uint32_t postSyncPipeControls = 0;
for (auto it : itorPipeControls) {
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_EQ(cmd->getImmediateData(), Event::STATE_SIGNALED);
EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
EXPECT_TRUE(cmd->getDcFlushEnable());
EXPECT_EQ(eventGpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
postSyncPipeControls++;
eventGpuAddress += (4 * event->getSinglePacketSize());
}
}
EXPECT_EQ(3u, postSyncPipeControls);
constexpr uint32_t expectedDcFlush = 2; //dc flush for last cross-tile sync and separately for signal scope event after last kernel split
EXPECT_EQ(0u, postSyncPipeControls);
EXPECT_EQ(expectedDcFlush, dcFlushFound);
}
HWTEST2_F(AppendMemoryCopyXeHpAndLater,
givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushForLastPC, isXeHpOrXeHpgCore) {
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
@@ -601,7 +603,10 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
ptrOffset(commandContainer.getCommandStream()->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto pipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
auto itorWalker = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itorWalker);
auto pipeControls = findAll<PIPE_CONTROL *>(itorWalker, cmdList.end());
uint32_t postSyncFound = 0;
uint32_t dcFlushFound = 0;
ASSERT_NE(0u, pipeControls.size());
@@ -616,16 +621,9 @@ HWTEST2_F(AppendMemoryCopyXeHpAndLater,
}
}
uint32_t expectedDcFlushFound = 2u;
constexpr uint32_t expectedDcFlushFound = 1u;
auto &hwInfo = device->getHwInfo();
auto &hwInfoConfig = (*NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily));
const auto waPair = hwInfoConfig.isPipeControlPriorToNonPipelinedStateCommandsWARequired(hwInfo, true);
if (waPair.first) {
expectedDcFlushFound++;
}
EXPECT_EQ(2u, postSyncFound);
EXPECT_EQ(0u, postSyncFound);
EXPECT_EQ(expectedDcFlushFound, dcFlushFound);
}