feature: synchronize patch preamble with wait commands before patching

Related-To: NEO-16140

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-09-17 11:30:29 +00:00
committed by Compute-Runtime-Automation
parent 8ab463f47c
commit 0b65b86ccb
9 changed files with 214 additions and 25 deletions

View File

@@ -521,12 +521,15 @@ struct CommandList : _ze_command_list_handle_t {
return activeScratchPatchElements;
}
bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
// Records the tag and the task count associated with the latest submission of this command list.
// NOTE(review): this span is a rendered diff hunk with its +/- markers stripped. The first
// signature/assignment pair (raw uint64_t GPU address) looks like the removed version and the
// second pair (NEO::GraphicsAllocation pointer stored in latesTagGpuAllocation) like the added
// one - confirm against the repository before treating either as current code.
void saveLatestTagAndTaskCount(uint64_t tagGpuAddress, TaskCountType submittedTaskCount) {
this->latestTagGpuAddress = tagGpuAddress;
void saveLatestTagAndTaskCount(NEO::GraphicsAllocation *tagGpuAllocation, TaskCountType submittedTaskCount) {
this->latesTagGpuAllocation = tagGpuAllocation;
this->latestTaskCount = submittedTaskCount;
}
// Returns the GPU address of the tag recorded for this command list.
// NOTE(review): rendered diff hunk - two return statements are shown. The first (reading the
// latestTagGpuAddress member) looks like the removed line; the second derives the address from
// the stored allocation and yields 0 when no allocation has been recorded (nullptr).
uint64_t getLatestTagGpuAddress() const {
return this->latestTagGpuAddress;
return this->latesTagGpuAllocation == nullptr ? 0 : this->latesTagGpuAllocation->getGpuAddress();
}
// Accessor for the tag allocation pointer held in latesTagGpuAllocation.
// The returned pointer may be nullptr; callers must check before dereferencing.
NEO::GraphicsAllocation *getLatestTagGpuAllocation() const {
    NEO::GraphicsAllocation *savedTagAllocation = this->latesTagGpuAllocation;
    return savedTagAllocation;
}
TaskCountType getLatestTaskCount() const {
return this->latestTaskCount;
@@ -563,7 +566,6 @@ struct CommandList : _ze_command_list_handle_t {
NEO::L1CachePolicy l1CachePolicyData{};
NEO::EncodeDummyBlitWaArgs dummyBlitWa{};
uint64_t latestTagGpuAddress = 0;
int64_t currentSurfaceStateBaseAddress = NEO::StreamProperty64::initValue;
int64_t currentDynamicStateBaseAddress = NEO::StreamProperty64::initValue;
int64_t currentIndirectObjectBaseAddress = NEO::StreamProperty64::initValue;
@@ -571,6 +573,7 @@ struct CommandList : _ze_command_list_handle_t {
TaskCountType latestTaskCount = 0;
NEO::GraphicsAllocation *latesTagGpuAllocation = nullptr;
ze_context_handle_t hContext = nullptr;
CommandQueue *cmdQImmediate = nullptr;
CommandQueue *cmdQImmediateCopyOffload = nullptr;

View File

@@ -154,7 +154,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
this->inOrderPatchCmds.clear();
this->totalNoopSpace = 0;
this->latestTagGpuAddress = 0;
this->latesTagGpuAllocation = nullptr;
this->latestTaskCount = 0;
return ZE_RESULT_SUCCESS;

View File

@@ -1880,7 +1880,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendCommandLists(ui
requireTaskCountUpdate,
&mainAppendLock,
&mainLockForIndirect);
queueImp->saveTagAndTaskCountForCommandLists(numCommandLists, phCommandLists, queueImp->getCsr()->getTagAllocation()->getGpuAddress(), queueImp->getTaskCount());
queueImp->saveTagAndTaskCountForCommandLists(numCommandLists, phCommandLists, queueImp->getCsr()->getTagAllocation(), queueImp->getTaskCount());
return retCode;
}

View File

@@ -43,11 +43,11 @@ bool CommandQueue::frontEndTrackingEnabled() const {
}
// Stores the given tag and submitted task count on every command list in commandListHandles,
// but only when saveWaitForPreamble is enabled on this queue; otherwise nothing is recorded.
// NOTE(review): rendered diff hunk with +/- markers stripped - the uint64_t tagGpuAddress
// parameter line and the call passing tagGpuAddress look like the removed lines, while the
// NEO::GraphicsAllocation *tagGpuAllocation variants look like their replacements.
void CommandQueue::saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles,
uint64_t tagGpuAddress, TaskCountType submittedTaskCount) {
NEO::GraphicsAllocation *tagGpuAllocation, TaskCountType submittedTaskCount) {
if (this->saveWaitForPreamble) {
for (uint32_t i = 0; i < numCommandLists; i++) {
// Resolve the API handle to the driver-side command list object.
auto commandList = CommandList::fromHandle(commandListHandles[i]);
commandList->saveLatestTagAndTaskCount(tagGpuAddress, submittedTaskCount);
commandList->saveLatestTagAndTaskCount(tagGpuAllocation, submittedTaskCount);
}
}
}
@@ -410,8 +410,9 @@ void CommandQueueImp::makeResidentForResidencyContainer(const NEO::ResidencyCont
}
}
// Tells whether a wait has to be programmed for the command list before its patch preamble.
// NOTE(review): rendered diff hunk with +/- markers stripped - the overload taking a raw
// uint64_t address looks like the removed definition, the CommandList* one like the added one.
// The CommandList* variant returns true only when all of the following hold:
//  - saveWaitForPreamble is enabled on this queue,
//  - the command list reports a non-zero latest tag GPU address,
//  - that address differs from the GPU address of this queue's CSR tag allocation.
bool CommandQueueImp::checkNeededPatchPreambleWait(uint64_t tagGpuAddress) {
return this->saveWaitForPreamble && (getCsr()->getTagAllocation()->getGpuAddress() != tagGpuAddress);
bool CommandQueueImp::checkNeededPatchPreambleWait(CommandList *commandList) {
uint64_t tagGpuAddress = commandList->getLatestTagGpuAddress();
return this->saveWaitForPreamble && (tagGpuAddress != 0) && (getCsr()->getTagAllocation()->getGpuAddress() != tagGpuAddress);
}
} // namespace L0

View File

@@ -101,7 +101,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
return this->saveWaitForPreamble;
}
void saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles,
uint64_t tagGpuAddress, TaskCountType submittedTaskCount);
NEO::GraphicsAllocation *tagGpuAllocation, TaskCountType submittedTaskCount);
protected:
bool frontEndTrackingEnabled() const;

View File

@@ -159,10 +159,12 @@ struct CommandQueueHw : public CommandQueueImp {
inline size_t estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists);
inline size_t estimateCommandListPatchPreambleFrontEndCmd(CommandListExecutionContext &ctx, CommandList *commandList);
inline void getCommandListPatchPreambleData(CommandListExecutionContext &ctx, CommandList *commandList);
inline size_t estimateCommandListPatchPreambleWaitSync(CommandListExecutionContext &ctx, CommandList *commandList);
inline size_t estimateTotalPatchPreambleData(CommandListExecutionContext &ctx);
inline void retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream);
inline void dispatchPatchPreambleEnding(CommandListExecutionContext &ctx);
inline void dispatchPatchPreambleInOrderNoop(CommandListExecutionContext &ctx, CommandList *commandList);
inline void dispatchPatchPreambleCommandListWaitSync(CommandListExecutionContext &ctx, CommandList *commandList);
inline size_t estimateCommandListResidencySize(CommandList *commandList);
inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx);

View File

@@ -209,6 +209,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
this->dispatchPatchPreambleCommandListWaitSync(ctx, commandList);
this->patchCommands(*commandList, ctx);
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
@@ -266,6 +268,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
getCommandListPatchPreambleData(ctx, cmdList);
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
linearStreamSizeEstimate += estimateCommandListPatchPreambleWaitSync(ctx, cmdList);
linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
}
linearStreamSizeEstimate += estimateTotalPatchPreambleData(ctx);
@@ -431,6 +434,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
}
}
this->dispatchPatchPreambleCommandListWaitSync(ctx, commandList);
this->patchCommands(*commandList, ctx);
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
@@ -493,6 +498,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
fenceRequired |= commandList->isTaskCountUpdateFenceRequired();
linearStreamSizeEstimate += estimateCommandListSecondaryStart(commandList);
linearStreamSizeEstimate += estimateCommandListPatchPreambleWaitSync(ctx, commandList);
}
linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
@@ -522,6 +528,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
auto commandList = CommandList::fromHandle(phCommandLists[i]);
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
this->dispatchPatchPreambleCommandListWaitSync(ctx, commandList);
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
this->dispatchPatchPreambleInOrderNoop(ctx, commandList);
@@ -924,6 +932,21 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreambleFrontEndCm
return encodeSize;
}
// Estimates the space needed for the wait-synchronization commands of one command list.
// Returns 0 when preamble patching is disabled or no wait is needed; otherwise the size of two
// MI_LOAD_REGISTER_IMM commands plus one MI_SEMAPHORE_WAIT. When patching is enabled the same
// amount is also accumulated into ctx.bufferSpaceForPatchPreamble.
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreambleWaitSync(CommandListExecutionContext &ctx, CommandList *commandList) {
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;

    // Without preamble patching there is nothing to reserve and ctx stays untouched.
    if (!this->patchingPreamble) {
        return 0;
    }

    size_t waitSize = 0;
    if (this->checkNeededPatchPreambleWait(commandList)) {
        waitSize = (2 * sizeof(MI_LOAD_REGISTER_IMM)) + NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait();
    }

    ctx.bufferSpaceForPatchPreamble += waitSize;
    return waitSize;
}
template <GFXCORE_FAMILY gfxCoreFamily>
inline size_t CommandQueueHw<gfxCoreFamily>::estimateTotalPatchPreambleData(CommandListExecutionContext &ctx) {
size_t encodeSize = 0;
@@ -1007,6 +1030,49 @@ void CommandQueueHw<gfxCoreFamily>::dispatchPatchPreambleInOrderNoop(CommandList
}
}
// Writes the wait-synchronization commands for one command list into the patch preamble buffer
// (ctx.currentPatchPreambleBuffer) and advances that pointer past each written command.
// Nothing is emitted unless preamble patching is enabled and checkNeededPatchPreambleWait()
// reports that a wait is required. The emitted sequence is:
//   1. MI_LOAD_REGISTER_IMM: csGprR0     <- low  part of the command list's latest task count
//   2. MI_LOAD_REGISTER_IMM: csGprR0 + 4 <- high part of the command list's latest task count
//   3. MI_SEMAPHORE_WAIT on the command list's latest tag GPU address
// Finally the tag allocation recorded on the command list is made resident on this queue's CSR.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::dispatchPatchPreambleCommandListWaitSync(CommandListExecutionContext &ctx, CommandList *commandList) {
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;

    // The second check is only evaluated when patching is enabled, as in the nested-if form.
    if (!this->patchingPreamble || !this->checkNeededPatchPreambleWait(commandList)) {
        return;
    }

    constexpr uint32_t lowDwordRegister = RegisterOffsets::csGprR0;
    constexpr uint32_t highDwordRegister = RegisterOffsets::csGprR0 + 4;

    const auto taskCountToWaitFor = commandList->getLatestTaskCount();

    // Programs one register load at the current preamble position and steps over it.
    auto loadRegister = [&](const auto registerOffset, const auto registerValue) {
        auto lriSpace = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(ctx.currentPatchPreambleBuffer);
        NEO::LriHelper<GfxFamily>::program(lriSpace,
                                           registerOffset,
                                           registerValue,
                                           true,
                                           this->isCopyOnlyCommandQueue);
        ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, sizeof(MI_LOAD_REGISTER_IMM));
    };

    loadRegister(lowDwordRegister, getLowPart(taskCountToWaitFor));
    loadRegister(highDwordRegister, getHighPart(taskCountToWaitFor));

    auto semaphoreSpace = reinterpret_cast<MI_SEMAPHORE_WAIT *>(ctx.currentPatchPreambleBuffer);
    NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(semaphoreSpace,
                                                            commandList->getLatestTagGpuAddress(),
                                                            taskCountToWaitFor,
                                                            COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD,
                                                            false,
                                                            true,
                                                            GfxFamily::isQwordInOrderCounter,
                                                            GfxFamily::isQwordInOrderCounter,
                                                            false);
    ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait());

    // The semaphore reads the recorded tag allocation, so it is made resident on this CSR.
    this->csr->makeResident(*commandList->getLatestTagGpuAllocation());
}
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListResidencySize(CommandList *commandList) {
return commandList->getCmdContainer().getResidencyContainer().size();
@@ -1079,6 +1145,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
getCommandListPatchPreambleData(ctx, cmdList);
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
linearStreamSizeEstimate += estimateCommandListPatchPreambleWaitSync(ctx, cmdList);
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, cmdList,
streamProperties, requiredStreamState, finalStreamState,
cmdListState.requiredState,
@@ -1423,10 +1490,6 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartSecondaryBa
}
}
}
if (ctx.containsParentImmediateStream) {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferEnd(commandContainer);
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1609,7 +1672,7 @@ void CommandQueueHw<gfxCoreFamily>::updateTaskCountAndPostSync(bool isDispatchTa
this->taskCount = this->csr->peekTaskCount();
this->csr->setLatestFlushedTaskCount(this->taskCount);
this->saveTagAndTaskCountForCommandLists(numCommandLists, commandListHandles, this->csr->getTagAllocation()->getGpuAddress(), this->taskCount);
this->saveTagAndTaskCountForCommandLists(numCommandLists, commandListHandles, this->csr->getTagAllocation(), this->taskCount);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@@ -120,7 +120,7 @@ struct CommandQueueImp : public CommandQueue {
}
void makeResidentForResidencyContainer(const NEO::ResidencyContainer &residencyContainer);
bool checkNeededPatchPreambleWait(uint64_t tagGpuAddress);
bool checkNeededPatchPreambleWait(CommandList *commandList);
protected:
MOCKABLE_VIRTUAL NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,

View File

@@ -9,6 +9,7 @@
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
#include "shared/test/common/test_macros/hw_test.h"
@@ -1321,31 +1322,39 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
EXPECT_TRUE(commandQueue->getPatchingPreamble());
EXPECT_FALSE(commandQueue->getSaveWaitForPreamble());
uint64_t expectedGpuAddress = commandQueue->getCsr()->getTagAllocation()->getGpuAddress();
NEO::GraphicsAllocation *expectedGpuAllocation = commandQueue->getCsr()->getTagAllocation();
TaskCountType expectedTaskCount = 0x456;
uint64_t expectedGpuAddress = expectedGpuAllocation->getGpuAddress();
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAddress, expectedTaskCount);
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAllocation, expectedTaskCount);
// save and wait is disabled, so nothing to be saved
EXPECT_EQ(0u, commandList->getLatestTagGpuAddress());
EXPECT_EQ(0u, commandList->getLatestTaskCount());
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(expectedGpuAddress));
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(commandList));
commandQueue->setPatchingPreamble(true, true);
EXPECT_TRUE(commandQueue->getPatchingPreamble());
EXPECT_TRUE(commandQueue->getSaveWaitForPreamble());
EXPECT_TRUE(commandQueue->checkNeededPatchPreambleWait(expectedGpuAddress + 1000));
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(expectedGpuAddress));
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(commandList));
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAddress, expectedTaskCount);
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAllocation, expectedTaskCount);
// save and wait is now enabled
EXPECT_EQ(expectedGpuAddress, commandList->getLatestTagGpuAddress());
EXPECT_EQ(expectedTaskCount, commandList->getLatestTaskCount());
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(commandList));
MockGraphicsAllocation otherTagAllocation(nullptr, expectedGpuAddress + 0x1000, 1);
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, &otherTagAllocation, expectedTaskCount);
EXPECT_TRUE(commandQueue->checkNeededPatchPreambleWait(commandList));
commandList->reset();
EXPECT_EQ(0u, commandList->getLatestTagGpuAddress());
EXPECT_EQ(0u, commandList->getLatestTaskCount());
EXPECT_EQ(nullptr, commandList->getLatestTagGpuAllocation());
commandList->destroy();
commandQueue->destroy();
@@ -1376,7 +1385,8 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
commandList->close();
commandQueue->setPatchingPreamble(true, true);
commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
uint64_t expectedGpuAddress = commandQueue->getCsr()->getTagAllocation()->getGpuAddress();
TaskCountType expectedTaskCount = commandQueue->getTaskCount();
@@ -1409,7 +1419,8 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
commandList->close();
immediateCmdList->setPatchingPreamble(true, true);
immediateCmdList->appendCommandLists(1, &commandListHandle, nullptr, 0, nullptr);
returnValue = immediateCmdList->appendCommandLists(1, &commandListHandle, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
uint64_t expectedGpuAddress = immediateQueue->getCsr()->getTagAllocation()->getGpuAddress();
TaskCountType expectedTaskCount = immediateQueue->getTaskCount();
@@ -1420,5 +1431,114 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
commandList->destroy();
}
// Verifies when the queue emits the wait-synchronization sequence (two MI_LOAD_REGISTER_IMM
// followed by MI_SEMAPHORE_WAIT) while executing a regular command list with patch preamble and
// save-wait enabled. The same command list is executed three times:
//   1. no previous submission recorded            -> no semaphore expected
//   2. previous submission used the same queue tag -> no semaphore expected
//   3. recorded tag replaced with a different allocation -> LRI pair + semaphore expected,
//      the other tag allocation made resident, and all MI_STORE_DATA_IMM commands located
//      after the semaphore.
HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingWaitDataWhenCmdListExecutedByQueueThenSemaphoreDispatchedWhenNeeded) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename FamilyType::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
ze_result_t returnValue;
ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
queueDesc.ordinal = 0u;
queueDesc.index = 0u;
queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
WhiteBox<L0::CommandQueue> *commandQueue = whiteboxCast(CommandQueue::create(productFamily,
device,
neoDevice->getDefaultEngine().commandStreamReceiver,
&queueDesc,
false,
false,
false,
returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
// Stand-in for a tag allocation at an address different from this queue's own tag (offset by 0x1000).
MockGraphicsAllocation otherTagAllocation(nullptr, commandQueue->getCsr()->getTagAllocation()->getGpuAddress() + 0x1000, 1);
auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ze_command_list_handle_t commandListHandle = commandList->toHandle();
commandList->close();
commandQueue->setPatchingPreamble(true, true);
// Each execution is parsed separately: only the bytes added to the queue stream by that call.
auto usedSpaceBefore = commandQueue->commandStream.getUsed();
returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto usedSpaceAfter = commandQueue->commandStream.getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList, ptrOffset(commandQueue->commandStream.getCpuBase(), usedSpaceBefore), usedSpaceAfter - usedSpaceBefore));
// first execution of command list, no prior history and no semaphore required
auto semWaitCmds = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(0u, semWaitCmds.size());
usedSpaceBefore = commandQueue->commandStream.getUsed();
returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
usedSpaceAfter = commandQueue->commandStream.getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
cmdList.clear();
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList, ptrOffset(commandQueue->commandStream.getCpuBase(), usedSpaceBefore), usedSpaceAfter - usedSpaceBefore));
// second execution of command list, same tag allocation and no semaphore required
semWaitCmds = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(0u, semWaitCmds.size());
// change tag allocation to simulate previous execution on different context
constexpr uint32_t otherTaskCount = 0x123;
commandList->saveLatestTagAndTaskCount(&otherTagAllocation, otherTaskCount);
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueue->getCsr());
// presumably enables recording of makeResident calls so isMadeResident() below can be checked - TODO confirm
ultCsr->storeMakeResidentAllocations = true;
usedSpaceBefore = commandQueue->commandStream.getUsed();
returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
usedSpaceAfter = commandQueue->commandStream.getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
cmdList.clear();
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList, ptrOffset(commandQueue->commandStream.getCpuBase(), usedSpaceBefore), usedSpaceAfter - usedSpaceBefore));
// third execution of command list, different tag allocation and semaphore required
auto lriCmds = findAll<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(2u, lriCmds.size());
// first register load carries the low part of the recorded task count
auto lriCmd = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(*lriCmds[0]);
EXPECT_EQ(RegisterOffsets::csGprR0, lriCmd->getRegisterOffset());
EXPECT_EQ(getLowPart(otherTaskCount), lriCmd->getDataDword());
// second register load carries the high part, written to the next dword register
lriCmd = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(*lriCmds[1]);
EXPECT_EQ(RegisterOffsets::csGprR0 + 4, lriCmd->getRegisterOffset());
EXPECT_EQ(getHighPart(otherTaskCount), lriCmd->getDataDword());
// search starts at the second register load, so the semaphore must follow both loads
semWaitCmds = findAll<MI_SEMAPHORE_WAIT *>(lriCmds[1], cmdList.end());
ASSERT_EQ(1u, semWaitCmds.size());
auto semWaitCmd = reinterpret_cast<MI_SEMAPHORE_WAIT *>(*semWaitCmds[0]);
EXPECT_EQ(otherTagAllocation.getGpuAddress(), semWaitCmd->getSemaphoreGraphicsAddress());
EXPECT_EQ(COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, semWaitCmd->getCompareOperation());
EXPECT_TRUE(ultCsr->isMadeResident(&otherTagAllocation));
// verify that all sdi patching commands are after wait synchronize
auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), semWaitCmds[0]);
EXPECT_EQ(0u, sdiCmds.size());
sdiCmds = findAll<MI_STORE_DATA_IMM *>(semWaitCmds[0], cmdList.end());
EXPECT_NE(0u, sdiCmds.size());
commandList->destroy();
commandQueue->destroy();
}
} // namespace ult
} // namespace L0