feature: add patch preamble to Level Zero command queue handling of bb_start commands

Related-To: NEO-15376

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-08-01 12:56:02 +00:00
committed by Compute-Runtime-Automation
parent 91a9ccaebe
commit 01889c97a5
5 changed files with 448 additions and 12 deletions

View File

@@ -89,6 +89,9 @@ struct CommandQueue : _ze_command_queue_handle_t {
this->isWalkerWithProfilingEnqueued = false;
return retVal;
}
inline void setPatchingPreamble(bool patching) {
this->patchingPreamble = patching;
}
protected:
bool frontEndTrackingEnabled() const;
@@ -111,6 +114,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
bool heaplessModeEnabled = false;
bool heaplessStateInitEnabled = false;
bool isWalkerWithProfilingEnqueued = false;
bool patchingPreamble = false;
};
using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,

View File

@@ -77,11 +77,14 @@ struct CommandQueueHw : public CommandQueueImp {
NEO::StreamProperties cmdListBeginState{};
uint64_t scratchGsba = 0;
uint64_t childGpuAddressPositionBeforeDynamicPreamble = 0;
uint64_t currentGpuAddressForChainedBbStart = 0;
size_t spaceForResidency = 10;
size_t bufferSpaceForPatchPreamble = 0;
CommandList *firstCommandList = nullptr;
CommandList *lastCommandList = nullptr;
void *currentPatchForChainedBbStart = nullptr;
void *currentPatchPreambleBuffer = nullptr;
NEO::ScratchSpaceController *scratchSpaceController = nullptr;
NEO::GraphicsAllocation *globalStatelessAllocation = nullptr;
std::unique_lock<std::mutex> *outerLockForIndirect = nullptr;
@@ -150,6 +153,9 @@ struct CommandQueueHw : public CommandQueueImp {
bool stateCacheFlushRequired);
inline size_t estimateCommandListSecondaryStart(CommandList *commandList);
inline size_t estimateCommandListPrimaryStart(bool required);
inline size_t estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists);
inline void retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream);
inline void dispatchPatchPreambleEnding(CommandListExecutionContext &ctx);
inline size_t estimateCommandListResidencySize(CommandList *commandList);
inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx);

View File

@@ -202,6 +202,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId());
}
this->retrivePatchPreambleSpace(ctx, *streamForDispatch);
for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(commandListHandles[i]);
@@ -221,6 +223,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
this->dispatchPatchPreambleEnding(ctx);
if (!ctx.containsParentImmediateStream) {
this->assignCsrTaskCountToFenceIfAvailable(hFence);
@@ -255,6 +258,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
}
linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
for (uint32_t i = 0; i < numCommandLists; i++) {
auto cmdList = CommandList::fromHandle(commandListHandles[i]);
@@ -401,6 +405,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
}
}
this->retrivePatchPreambleSpace(ctx, *streamForDispatch);
for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(commandListHandles[i]);
@@ -436,6 +442,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
this->dispatchPatchPreambleEnding(ctx);
this->csr->setPreemptionMode(ctx.statePreemption);
@@ -482,6 +489,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
linearStreamSizeEstimate += estimateCommandListSecondaryStart(commandList);
}
linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
if (fenceRequired) {
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleAdditionalSynchronization(NEO::FenceType::release, device->getNEODevice()->getRootDeviceEnvironment());
@@ -501,6 +509,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(*streamForDispatch);
this->csr->programHardwareContext(*streamForDispatch);
this->retrivePatchPreambleSpace(ctx, *streamForDispatch);
for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(phCommandLists[i]);
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
@@ -512,6 +522,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
this->dispatchPatchPreambleEnding(ctx);
this->makeCsrTagAllocationResident();
@@ -897,6 +908,49 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPrimaryStart(bool requi
return 0;
}
template <GFXCORE_FAMILY gfxCoreFamily>
// Estimates the queue-stream space needed for the patch preamble and records it
// in ctx.bufferSpaceForPatchPreamble. Returns 0 when preamble patching is disabled.
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists) {
    size_t encodeSize = 0;
    if (this->patchingPreamble) {
        // Each chained BB_START is written indirectly via EncodeDataMemory
        // (memory-store commands carrying a BB_START payload), so the per-list
        // cost is the encode size of one BB_START-sized payload.
        constexpr size_t bbStartSize = NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
        size_t singleBbStartEncodeSize = NEO::EncodeDataMemory<GfxFamily>::getCommandSizeForEncode(bbStartSize);
        encodeSize = singleBbStartEncodeSize * numCommandLists;
        // barrier command to pause between patch preamble completion and execution of command lists
        encodeSize += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
        // Two arbitration-check commands bracket the preamble (one opening, one closing
        // — see retrivePatchPreambleSpace/dispatchPatchPreambleEnding).
        encodeSize += 2 * NEO::EncodeMiArbCheck<GfxFamily>::getCommandSize();
        // Remember the reserved size so the space can be claimed from the stream later.
        ctx.bufferSpaceForPatchPreamble = encodeSize;
        // patch preamble dispatched into queue's buffer forces not to use cmdlist as a starting buffer
        this->forceBbStartJump = true;
    }
    return encodeSize;
}
template <GFXCORE_FAMILY gfxCoreFamily>
// Claims the previously estimated patch-preamble space from the queue's command
// stream, zero-fills it, and opens the preamble with an arbitration-check command.
// On exit ctx.currentPatchPreambleBuffer points just past that command, where the
// BB_START patch encodings are appended.
void CommandQueueHw<gfxCoreFamily>::retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream) {
    if (this->patchingPreamble) {
        ctx.currentPatchPreambleBuffer = commandStream.getSpace(ctx.bufferSpaceForPatchPreamble);
        // Zero the whole reserved region up front; unused remainder stays as NOOPs.
        memset(ctx.currentPatchPreambleBuffer, 0, ctx.bufferSpaceForPatchPreamble);
        NEO::EncodeMiArbCheck<GfxFamily>::program(ctx.currentPatchPreambleBuffer, true);
        // Advance the cursor past the opening arb-check so later encodes append after it.
        ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, NEO::EncodeMiArbCheck<GfxFamily>::getCommandSize());
    }
}
template <GFXCORE_FAMILY gfxCoreFamily>
// Closes the patch preamble: emits a single barrier — so the BB_START patch
// writes complete before the patched command lists execute — followed by the
// closing arbitration-check command.
void CommandQueueHw<gfxCoreFamily>::dispatchPatchPreambleEnding(CommandListExecutionContext &ctx) {
    if (this->patchingPreamble) {
        // Default-constructed barrier arguments; no extra flush flags requested here.
        NEO::PipeControlArgs args;
        NEO::MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(ctx.currentPatchPreambleBuffer, args);
        ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier());
        // false == closing arb-check, pairing the opening one from retrivePatchPreambleSpace.
        NEO::EncodeMiArbCheck<GfxFamily>::program(ctx.currentPatchPreambleBuffer, false);
    }
}
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListResidencySize(CommandList *commandList) {
return commandList->getCmdContainer().getResidencyContainer().size();
@@ -1018,6 +1072,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
linearStreamSizeEstimate += NEO::PreemptionHelper::getRequiredStateSipCmdSize<GfxFamily>(*neoDevice, this->csr->isRcs());
}
linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0);
bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && (ctx.globalInit || this->forceBbStartJump);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly);
@@ -1223,10 +1278,17 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatc
if (ctx.currentPatchForChainedBbStart) {
// dynamic preamble, 2nd or later command list
// jump from previous command list to the position before dynamic preamble
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
bbStartPatchLocation,
ctx.childGpuAddressPositionBeforeDynamicPreamble,
false, false, false);
if (this->patchingPreamble) {
NEO::EncodeDataMemory<GfxFamily>::programBbStart(ctx.currentPatchPreambleBuffer,
ctx.currentGpuAddressForChainedBbStart,
ctx.childGpuAddressPositionBeforeDynamicPreamble,
false, false, false);
} else {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
bbStartPatchLocation,
ctx.childGpuAddressPositionBeforeDynamicPreamble,
false, false, false);
}
}
// dynamic preamble, jump from current position, after dynamic preamble to the current command list
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false);
@@ -1243,14 +1305,22 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatc
this->startingCmdBuffer = &this->firstCmdListStream;
} else {
// chain between command lists when no dynamic preamble required between 2nd and next command list
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
bbStartPatchLocation,
cmdListFirstCmdBuffer->getGpuAddress(),
false, false, false);
if (this->patchingPreamble) {
NEO::EncodeDataMemory<GfxFamily>::programBbStart(ctx.currentPatchPreambleBuffer,
ctx.currentGpuAddressForChainedBbStart,
cmdListFirstCmdBuffer->getGpuAddress(),
false, false, false);
} else {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
bbStartPatchLocation,
cmdListFirstCmdBuffer->getGpuAddress(),
false, false, false);
}
}
}
ctx.currentPatchForChainedBbStart = cmdListContainer.getEndCmdPtr();
ctx.currentGpuAddressForChainedBbStart = cmdListContainer.getEndCmdGpuAddress();
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1307,10 +1377,17 @@ void CommandQueueHw<gfxCoreFamily>::programLastCommandListReturnBbStart(
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
if (this->dispatchCmdListBatchBufferAsPrimary) {
auto finalReturnPosition = commandStream.getCurrentGpuAddressPosition();
auto bbStartCmd = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(bbStartCmd,
finalReturnPosition,
false, false, false);
if (this->patchingPreamble) {
NEO::EncodeDataMemory<GfxFamily>::programBbStart(ctx.currentPatchPreambleBuffer,
ctx.currentGpuAddressForChainedBbStart,
finalReturnPosition,
false, false, false);
} else {
auto bbStartCmd = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(bbStartCmd,
finalReturnPosition,
false, false, false);
}
}
}

View File

@@ -61,6 +61,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
using CommandQueue::heaplessStateInitEnabled;
using CommandQueue::internalUsage;
using CommandQueue::partitionCount;
using CommandQueue::patchingPreamble;
using CommandQueue::pipelineSelectStateTracking;
using CommandQueue::stateBaseAddressTracking;
using CommandQueue::stateComputeModeTracking;
@@ -105,6 +106,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using L0::CommandQueue::heaplessStateInitEnabled;
using L0::CommandQueue::internalUsage;
using L0::CommandQueue::partitionCount;
using L0::CommandQueue::patchingPreamble;
using L0::CommandQueue::pipelineSelectStateTracking;
using L0::CommandQueue::preemptionCmdSyncProgramming;
using L0::CommandQueue::stateBaseAddressTracking;

View File

@@ -946,5 +946,352 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenRegularCommandListNotCl
commandQueue->destroy();
}
// Verifies that with preamble patching enabled, executing a single command list
// emits MI_STORE_DATA_IMM commands that write a BB_START into the command list's
// end-command slot, and that the reconstructed BB_START returns to the queue
// stream right after the queue's own starting BB_START. Checked for two
// consecutive executions to cover re-patching of the same command list.
HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenSingleCmdListExecutedThenPatchPreambleContainsEncodingReturningBbStartCmd) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
    // Scratch buffer to reassemble the patched BB_START from the SDI payload dwords.
    uint32_t bbStartDwordBuffer[sizeof(MI_BATCH_BUFFER_START) / sizeof(uint32_t)] = {0};
    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0u;
    queueDesc.index = 0u;
    queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
    WhiteBox<L0::CommandQueue> *commandQueue = whiteboxCast(CommandQueue::create(productFamily,
                                                                                device,
                                                                                neoDevice->getDefaultEngine().commandStreamReceiver,
                                                                                &queueDesc,
                                                                                false,
                                                                                false,
                                                                                false,
                                                                                returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ze_command_list_handle_t commandLists[] = {commandList->toHandle()};
    commandList->close();
    // endGpuAddress is where the chained/return BB_START gets patched;
    // startGpuAddress is the first command buffer the queue jumps into.
    uint64_t endGpuAddress = commandList->getCmdContainer().getEndCmdGpuAddress();
    uint64_t startGpuAddress = commandList->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress();
    commandQueue->setPatchingPreamble(true);
    void *queueCpuBase = commandQueue->commandStream.getCpuBase();
    uint64_t queueGpuBase = commandQueue->commandStream.getGpuBase();
    // --- first execution ---
    auto usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(1, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        queueCpuBase,
        usedSpaceAfter));
    GenCmdList::iterator patchCmdIterator = cmdList.end();
    size_t bbStartIdx = 0;
    // Collect the SDI patch writes: each targets a consecutive qword of the
    // command list's end-command slot; accumulate their payload dwords.
    auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    for (auto &sdiCmd : sdiCmds) {
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());
        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        bbStartIdx++;
        patchCmdIterator = sdiCmd;
    }
    // The first BB_START after the patch writes jumps into the command list.
    auto bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());
    auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
    EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());
    // Expected return target: the queue-stream position immediately after that BB_START.
    size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
    offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
    uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
    // Reassembled SDI payload must decode as a BB_START returning to the queue.
    MI_BATCH_BUFFER_START *chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainBackBbStartCmd);
    EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    // --- second execution: repeat the checks on the newly appended region only ---
    usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(1, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    cmdList.clear();
    bbStartIdx = 0;
    memset(bbStartDwordBuffer, 0, sizeof(bbStartDwordBuffer));
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        ptrOffset(queueCpuBase, usedSpaceBefore),
        usedSpaceAfter - usedSpaceBefore));
    sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    for (auto &sdiCmd : sdiCmds) {
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());
        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        bbStartIdx++;
        patchCmdIterator = sdiCmd;
    }
    bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());
    // first BB_START after the patch writes should be the one that jumps to the beginning of the 1st command list
    startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
    EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());
    offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
    offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
    expectedReturnAddress = queueGpuBase + offsetToReturn;
    chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainBackBbStartCmd);
    EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    commandList->destroy();
    commandQueue->destroy();
}
// Verifies that with preamble patching enabled, executing two command lists
// (with differing SBA requirements to provoke a possible dynamic preamble)
// emits two groups of MI_STORE_DATA_IMM patch writes: one chaining list 1 to
// list 2 (directly, or via the queue when SBA tracking inserts a dynamic
// preamble) and one returning from list 2 to the queue stream. Checked for two
// consecutive executions to cover re-patching.
HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenTwoCmdListsExecutedThenPatchPreambleContainsEncodingReturningAndChainingBbStartCmd) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
    // Scratch buffers to reassemble the two patched BB_STARTs from SDI payloads:
    // buffer 1 = chain (end of list 1), buffer 2 = return (end of list 2).
    uint32_t bbStartDwordBuffer[sizeof(MI_BATCH_BUFFER_START) / sizeof(uint32_t)] = {0};
    uint32_t bbStartDwordBuffer2[sizeof(MI_BATCH_BUFFER_START) / sizeof(uint32_t)] = {0};
    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0u;
    queueDesc.index = 0u;
    queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
    WhiteBox<L0::CommandQueue> *commandQueue = whiteboxCast(CommandQueue::create(productFamily,
                                                                                device,
                                                                                neoDevice->getDefaultEngine().commandStreamReceiver,
                                                                                &queueDesc,
                                                                                false,
                                                                                false,
                                                                                false,
                                                                                returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto commandList2 = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    // Different surface-state base addresses so SBA tracking may force a
    // dynamic preamble between the two lists (the branch checked below).
    commandList->getRequiredStreamState().stateBaseAddress.surfaceStateBaseAddress.set(0x1000);
    commandList2->getRequiredStreamState().stateBaseAddress.surfaceStateBaseAddress.set(0x2000);
    ze_command_list_handle_t commandLists[] = {commandList->toHandle(), commandList2->toHandle()};
    commandList->close();
    commandList2->close();
    uint64_t startGpuAddress = commandList->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress();
    uint64_t chainedGpuAddress = commandList->getCmdContainer().getEndCmdGpuAddress();
    uint64_t start2GpuAddress = commandList2->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress();
    uint64_t endGpuAddress = commandList2->getCmdContainer().getEndCmdGpuAddress();
    commandQueue->setPatchingPreamble(true);
    void *queueCpuBase = commandQueue->commandStream.getCpuBase();
    uint64_t queueGpuBase = commandQueue->commandStream.getGpuBase();
    // --- first execution ---
    auto usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(2, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        queueCpuBase,
        usedSpaceAfter));
    GenCmdList::iterator patchCmdIterator = cmdList.end();
    size_t bbStartIdx = 0;
    // The SDI commands come in two equal halves: first half patches the chain
    // BB_START at the end of list 1, second half the return BB_START at the
    // end of list 2; gather both payloads in lockstep.
    auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    size_t sdiSizeHalf = sdiCmds.size() / 2;
    for (uint32_t i = 0; i < sdiSizeHalf; i++) {
        auto &sdiCmd = sdiCmds[i];
        auto &sdiCmd2 = sdiCmds[i + sdiSizeHalf];
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        auto storeDataImm2 = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd2);
        EXPECT_EQ(chainedGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm2->getAddress());
        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        bbStartDwordBuffer2[2 * bbStartIdx] = storeDataImm2->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        if (storeDataImm2->getStoreQword()) {
            bbStartDwordBuffer2[2 * bbStartIdx + 1] = storeDataImm2->getDataDword1();
        }
        bbStartIdx++;
        patchCmdIterator = sdiCmd2;
    }
    // Both reassembled payloads must decode as BB_START commands.
    MI_BATCH_BUFFER_START *chainLinkBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainLinkBbStartCmd);
    MI_BATCH_BUFFER_START *chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer2);
    ASSERT_NE(nullptr, chainBackBbStartCmd);
    auto bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());
    // single bb start means two command lists are chained directly
    // two bb starts means command lists are chained to queue as SBA tracking makes dynamic preamble to reprogram SBA difference between lists
    if (bbStarts.size() == 1u) {
        EXPECT_EQ(start2GpuAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());
        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());
        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    } else {
        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());
        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
        // Chain goes back into the queue (dynamic preamble), which then jumps into list 2.
        EXPECT_EQ(expectedReturnAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());
        auto dynamicPremableStartingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[1]);
        EXPECT_EQ(start2GpuAddress, dynamicPremableStartingBbStart->getBatchBufferStartAddress());
        offsetToReturn = ptrDiff(dynamicPremableStartingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
        expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    }
    // --- second execution: repeat the checks on the newly appended region only ---
    usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(2, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    cmdList.clear();
    bbStartIdx = 0;
    memset(bbStartDwordBuffer, 0, sizeof(bbStartDwordBuffer));
    memset(bbStartDwordBuffer2, 0, sizeof(bbStartDwordBuffer2));
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        ptrOffset(queueCpuBase, usedSpaceBefore),
        usedSpaceAfter - usedSpaceBefore));
    sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    sdiSizeHalf = sdiCmds.size() / 2;
    for (uint32_t i = 0; i < sdiSizeHalf; i++) {
        auto &sdiCmd = sdiCmds[i];
        auto &sdiCmd2 = sdiCmds[i + sdiSizeHalf];
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        auto storeDataImm2 = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd2);
        EXPECT_EQ(chainedGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm2->getAddress());
        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        bbStartDwordBuffer2[2 * bbStartIdx] = storeDataImm2->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        if (storeDataImm2->getStoreQword()) {
            bbStartDwordBuffer2[2 * bbStartIdx + 1] = storeDataImm2->getDataDword1();
        }
        bbStartIdx++;
        patchCmdIterator = sdiCmd2;
    }
    chainLinkBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainLinkBbStartCmd);
    chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer2);
    ASSERT_NE(nullptr, chainBackBbStartCmd);
    bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());
    // single bb start means two command lists are chained directly
    // two bb starts means command lists are chained to queue as SBA tracking makes dynamic preamble to reprogram SBA difference between lists
    if (bbStarts.size() == 1u) {
        EXPECT_EQ(start2GpuAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());
        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());
        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    } else {
        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());
        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());
        auto dynamicPremableStartingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[1]);
        EXPECT_EQ(start2GpuAddress, dynamicPremableStartingBbStart->getBatchBufferStartAddress());
        offsetToReturn = ptrDiff(dynamicPremableStartingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);
        expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    }
    commandList->destroy();
    commandList2->destroy();
    commandQueue->destroy();
}
} // namespace ult
} // namespace L0