feature: add support for wait event preamble in append command list

- add mechanism in queue to force a batch buffer start command that jumps from the queue stream to the regular command list
- add detection in immediate command list of the need to dispatch an extra start command in the queue
- fix secondary linear stream in immediate case as it should not use container
- modify tests for primary batch buffer dispatch as default mode
- remove invalid or obsolete tests

Related-To: NEO-10356

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-03-07 09:52:44 +00:00
committed by Compute-Runtime-Automation
parent 571e0f2ba3
commit f8be8414ac
11 changed files with 111 additions and 101 deletions

View File

@@ -438,7 +438,9 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
size_t commandStreamStart = this->cmdListCurrentStartOffset;
if (appendOperation == NEO::AppendOperations::cmdList && this->dispatchCmdListBatchBufferAsPrimary) {
auto cmdListStartCmdBufferStream = reinterpret_cast<CommandQueueImp *>(cmdQ)->getStartingCmdBuffer();
// check if queue starting stream is the same as immediate, if not - regular cmdlist is the starting command buffer
// check if queue starting stream is the same as immediate,
// if they are the same - immediate command list buffer has preamble in it, including jump from immediate to regular cmdlist - proceed normally
// if not - regular cmdlist is the starting command buffer - no queue preamble or waiting commands
if (cmdListStartCmdBufferStream != commandStream) {
commandStream = cmdListStartCmdBufferStream;
commandStreamStart = 0u;
@@ -1720,7 +1722,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendCommandLists(ui
return ret;
}
auto mainAppendLock = static_cast<CommandQueueImp *>(this->cmdQImmediate)->getCsr()->obtainUniqueOwnership();
auto queueImp = static_cast<CommandQueueImp *>(this->cmdQImmediate);
auto mainAppendLock = queueImp->getCsr()->obtainUniqueOwnership();
if (this->dispatchCmdListBatchBufferAsPrimary) {
// check if wait event preamble or implicit synchronization is present and force bb start jump in queue, even when no preamble is required there
if (this->commandContainer.getCommandStream()->getUsed() != this->cmdListCurrentStartOffset) {
queueImp->triggerBbStartJump();
}
}
ret = this->cmdQImmediate->executeCommandLists(numCommandLists, phCommandLists, nullptr, true, this->commandContainer.getCommandStream());
if (ret != ZE_RESULT_SUCCESS) {
return ret;

View File

@@ -253,7 +253,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
}
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
for (uint32_t i = 0; i < numCommandLists; i++) {
auto cmdList = CommandList::fromHandle(commandListHandles[i]);
linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
@@ -486,7 +486,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
ctx.spaceForResidency += estimateCommandListResidencySize(commandList);
}
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
if (fenceRequired) {
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleAdditionalSynchronization(device->getNEODevice()->getRootDeviceEnvironment());
}
@@ -1020,7 +1020,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
}
bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0);
bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && ctx.globalInit;
bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && (ctx.globalInit || this->forceBbStartJump);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly);
return linearStreamSizeEstimate;
@@ -1217,7 +1217,7 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatc
auto bbStartPatchLocation = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);
bool dynamicPreamble = ctx.childGpuAddressPositionBeforeDynamicPreamble != commandStream.getCurrentGpuAddressPosition();
if (ctx.globalInit || dynamicPreamble) {
if (ctx.globalInit || dynamicPreamble || this->forceBbStartJump) {
if (ctx.currentPatchForChainedBbStart) {
// dynamic preamble, 2nd or later command list
// jump from previous command list to the position before dynamic preamble
@@ -1230,6 +1230,7 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatc
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false);
ctx.globalInit = false;
this->forceBbStartJump = false;
} else {
if (ctx.currentPatchForChainedBbStart == nullptr) {
// nothing to dispatch from queue, first command list will be used as submitting batch buffer to KMD or ULLS

View File

@@ -114,6 +114,9 @@ struct CommandQueueImp : public CommandQueue {
// Returns the stored startingCmdBuffer pointer; const accessor, the queue is not modified.
// NOTE(review): ownership and nullability of the returned stream are not visible here - confirm against the code that assigns startingCmdBuffer.
NEO::LinearStream *getStartingCmdBuffer() const {
return startingCmdBuffer;
}
// Requests a forced batch buffer start jump by setting the forceBbStartJump flag.
// This method only sets the flag and never clears it; resetting is left to the code that consumes the flag.
// NOTE(review): forceBbStartJump is a plain bool, so callers presumably serialize access externally
// (e.g. by holding the CSR ownership lock before calling) - confirm.
void triggerBbStartJump() {
forceBbStartJump = true;
}
protected:
MOCKABLE_VIRTUAL NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
@@ -171,6 +174,7 @@ struct CommandQueueImp : public CommandQueue {
std::atomic<bool> cmdListWithAssertExecuted = false;
bool useKmdWaitFunction = false;
bool forceBbStartJump = false;
};
} // namespace L0