feature: handle passing separate epilogue immediate command buffer

Related-To: NEO-10356

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-03-06 11:39:37 +00:00
committed by Compute-Runtime-Automation
parent bb61dafd72
commit 2b370f6a6f
6 changed files with 156 additions and 11 deletions

View File

@@ -240,6 +240,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) override;
void handleInOrderNonWalkerSignaling(Event *event, bool &hasStallingCmds, bool &relaxedOrderingDispatch, ze_result_t &result);
CommandQueue *getCmdQImmediate(bool copyOffloadOperation) const;
NEO::LinearStream *getOptionalEpilogueCmdStream(NEO::LinearStream *taskCmdStream, NEO::AppendOperations appendOperation);
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;

View File

@@ -119,14 +119,12 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::updateDispatchFlagsWithRequi
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushBcsTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool requireTaskCountUpdate, NEO::AppendOperations appendOperation, NEO::CommandStreamReceiver *csr) {
NEO::LinearStream *optionalEpilogueCmdStream = nullptr;
NEO::DispatchBcsFlags dispatchBcsFlags(
this->isSyncModeQueue || requireTaskCountUpdate, // flushTaskCount
hasStallingCmds, // hasStallingCmds
hasRelaxedOrderingDependencies // hasRelaxedOrderingDependencies
);
dispatchBcsFlags.optionalEpilogueCmdStream = optionalEpilogueCmdStream;
dispatchBcsFlags.optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation);
dispatchBcsFlags.dispatchOperation = appendOperation;
CommandListImp::storeReferenceTsToMappedEvents(true);
@@ -261,7 +259,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmedia
handleHeapsAndResidencyForImmediateRegularTask<streamStatesSupported>(sshCpuPointer);
}
NEO::LinearStream *optionalEpilogueCmdStream = nullptr;
NEO::LinearStream *optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation);
NEO::ImmediateDispatchFlags dispatchFlags{
&this->requiredStreamState, // requiredState
@@ -292,7 +290,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmedia
handleHeapsAndResidencyForImmediateRegularTask<streamStatesSupported>(sshCpuPointer);
}
NEO::LinearStream *optionalEpilogueCmdStream = nullptr;
NEO::LinearStream *optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation);
NEO::ImmediateDispatchFlags dispatchFlags{
nullptr, // requiredState
@@ -332,7 +330,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushRegular
this->isSyncModeQueue, // blocking
this->isSyncModeQueue, // dcFlush
this->getCommandListSLMEnable(), // useSLM
this->isSyncModeQueue, // guardCommandBufferWithPipeControl
this->isSyncModeQueue || requireTaskCountUpdate, // guardCommandBufferWithPipeControl
false, // gsba32BitRequired
false, // lowPriority
true, // implicitFlush
@@ -349,6 +347,8 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushRegular
false // isDcFlushRequiredOnStallingCommandsOnNextFlush
);
dispatchFlags.optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation);
auto ioh = (this->commandContainer.getIndirectHeap(NEO::IndirectHeap::Type::indirectObject));
NEO::IndirectHeap *dsh = nullptr;
NEO::IndirectHeap *ssh = nullptr;
@@ -436,6 +436,14 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
auto commandStream = this->commandContainer.getCommandStream();
size_t commandStreamStart = this->cmdListCurrentStartOffset;
if (appendOperation == NEO::AppendOperations::cmdList && this->dispatchCmdListBatchBufferAsPrimary) {
auto cmdListStartCmdBufferStream = reinterpret_cast<CommandQueueImp *>(cmdQ)->getStartingCmdBuffer();
// check if queue starting stream is the same as immediate, if not - regular cmdlist is the starting command buffer
if (cmdListStartCmdBufferStream != commandStream) {
commandStream = cmdListStartCmdBufferStream;
commandStreamStart = 0u;
}
}
auto csr = static_cast<CommandQueueImp *>(cmdQ)->getCsr();
auto lockCSR = outerLock != nullptr ? std::move(*outerLock) : csr->obtainUniqueOwnership();
@@ -494,7 +502,8 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
auto cmdQImp = static_cast<CommandQueueImp *>(cmdQ);
cmdQImp->clearHeapContainer();
this->cmdListCurrentStartOffset = commandStream->getUsed();
// save offset from immediate stream - even when not used to dispatch commands, can be used for epilogue
this->cmdListCurrentStartOffset = this->commandContainer.getCommandStream()->getUsed();
this->containsAnyKernel = false;
this->handlePostSubmissionState();
@@ -1186,6 +1195,18 @@ CommandQueue *CommandListCoreFamilyImmediate<gfxCoreFamily>::getCmdQImmediate(bo
return copyOffloadOperation ? this->cmdQImmediateCopyOffload : this->cmdQImmediate;
}
// Selects the optional command stream that should receive the epilogue of a flush.
//
// taskCmdStream   - stream that is about to be flushed as the task command buffer.
// appendOperation - kind of append operation being flushed.
//
// Returns the immediate command list's own command stream when a command list append
// is dispatched with the primary batch buffer mode enabled and the flushed stream is a
// different stream; returns nullptr in every other case.
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::LinearStream *CommandListCoreFamilyImmediate<gfxCoreFamily>::getOptionalEpilogueCmdStream(NEO::LinearStream *taskCmdStream, NEO::AppendOperations appendOperation) {
    const bool cmdListDispatchedAsPrimary = (appendOperation == NEO::AppendOperations::cmdList) && this->dispatchCmdListBatchBufferAsPrimary;
    if (!cmdListDispatchedAsPrimary) {
        return nullptr;
    }

    // a regular cmd list acts as the main command buffer only when the flushed stream differs
    // from the immediate one - in that case the immediate stream is handed over for the epilogue
    auto immediateCmdStream = this->commandContainer.getCommandStream();
    return (immediateCmdStream != taskCmdStream) ? immediateCmdStream : nullptr;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies,
NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate,
@@ -1715,7 +1736,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendCommandLists(ui
}
bool hasStallingCmds = true;
return flushImmediate(ret, true, hasStallingCmds, relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, true, &mainAppendLock);
return flushImmediate(ret, true, hasStallingCmds, relaxedOrderingDispatch, NEO::AppendOperations::cmdList, false, hSignalEvent, true, &mainAppendLock);
}
} // namespace L0

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
using BaseClass::csr;
using BaseClass::desc;
using BaseClass::device;
using BaseClass::firstCmdListStream;
using BaseClass::preemptionCmdSyncProgramming;
using BaseClass::printfKernelContainer;
using BaseClass::startingCmdBuffer;

View File

@@ -21,6 +21,7 @@
#include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
#include "level_zero/core/source/event/event.h"
#include "level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h"
#include "level_zero/core/source/image/image_hw.h"
#include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl"
#include "level_zero/core/test/unit_tests/fixtures/host_pointer_manager_fixture.h"
@@ -1591,5 +1592,109 @@ HWTEST2_F(ImmediateCommandListTest, givenImmediateCmdListWhenAppendingRegularThe
}
}
// Appends a closed regular command list twice to an immediate command list that has primary
// batch buffer dispatch enabled, and checks on the second append that:
//  - the queue's firstCmdListStream is the flushed stream and the queue's starting command buffer,
//  - the immediate command stream is passed as optionalEpilogueCmdStream,
//  - the flushed batch buffer uses the regular command list's allocation,
//  - the immediate stream range written by that append contains MI_BATCH_BUFFER_END.
HWTEST2_F(ImmediateCommandListTest,
givenImmediateCmdListWithPrimaryBatchBufferWhenAppendingRegularCmdListThenCorrectEpilogueCmdBufferIsUsed, MatchAny) {
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
commandList->close();
auto cmdListHandle = commandList->toHandle();
auto regularCmdBufferStream = commandList->getCmdContainer().getCommandStream();
auto regularCmdBufferAllocation = regularCmdBufferStream->getGraphicsAllocation();
auto cmdQImmediate = static_cast<WhiteBox<::L0::CommandQueue> *>(commandListImmediate->cmdQImmediate);
// enable primary batch buffer dispatch on both the immediate command list and its queue
commandListImmediate->dispatchCmdListBatchBufferAsPrimary = true;
cmdQImmediate->dispatchCmdListBatchBufferAsPrimary = true;
auto dispatchRegularBufferLinearStream = &cmdQImmediate->firstCmdListStream;
// first append can carry preamble
commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr);
// batch buffer recording is switched on only after the first append, so it captures the second one
ultCsr.recordFlushedBatchBuffer = true;
auto immediateCmdBufferStream = commandListImmediate->getCmdContainer().getCommandStream();
// remember the used size before the second append so only its additions are parsed below
auto immediateCmdBufferOffset = immediateCmdBufferStream->getUsed();
// no preamble - regular cmdlist buffer will be first and immediate cmd buffer will be epilogue
commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr);
// the CSR members holding the recorded flags and flushed stream differ between the two flush paths
if (L0GfxCoreHelper::useImmediateComputeFlushTask(device->getNEODevice()->getRootDeviceEnvironment())) {
EXPECT_EQ(NEO::AppendOperations::cmdList, ultCsr.recordedImmediateDispatchFlags.dispatchOperation);
EXPECT_EQ(dispatchRegularBufferLinearStream, ultCsr.lastFlushedImmediateCommandStream);
EXPECT_EQ(immediateCmdBufferStream, ultCsr.recordedImmediateDispatchFlags.optionalEpilogueCmdStream);
} else {
EXPECT_EQ(dispatchRegularBufferLinearStream, ultCsr.lastFlushedCommandStream);
EXPECT_EQ(immediateCmdBufferStream, ultCsr.recordedDispatchFlags.optionalEpilogueCmdStream);
}
// the flushed batch buffer must be backed by the regular command list allocation
EXPECT_EQ(regularCmdBufferAllocation, ultCsr.latestFlushedBatchBuffer.commandBufferAllocation);
auto startStream = static_cast<L0::CommandQueueImp *>(commandListImmediate->cmdQImmediate)->getStartingCmdBuffer();
EXPECT_EQ(dispatchRegularBufferLinearStream, startStream);
// parse only the immediate stream range added by the second append and look for MI_BATCH_BUFFER_END
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(immediateCmdBufferStream->getCpuBase(), immediateCmdBufferOffset),
immediateCmdBufferStream->getUsed() - immediateCmdBufferOffset));
auto iterator = find<MI_BATCH_BUFFER_END *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), iterator);
}
// Copy engine group variant: recreates the regular and immediate command lists for
// NEO::EngineGroupType::copy, appends the closed regular command list twice with primary batch
// buffer dispatch enabled, and checks on the second append that:
//  - the recorded BCS dispatch flags carry AppendOperations::cmdList,
//  - the queue's firstCmdListStream is the flushed BCS stream and the queue's starting command buffer,
//  - the immediate command stream is passed as optionalEpilogueCmdStream,
//  - the flushed batch buffer uses the regular command list's allocation,
//  - the immediate stream range written by that append contains MI_BATCH_BUFFER_END.
HWTEST2_F(ImmediateCommandListTest,
givenCopyEngineImmediateCmdListWithPrimaryBatchBufferWhenAppendingRegularCmdListThenCorrectEpilogueCmdBufferIsUsed, MatchAny) {
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
// NOTE(review): returnValue is written by create/createImmediate below but never checked in this test
ze_result_t returnValue;
commandList.reset(CommandList::whiteboxCast(CommandList::create(productFamily, device, NEO::EngineGroupType::copy, 0u, returnValue, false)));
commandList->close();
auto cmdListHandle = commandList->toHandle();
ze_command_queue_desc_t desc = {};
commandListImmediate.reset(CommandList::whiteboxCast(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::copy, returnValue)));
auto regularCmdBufferStream = commandList->getCmdContainer().getCommandStream();
auto regularCmdBufferAllocation = regularCmdBufferStream->getGraphicsAllocation();
auto cmdQImmediate = static_cast<WhiteBox<::L0::CommandQueue> *>(commandListImmediate->cmdQImmediate);
// the CSR is taken from the immediate queue itself rather than from the device
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(cmdQImmediate->csr);
// enable primary batch buffer dispatch on both the immediate command list and its queue
commandListImmediate->dispatchCmdListBatchBufferAsPrimary = true;
cmdQImmediate->dispatchCmdListBatchBufferAsPrimary = true;
auto dispatchRegularBufferLinearStream = &cmdQImmediate->firstCmdListStream;
// first append can carry preamble
commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr);
// batch buffer recording is switched on only after the first append, so it captures the second one
ultCsr->recordFlushedBatchBuffer = true;
auto immediateCmdBufferStream = commandListImmediate->getCmdContainer().getCommandStream();
// remember the used size before the second append so only its additions are parsed below
auto immediateCmdBufferOffset = immediateCmdBufferStream->getUsed();
// no preamble - regular cmdlist buffer will be first and immediate cmd buffer will be epilogue
commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr);
EXPECT_EQ(NEO::AppendOperations::cmdList, ultCsr->recordedBcsDispatchFlags.dispatchOperation);
EXPECT_EQ(dispatchRegularBufferLinearStream, ultCsr->lastFlushedBcsCommandStream);
EXPECT_EQ(immediateCmdBufferStream, ultCsr->recordedBcsDispatchFlags.optionalEpilogueCmdStream);
// the flushed batch buffer must be backed by the regular command list allocation
EXPECT_EQ(regularCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
auto startStream = static_cast<L0::CommandQueueImp *>(commandListImmediate->cmdQImmediate)->getStartingCmdBuffer();
EXPECT_EQ(dispatchRegularBufferLinearStream, startStream);
// parse only the immediate stream range added by the second append and look for MI_BATCH_BUFFER_END
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(immediateCmdBufferStream->getCpuBase(), immediateCmdBufferOffset),
immediateCmdBufferStream->getUsed() - immediateCmdBufferOffset));
auto iterator = find<MI_BATCH_BUFFER_END *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), iterator);
}
} // namespace ult
} // namespace L0

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -46,4 +46,8 @@ struct DispatchFlagsHelper {
false // isDcFlushRequiredOnStallingCommandsOnNextFlush
);
}
// Builds DispatchBcsFlags with all three constructor arguments set to false,
// for use as a neutral initial value in tests.
static DispatchBcsFlags createDefaultBcsDispatchFlags() {
    constexpr bool flushTaskCount = false;
    constexpr bool hasStallingCmds = false;
    constexpr bool hasRelaxedOrderingDependencies = false;
    return DispatchBcsFlags(flushTaskCount, hasStallingCmds, hasRelaxedOrderingDependencies);
}
};

View File

@@ -177,7 +177,8 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily> {
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield)
: BaseClass(executionEnvironment, rootDeviceIndex, deviceBitfield), recursiveLockCounter(0),
recordedDispatchFlags(DispatchFlagsHelper::createDefaultDispatchFlags()) {
recordedDispatchFlags(DispatchFlagsHelper::createDefaultDispatchFlags()),
recordedBcsDispatchFlags(DispatchFlagsHelper::createDefaultBcsDispatchFlags()) {
this->downloadAllocationImpl = [this](GraphicsAllocation &graphicsAllocation) {
this->downloadAllocationUlt(graphicsAllocation);
};
@@ -238,6 +239,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily> {
Device &device) override {
recordedImmediateDispatchFlags = dispatchFlags;
this->lastFlushedCommandStream = &commandStream;
this->lastFlushedImmediateCommandStream = &immediateCommandStream;
return BaseClass::flushImmediateTask(immediateCommandStream, immediateCommandStreamStart, dispatchFlags, device);
}
@@ -247,9 +249,17 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily> {
Device &device) override {
recordedImmediateDispatchFlags = dispatchFlags;
this->lastFlushedCommandStream = &commandStream;
this->lastFlushedImmediateCommandStream = &immediateCommandStream;
return BaseClass::flushImmediateTaskStateless(immediateCommandStream, immediateCommandStreamStart, dispatchFlags, device);
}
// Test override: records the address of the flushed stream and a copy of the dispatch flags,
// so tests can inspect them afterwards, then forwards the call unchanged to the base class.
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
                             const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override {
    lastFlushedBcsCommandStream = &commandStreamTask;
    recordedBcsDispatchFlags = dispatchBcsFlags;
    return BaseClass::flushBcsTask(commandStreamTask, commandStreamTaskStart, dispatchBcsFlags, hwInfo);
}
SubmissionStatus initializeDeviceWithFirstSubmission(Device &device) override {
initializeDeviceWithFirstSubmissionCalled++;
return BaseClass::initializeDeviceWithFirstSubmission(device);
@@ -566,6 +576,8 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily> {
TaskCountType flushBcsTaskReturnValue{};
LinearStream *lastFlushedCommandStream = nullptr;
LinearStream *lastFlushedImmediateCommandStream = nullptr;
LinearStream *lastFlushedBcsCommandStream = nullptr;
LinearStream *commandStreamHeaplessStateInit = nullptr;
const IndirectHeap *recordedSsh = nullptr;
@@ -588,6 +600,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily> {
mutable uint32_t checkGpuHangDetectedCalled = 0;
int ensureCommandBufferAllocationCalled = 0;
DispatchFlags recordedDispatchFlags;
DispatchBcsFlags recordedBcsDispatchFlags;
ImmediateDispatchFlags recordedImmediateDispatchFlags = {};
BlitPropertiesContainer receivedBlitProperties = {};
uint32_t createAllocationForHostSurfaceCalled = 0;