feature: add support for encoding front end command to patch preamble

Related-To: NEO-15376

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-08-05 10:26:20 +00:00
committed by Compute-Runtime-Automation
parent e88de52133
commit b0e9267e62
7 changed files with 140 additions and 19 deletions

View File

@@ -496,7 +496,7 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
*csr);
}
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, false);
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, false, nullptr);
} else {
lockForIndirect = std::move(*outerLockForIndirect);
cmdQImp->makeResidentForResidencyContainer(this->commandContainer.getResidencyContainer());

View File

@@ -54,7 +54,8 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t perThreadScratchSpaceSlot1Size);
bool getPreemptionCmdProgramming() override;
void patchCommands(CommandList &commandList, uint64_t scratchAddress, bool patchNewScratchController);
void patchCommands(CommandList &commandList, uint64_t scratchAddress, bool patchNewScratchController,
void **patchPreambleBuffer);
protected:
struct CommandListExecutionContext {
@@ -154,6 +155,7 @@ struct CommandQueueHw : public CommandQueueImp {
inline size_t estimateCommandListSecondaryStart(CommandList *commandList);
inline size_t estimateCommandListPrimaryStart(bool required);
inline size_t estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists);
inline size_t estimateCommandListPatchPreambleFrontEndCmd(CommandListExecutionContext &ctx, CommandList *commandList);
inline void retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream);
inline void dispatchPatchPreambleEnding(CommandListExecutionContext &ctx);
inline size_t estimateCommandListResidencySize(CommandList *commandList);

View File

@@ -262,6 +262,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
for (uint32_t i = 0; i < numCommandLists; i++) {
auto cmdList = CommandList::fromHandle(commandListHandles[i]);
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
}
@@ -908,6 +909,19 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPrimaryStart(bool requi
return 0;
}
// Estimates the space needed to encode the front end commands of a single
// command list into the patch preamble.
//
// ctx          execution context; its bufferSpaceForPatchPreamble is increased
//              by the returned amount.
// commandList  command list whose front end patch list count scales the estimate.
//
// Returns 0 when this queue is not patching the preamble; otherwise the encode
// size of one front end command (as reported by EncodeDataMemory for the size
// returned by PreambleHelper::getVFECommandsSize()) multiplied by the number of
// front end entries in the command list's patch list.
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreambleFrontEndCmd(CommandListExecutionContext &ctx, CommandList *commandList) {
    // Without patch preamble there is nothing to reserve and ctx stays untouched.
    if (!this->patchingPreamble) {
        return 0;
    }

    const size_t perCommandEncodeSize =
        NEO::EncodeDataMemory<GfxFamily>::getCommandSizeForEncode(NEO::PreambleHelper<GfxFamily>::getVFECommandsSize());
    const size_t totalEncodeSize = perCommandEncodeSize * commandList->getFrontEndPatchListCount();

    // Account for the same amount in the preamble buffer reservation.
    ctx.bufferSpaceForPatchPreamble += totalEncodeSize;
    return totalEncodeSize;
}
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists) {
size_t encodeSize = 0;
@@ -920,7 +934,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreamble(CommandLi
encodeSize += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
encodeSize += 2 * NEO::EncodeMiArbCheck<GfxFamily>::getCommandSize();
ctx.bufferSpaceForPatchPreamble = encodeSize;
ctx.bufferSpaceForPatchPreamble += encodeSize;
// patch preamble dispatched into queue's buffer forces not to use cmdlist as a starting buffer
this->forceBbStartJump = true;
@@ -1021,6 +1035,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
const NEO::StreamProperties &requiredStreamState = cmdList->getRequiredStreamState();
const NEO::StreamProperties &finalStreamState = cmdList->getFinalStreamState();
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, cmdList,
streamProperties, requiredStreamState, finalStreamState,
cmdListState.requiredState,
@@ -1902,7 +1917,7 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, Comm
}
}
patchCommands(commandList, scratchAddress, patchNewScratchController);
patchCommands(commandList, scratchAddress, patchNewScratchController, &ctx.currentPatchPreambleBuffer);
if (patchNewScratchController) {
commandList.setCommandListUsedScratchController(ctx.scratchSpaceController);

View File

@@ -132,7 +132,8 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &heapC
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress,
bool patchNewScratchController) {
bool patchNewScratchController,
void **patchPreambleBuffer) {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

View File

@@ -164,7 +164,8 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &sshHe
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress,
bool patchNewScratchController) {
bool patchNewScratchController,
void **patchPreambleBuffer) {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
@@ -181,7 +182,11 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
cfeStateCmd->setScratchSpaceBuffer(lowScratchAddress);
NEO::PreambleHelper<GfxFamily>::setSingleSliceDispatchMode(cfeStateCmd, false);
*reinterpret_cast<CFE_STATE *>(commandToPatch.pDestination) = *cfeStateCmd;
if (this->patchingPreamble) {
NEO::EncodeDataMemory<GfxFamily>::programDataMemory(*patchPreambleBuffer, commandToPatch.gpuAddress, commandToPatch.pCommand, sizeof(CFE_STATE));
} else {
*reinterpret_cast<CFE_STATE *>(commandToPatch.pDestination) = *cfeStateCmd;
}
break;
} else {
UNRECOVERABLE_IF(true);

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/scratch_space_controller.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
@@ -1524,6 +1525,103 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenHeapfulSupportWhenAppendVfeStateCm
}
}
// Verifies that, with patch preamble enabled on the queue, executing a command list
// holding two FrontEndState patch entries emits MI_STORE_DATA_IMM commands into the
// queue's command stream that target the recorded GPU addresses of those entries and
// whose payload reassembles into the patched CFE_STATE commands.
HWTEST2_F(CommandListAppendLaunchKernel, GivenPatchPreambleActiveWhenExecutingCommandListWithFrontEndCmdInPatchListThenExpectPatchPreambleEncoding, IsAtLeastXeCore) {
// The scenario is skipped on families where heapless mode is required.
if constexpr (FamilyType::isHeaplessRequired() == true) {
GTEST_SKIP();
} else {
using CFE_STATE = typename FamilyType::CFE_STATE;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
ze_result_t returnValue;
ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
auto commandQueue = whiteboxCast(CommandQueue::create(productFamily,
device,
neoDevice->getDefaultEngine().commandStreamReceiver,
&queueDesc,
false,
false,
false,
returnValue));
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
returnValue = commandList->initialize(device, NEO::EngineGroupType::compute, 0u);
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto commandListHandle = commandList->toHandle();
// NOTE(review): presumably a non-zero per-thread scratch size is what makes the queue
// patch a scratch address into the front end commands - confirm against patchCommands.
commandList->setCommandListPerThreadScratchSize(0, 0x1000);
// Capture the stream GPU position before each append; the patch entry is expected to record it.
auto expectedGpuAddress = commandList->getCmdContainer().getCommandStream()->getCurrentGpuAddressPosition();
commandList->appendVfeStateCmdToPatch();
ASSERT_NE(0u, commandList->commandsToPatch.size());
EXPECT_EQ(CommandToPatch::FrontEndState, commandList->commandsToPatch[0].type);
EXPECT_EQ(expectedGpuAddress, commandList->commandsToPatch[0].gpuAddress);
EXPECT_EQ(1u, commandList->getFrontEndPatchListCount());
// Second front end entry: the count must grow and the new GPU address must be recorded.
auto expectedGpuAddress2 = commandList->getCmdContainer().getCommandStream()->getCurrentGpuAddressPosition();
commandList->appendVfeStateCmdToPatch();
EXPECT_EQ(CommandToPatch::FrontEndState, commandList->commandsToPatch[1].type);
EXPECT_EQ(expectedGpuAddress2, commandList->commandsToPatch[1].gpuAddress);
EXPECT_EQ(2u, commandList->getFrontEndPatchListCount());
commandList->close();
// Host-side copies of both commands, compared against the decoded SDI payload at the end.
void *cfeInputPtr = commandList->commandsToPatch[0].pCommand;
void *cfeInputPtr2 = commandList->commandsToPatch[1].pCommand;
commandQueue->setPatchingPreamble(true);
// Remember the queue stream range written by this execute call so only it gets parsed.
void *queueCpuBase = commandQueue->commandStream.getCpuBase();
auto usedSpaceBefore = commandQueue->commandStream.getUsed();
returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto usedSpaceAfter = commandQueue->commandStream.getUsed();
// Scratch patch address is narrowed to 32 bits for comparison with getScratchSpaceBuffer().
auto scratchAddress = static_cast<uint32_t>(commandQueue->getCsr()->getScratchSpaceController()->getScratchPatchAddress());
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(queueCpuBase, usedSpaceBefore),
usedSpaceAfter - usedSpaceBefore));
// Dword buffers used to reassemble each CFE_STATE from the SDI payloads.
uint32_t cfeStateDwordBuffer[sizeof(CFE_STATE) / sizeof(uint32_t)] = {0};
uint32_t cfeStateDwordBuffer2[sizeof(CFE_STATE) / sizeof(uint32_t)] = {0};
auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
// More than six SDIs must be present; only the first six are inspected below.
ASSERT_LT(6u, sdiCmds.size());
// CFE_STATE size is qword aligned and are only commands dispatched into command lists, so optimal number of SDIs - 3xqword
// SDIs [0..2] are expected to carry the first CFE_STATE, SDIs [3..5] the second one.
for (uint32_t i = 0; i < 3; i++) {
auto storeDataImmForCfe = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmds[i]);
auto storeDataImmForCfe2 = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmds[i + 3]);
// Each SDI writes one qword at consecutive 8-byte offsets from the recorded GPU address.
EXPECT_EQ(expectedGpuAddress + i * sizeof(uint64_t), storeDataImmForCfe->getAddress());
EXPECT_EQ(expectedGpuAddress2 + i * sizeof(uint64_t), storeDataImmForCfe2->getAddress());
EXPECT_TRUE(storeDataImmForCfe->getStoreQword());
EXPECT_TRUE(storeDataImmForCfe2->getStoreQword());
cfeStateDwordBuffer[2 * i] = storeDataImmForCfe->getDataDword0();
cfeStateDwordBuffer[2 * i + 1] = storeDataImmForCfe->getDataDword1();
cfeStateDwordBuffer2[2 * i] = storeDataImmForCfe2->getDataDword0();
cfeStateDwordBuffer2[2 * i + 1] = storeDataImmForCfe2->getDataDword1();
}
// The reassembled dwords must decode as CFE_STATE carrying the queue's scratch address.
auto cfeEncodedCmd = genCmdCast<CFE_STATE *>(cfeStateDwordBuffer);
ASSERT_NE(nullptr, cfeEncodedCmd);
EXPECT_EQ(scratchAddress, cfeEncodedCmd->getScratchSpaceBuffer());
auto cfeEncodedCmd2 = genCmdCast<CFE_STATE *>(cfeStateDwordBuffer2);
ASSERT_NE(nullptr, cfeEncodedCmd2);
EXPECT_EQ(scratchAddress, cfeEncodedCmd2->getScratchSpaceBuffer());
// The encoded payload must be byte-identical to the host-side commands kept in the patch list.
EXPECT_EQ(0, memcmp(cfeInputPtr, cfeStateDwordBuffer, sizeof(CFE_STATE)));
EXPECT_EQ(0, memcmp(cfeInputPtr2, cfeStateDwordBuffer2, sizeof(CFE_STATE)));
commandQueue->destroy();
}
}
HWTEST2_F(CommandListAppendLaunchKernel, whenUpdateStreamPropertiesIsCalledThenCorrectThreadArbitrationPolicyIsSet, IsHeapfulSupported) {
DebugManagerStateRestore restorer;
debugManager.flags.ForceThreadArbitrationPolicyProgrammingWithScm.set(1);

View File

@@ -981,9 +981,9 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
auto commandQueue = std::make_unique<MockCommandQueueHw<FamilyType::gfxCoreFamily>>(device, csr, &desc);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.push_back({});
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.clear();
if constexpr (FamilyType::isHeaplessRequired()) {
@@ -992,7 +992,7 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
commandToPatch.pCommand = nullptr;
commandToPatch.type = CommandToPatch::FrontEndState;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.clear();
} else {
using CFE_STATE = typename FamilyType::CFE_STATE;
@@ -1021,7 +1021,7 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
}
uint64_t patchedScratchAddress = 0xABCD00;
commandQueue->patchCommands(*commandList, patchedScratchAddress, false);
commandQueue->patchCommands(*commandList, patchedScratchAddress, false, nullptr);
for (size_t i = 0; i < 4; i++) {
EXPECT_EQ(patchedScratchAddress, destinationCfeStates[i].getScratchSpaceBuffer());
auto &sourceCfeState = *reinterpret_cast<CFE_STATE *>(commandList->commandsToPatch[i].pCommand);
@@ -1041,21 +1041,21 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWh
auto commandQueue = std::make_unique<MockCommandQueueHw<FamilyType::gfxCoreFamily>>(device, csr, &desc);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.push_back({});
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.clear();
CommandToPatch commandToPatch;
commandToPatch.type = CommandToPatch::FrontEndState;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.clear();
commandToPatch.type = CommandToPatch::Invalid;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false, nullptr));
commandList->commandsToPatch.clear();
}
@@ -1100,7 +1100,7 @@ HWTEST2_F(CommandQueueScratchTests, givenInlineDataScratchWhenPatchCommandsIsCal
cmd.scratchAddressAfterPatch = testCase.scratchAlreadyPatched ? scratchAddress : 0;
commandList->commandsToPatch.push_back(cmd);
commandQueue->patchCommands(*commandList, scratchAddress, testCase.scratchControllerChanged);
commandQueue->patchCommands(*commandList, scratchAddress, testCase.scratchControllerChanged, nullptr);
EXPECT_EQ(testCase.expectedValue, scratchBuffer);
}
@@ -1133,7 +1133,7 @@ HWTEST2_F(CommandQueueScratchTests, givenImplicitArgsScratchWhenPatchCommandsIsC
cmd.scratchAddressAfterPatch = scratchAlreadyPatched ? scratchAddress : 0;
commandList->commandsToPatch.push_back(cmd);
commandQueue->patchCommands(*commandList, scratchAddress, scratchControllerChanged);
commandQueue->patchCommands(*commandList, scratchAddress, scratchControllerChanged, nullptr);
EXPECT_EQ(expectedValue, scratchBuffer);
}
@@ -1162,12 +1162,12 @@ HWTEST_F(CommandQueueCreate, givenCommandsToPatchWithNoopSpacePatchWhenPatchComm
commandToPatch.patchSize = dataSize;
commandList->commandsToPatch.push_back(commandToPatch);
commandQueue->patchCommands(*commandList, 0, false);
commandQueue->patchCommands(*commandList, 0, false, nullptr);
EXPECT_EQ(0, memcmp(patchBuffer.get(), zeroBuffer.get(), dataSize));
memset(patchBuffer.get(), 0xFF, dataSize);
commandList->commandsToPatch[0].pDestination = nullptr;
commandQueue->patchCommands(*commandList, 0, false);
commandQueue->patchCommands(*commandList, 0, false, nullptr);
EXPECT_NE(0, memcmp(patchBuffer.get(), zeroBuffer.get(), dataSize));
}