feature: add patch preamble to level zero queue handling bb_start commands
Related-To: NEO-15376
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Commit 01889c97a5 (parent 91a9ccaebe), committed by Compute-Runtime-Automation.
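In short, this change adds an opt-in "patch preamble" to the Level Zero command queue: instead of the CPU writing the chaining and return MI_BATCH_BUFFER_START commands directly into each command list's end position, the queue reserves a preamble region in its own command stream and encodes those BB_START dwords there as store-data commands, so the GPU itself performs the patch right before the command lists execute. The sketch below is only a conceptual illustration of that difference; BbStartCmd, StoreDataCmd, patchOnHost, and encodePatchInPreamble are hypothetical stand-ins, not NEO types or APIs.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct BbStartCmd {              // stand-in for MI_BATCH_BUFFER_START
    uint64_t jumpAddress = 0;
};

struct StoreDataCmd {            // stand-in for MI_STORE_DATA_IMM
    uint64_t destinationGpuAddress = 0;
    uint64_t payload = 0;        // one qword of the encoded BB_START
};

// Default path: the CPU writes the BB_START directly at the patch location.
void patchOnHost(void *patchLocation, uint64_t jumpAddress) {
    BbStartCmd cmd{jumpAddress};
    std::memcpy(patchLocation, &cmd, sizeof(cmd));
}

// Patch-preamble path: emit store-data commands into the queue's preamble;
// the GPU writes the BB_START to patchGpuAddress before the command lists run.
void encodePatchInPreamble(std::vector<StoreDataCmd> &preamble,
                           uint64_t patchGpuAddress, uint64_t jumpAddress) {
    BbStartCmd cmd{jumpAddress};
    uint64_t qwords[sizeof(BbStartCmd) / sizeof(uint64_t)] = {};
    std::memcpy(qwords, &cmd, sizeof(cmd));
    for (size_t i = 0; i < sizeof(qwords) / sizeof(qwords[0]); ++i) {
        preamble.push_back({patchGpuAddress + i * sizeof(uint64_t), qwords[i]});
    }
}

The unit tests at the end of this diff follow the same model: each command list's end-command GPU address receives the encoded BB_START, one qword per MI_STORE_DATA_IMM.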
@@ -89,6 +89,9 @@ struct CommandQueue : _ze_command_queue_handle_t {
        this->isWalkerWithProfilingEnqueued = false;
        return retVal;
    }
    inline void setPatchingPreamble(bool patching) {
        this->patchingPreamble = patching;
    }

  protected:
    bool frontEndTrackingEnabled() const;
@@ -111,6 +114,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
    bool heaplessModeEnabled = false;
    bool heaplessStateInitEnabled = false;
    bool isWalkerWithProfilingEnqueued = false;
    bool patchingPreamble = false;
};

using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,

@@ -77,11 +77,14 @@ struct CommandQueueHw : public CommandQueueImp {
    NEO::StreamProperties cmdListBeginState{};
    uint64_t scratchGsba = 0;
    uint64_t childGpuAddressPositionBeforeDynamicPreamble = 0;
    uint64_t currentGpuAddressForChainedBbStart = 0;

    size_t spaceForResidency = 10;
    size_t bufferSpaceForPatchPreamble = 0;
    CommandList *firstCommandList = nullptr;
    CommandList *lastCommandList = nullptr;
    void *currentPatchForChainedBbStart = nullptr;
    void *currentPatchPreambleBuffer = nullptr;
    NEO::ScratchSpaceController *scratchSpaceController = nullptr;
    NEO::GraphicsAllocation *globalStatelessAllocation = nullptr;
    std::unique_lock<std::mutex> *outerLockForIndirect = nullptr;
@@ -150,6 +153,9 @@ struct CommandQueueHw : public CommandQueueImp {
                          bool stateCacheFlushRequired);
    inline size_t estimateCommandListSecondaryStart(CommandList *commandList);
    inline size_t estimateCommandListPrimaryStart(bool required);
    inline size_t estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists);
    inline void retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream);
    inline void dispatchPatchPreambleEnding(CommandListExecutionContext &ctx);
    inline size_t estimateCommandListResidencySize(CommandList *commandList);
    inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
    inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx);

@@ -202,6 +202,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
        neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId());
    }

    this->retrivePatchPreambleSpace(ctx, *streamForDispatch);

    for (auto i = 0u; i < numCommandLists; ++i) {
        auto commandList = CommandList::fromHandle(commandListHandles[i]);

@@ -221,6 +223,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(

    this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
    this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
    this->dispatchPatchPreambleEnding(ctx);

    if (!ctx.containsParentImmediateStream) {
        this->assignCsrTaskCountToFenceIfAvailable(hFence);
@@ -255,6 +258,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
        linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
    }

    linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
    linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
    for (uint32_t i = 0; i < numCommandLists; i++) {
        auto cmdList = CommandList::fromHandle(commandListHandles[i]);
@@ -401,6 +405,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
        }
    }

    this->retrivePatchPreambleSpace(ctx, *streamForDispatch);

    for (auto i = 0u; i < numCommandLists; ++i) {
        auto commandList = CommandList::fromHandle(commandListHandles[i]);

@@ -436,6 +442,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
    this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);

    this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
    this->dispatchPatchPreambleEnding(ctx);

    this->csr->setPreemptionMode(ctx.statePreemption);

@@ -482,6 +489,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
        linearStreamSizeEstimate += estimateCommandListSecondaryStart(commandList);
    }

    linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
    linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump);
    if (fenceRequired) {
        linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleAdditionalSynchronization(NEO::FenceType::release, device->getNEODevice()->getRootDeviceEnvironment());
@@ -501,6 +509,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
    this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(*streamForDispatch);
    this->csr->programHardwareContext(*streamForDispatch);

    this->retrivePatchPreambleSpace(ctx, *streamForDispatch);

    for (auto i = 0u; i < numCommandLists; ++i) {
        auto commandList = CommandList::fromHandle(phCommandLists[i]);
        ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
@@ -512,6 +522,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
    this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);

    this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
    this->dispatchPatchPreambleEnding(ctx);

    this->makeCsrTagAllocationResident();

@@ -897,6 +908,49 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPrimaryStart(bool requi
    return 0;
}

template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists) {
    size_t encodeSize = 0;
    if (this->patchingPreamble) {
        constexpr size_t bbStartSize = NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
        size_t singleBbStartEncodeSize = NEO::EncodeDataMemory<GfxFamily>::getCommandSizeForEncode(bbStartSize);
        encodeSize = singleBbStartEncodeSize * numCommandLists;

        // barrier command to pause between patch preamble completion and execution of command lists
        encodeSize += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
        encodeSize += 2 * NEO::EncodeMiArbCheck<GfxFamily>::getCommandSize();

        ctx.bufferSpaceForPatchPreamble = encodeSize;

        // patch preamble dispatched into queue's buffer forces not to use cmdlist as a starting buffer
        this->forceBbStartJump = true;
    }
    return encodeSize;
}
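For intuition about the size estimate above: each command list contributes one BB_START encoded as store-data payloads, and the preamble as a whole adds a single barrier plus two MI_ARB_CHECK commands (one opening, one closing). A minimal sketch of that arithmetic, with made-up byte sizes (the real values are GfxFamily-specific and come from the NEO encoders, not these constants):

#include <cstddef>
#include <cstdio>

constexpr size_t bbStartBytes = 12;           // hypothetical MI_BATCH_BUFFER_START size
constexpr size_t storeDataPerQwordBytes = 16; // hypothetical MI_STORE_DATA_IMM size
constexpr size_t barrierBytes = 48;           // hypothetical single-barrier size
constexpr size_t arbCheckBytes = 4;           // hypothetical MI_ARB_CHECK size

size_t estimatePatchPreambleBytes(size_t numCommandLists) {
    size_t qwordsPerBbStart = (bbStartBytes + 7) / 8;         // round up to qwords
    size_t singleBbStartEncodeSize = qwordsPerBbStart * storeDataPerQwordBytes;
    size_t total = singleBbStartEncodeSize * numCommandLists; // one patch per command list
    total += barrierBytes;                                    // pause between patching and execution
    total += 2 * arbCheckBytes;                               // arb check at preamble start and end
    return total;
}

int main() {
    std::printf("%zu\n", estimatePatchPreambleBytes(2)); // 2 * 32 + 48 + 8 = 120
    return 0;
}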

template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream) {
    if (this->patchingPreamble) {
        ctx.currentPatchPreambleBuffer = commandStream.getSpace(ctx.bufferSpaceForPatchPreamble);
        memset(ctx.currentPatchPreambleBuffer, 0, ctx.bufferSpaceForPatchPreamble);

        NEO::EncodeMiArbCheck<GfxFamily>::program(ctx.currentPatchPreambleBuffer, true);
        ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, NEO::EncodeMiArbCheck<GfxFamily>::getCommandSize());
    }
}

template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::dispatchPatchPreambleEnding(CommandListExecutionContext &ctx) {
    if (this->patchingPreamble) {
        NEO::PipeControlArgs args;

        NEO::MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(ctx.currentPatchPreambleBuffer, args);
        ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier());

        NEO::EncodeMiArbCheck<GfxFamily>::program(ctx.currentPatchPreambleBuffer, false);
    }
}

template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListResidencySize(CommandList *commandList) {
    return commandList->getCmdContainer().getResidencyContainer().size();
@@ -1018,6 +1072,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
        linearStreamSizeEstimate += NEO::PreemptionHelper::getRequiredStateSipCmdSize<GfxFamily>(*neoDevice, this->csr->isRcs());
    }

    linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
    bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0);
    bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && (ctx.globalInit || this->forceBbStartJump);
    linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly);
@@ -1223,10 +1278,17 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatc
        if (ctx.currentPatchForChainedBbStart) {
            // dynamic preamble, 2nd or later command list
            // jump from previous command list to the position before dynamic preamble
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
                bbStartPatchLocation,
                ctx.childGpuAddressPositionBeforeDynamicPreamble,
                false, false, false);
            if (this->patchingPreamble) {
                NEO::EncodeDataMemory<GfxFamily>::programBbStart(ctx.currentPatchPreambleBuffer,
                                                                 ctx.currentGpuAddressForChainedBbStart,
                                                                 ctx.childGpuAddressPositionBeforeDynamicPreamble,
                                                                 false, false, false);
            } else {
                NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
                    bbStartPatchLocation,
                    ctx.childGpuAddressPositionBeforeDynamicPreamble,
                    false, false, false);
            }
        }
        // dynamic preamble, jump from current position, after dynamic preamble to the current command list
        NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false);
@@ -1243,14 +1305,22 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatc
            this->startingCmdBuffer = &this->firstCmdListStream;
        } else {
            // chain between command lists when no dynamic preamble required between 2nd and next command list
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
                bbStartPatchLocation,
                cmdListFirstCmdBuffer->getGpuAddress(),
                false, false, false);
            if (this->patchingPreamble) {
                NEO::EncodeDataMemory<GfxFamily>::programBbStart(ctx.currentPatchPreambleBuffer,
                                                                 ctx.currentGpuAddressForChainedBbStart,
                                                                 cmdListFirstCmdBuffer->getGpuAddress(),
                                                                 false, false, false);
            } else {
                NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
                    bbStartPatchLocation,
                    cmdListFirstCmdBuffer->getGpuAddress(),
                    false, false, false);
            }
        }
    }

    ctx.currentPatchForChainedBbStart = cmdListContainer.getEndCmdPtr();
    ctx.currentGpuAddressForChainedBbStart = cmdListContainer.getEndCmdGpuAddress();
}

template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1307,10 +1377,17 @@ void CommandQueueHw<gfxCoreFamily>::programLastCommandListReturnBbStart(
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
    if (this->dispatchCmdListBatchBufferAsPrimary) {
        auto finalReturnPosition = commandStream.getCurrentGpuAddressPosition();
        auto bbStartCmd = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);
        NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(bbStartCmd,
                                                                             finalReturnPosition,
                                                                             false, false, false);
        if (this->patchingPreamble) {
            NEO::EncodeDataMemory<GfxFamily>::programBbStart(ctx.currentPatchPreambleBuffer,
                                                             ctx.currentGpuAddressForChainedBbStart,
                                                             finalReturnPosition,
                                                             false, false, false);
        } else {
            auto bbStartCmd = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(bbStartCmd,
                                                                                 finalReturnPosition,
                                                                                 false, false, false);
        }
    }
}
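To make the chaining above concrete, here is a toy walk-through with hypothetical addresses (not NEO code): each command list's end location must receive a BB_START that jumps either to the next list's start (or, when a dynamic preamble is needed, to the queue position just before that preamble), and the last list's end jumps back to the queue stream. With patchingPreamble enabled, each of these patches is emitted into the queue's preamble as store-data commands instead of being written by the CPU.

#include <cstdint>
#include <cstdio>
#include <vector>

struct CmdListInfo {
    uint64_t startGpuAddress;
    uint64_t endCmdGpuAddress; // location of the chained end BB_START
};

struct PatchRequest {
    uint64_t patchAtGpuAddress; // where the BB_START dwords must land
    uint64_t jumpToGpuAddress;  // target of that BB_START
};

int main() {
    // hypothetical addresses for two closed command lists
    std::vector<CmdListInfo> lists = {{0x1000, 0x1f00}, {0x3000, 0x3f00}};
    uint64_t queueReturnPosition = 0x9000; // hypothetical queue position after the last jump

    std::vector<PatchRequest> patches;
    uint64_t pendingPatchAddress = 0; // mirrors ctx.currentGpuAddressForChainedBbStart

    for (const auto &list : lists) {
        if (pendingPatchAddress != 0) {
            // previous list's end BB_START jumps to this list's start
            patches.push_back({pendingPatchAddress, list.startGpuAddress});
        }
        pendingPatchAddress = list.endCmdGpuAddress;
    }
    // last list's end BB_START returns to the queue's command stream
    patches.push_back({pendingPatchAddress, queueReturnPosition});

    for (const auto &p : patches) {
        std::printf("patch at 0x%llx -> jump to 0x%llx\n",
                    static_cast<unsigned long long>(p.patchAtGpuAddress),
                    static_cast<unsigned long long>(p.jumpToGpuAddress));
    }
    return 0;
}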

@@ -61,6 +61,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
    using CommandQueue::heaplessStateInitEnabled;
    using CommandQueue::internalUsage;
    using CommandQueue::partitionCount;
    using CommandQueue::patchingPreamble;
    using CommandQueue::pipelineSelectStateTracking;
    using CommandQueue::stateBaseAddressTracking;
    using CommandQueue::stateComputeModeTracking;
@@ -105,6 +106,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
    using L0::CommandQueue::heaplessStateInitEnabled;
    using L0::CommandQueue::internalUsage;
    using L0::CommandQueue::partitionCount;
    using L0::CommandQueue::patchingPreamble;
    using L0::CommandQueue::pipelineSelectStateTracking;
    using L0::CommandQueue::preemptionCmdSyncProgramming;
    using L0::CommandQueue::stateBaseAddressTracking;

@@ -946,5 +946,352 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenRegularCommandListNotCl
    commandQueue->destroy();
}

HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenSingleCmdListExecutedThenPatchPreambleContainsEncodingReturningBbStartCmd) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    uint32_t bbStartDwordBuffer[sizeof(MI_BATCH_BUFFER_START) / sizeof(uint32_t)] = {0};

    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0u;
    queueDesc.index = 0u;
    queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    WhiteBox<L0::CommandQueue> *commandQueue = whiteboxCast(CommandQueue::create(productFamily,
                                                                                 device,
                                                                                 neoDevice->getDefaultEngine().commandStreamReceiver,
                                                                                 &queueDesc,
                                                                                 false,
                                                                                 false,
                                                                                 false,
                                                                                 returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    ze_command_list_handle_t commandLists[] = {commandList->toHandle()};
    commandList->close();
    uint64_t endGpuAddress = commandList->getCmdContainer().getEndCmdGpuAddress();
    uint64_t startGpuAddress = commandList->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress();

    commandQueue->setPatchingPreamble(true);

    void *queueCpuBase = commandQueue->commandStream.getCpuBase();
    uint64_t queueGpuBase = commandQueue->commandStream.getGpuBase();

    auto usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(1, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        queueCpuBase,
        usedSpaceAfter));

    GenCmdList::iterator patchCmdIterator = cmdList.end();
    size_t bbStartIdx = 0;

    auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    for (auto &sdiCmd : sdiCmds) {
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());

        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        bbStartIdx++;
        patchCmdIterator = sdiCmd;
    }

    auto bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());

    auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
    EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());

    size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
    offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

    uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;

    MI_BATCH_BUFFER_START *chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainBackBbStartCmd);
    EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());

    usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(1, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);

    cmdList.clear();
    bbStartIdx = 0;
    memset(bbStartDwordBuffer, 0, sizeof(bbStartDwordBuffer));

    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        ptrOffset(queueCpuBase, usedSpaceBefore),
        usedSpaceAfter - usedSpaceBefore));

    sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    for (auto &sdiCmd : sdiCmds) {
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());

        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        bbStartIdx++;
        patchCmdIterator = sdiCmd;
    }

    bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());

    // second BB_START command should be the one that jumps to the begin of the 1st command list
    startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
    EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());

    offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
    offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

    expectedReturnAddress = queueGpuBase + offsetToReturn;

    chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainBackBbStartCmd);
    EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());

    commandList->destroy();
    commandQueue->destroy();
}

HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenTwoCmdListsExecutedThenPatchPreambleContainsEncodingReturningAndChainingBbStartCmd) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    uint32_t bbStartDwordBuffer[sizeof(MI_BATCH_BUFFER_START) / sizeof(uint32_t)] = {0};
    uint32_t bbStartDwordBuffer2[sizeof(MI_BATCH_BUFFER_START) / sizeof(uint32_t)] = {0};

    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0u;
    queueDesc.index = 0u;
    queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    WhiteBox<L0::CommandQueue> *commandQueue = whiteboxCast(CommandQueue::create(productFamily,
                                                                                 device,
                                                                                 neoDevice->getDefaultEngine().commandStreamReceiver,
                                                                                 &queueDesc,
                                                                                 false,
                                                                                 false,
                                                                                 false,
                                                                                 returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto commandList2 = CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    commandList->getRequiredStreamState().stateBaseAddress.surfaceStateBaseAddress.set(0x1000);
    commandList2->getRequiredStreamState().stateBaseAddress.surfaceStateBaseAddress.set(0x2000);

    ze_command_list_handle_t commandLists[] = {commandList->toHandle(), commandList2->toHandle()};
    commandList->close();
    commandList2->close();

    uint64_t startGpuAddress = commandList->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress();
    uint64_t chainedGpuAddress = commandList->getCmdContainer().getEndCmdGpuAddress();
    uint64_t start2GpuAddress = commandList2->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress();
    uint64_t endGpuAddress = commandList2->getCmdContainer().getEndCmdGpuAddress();

    commandQueue->setPatchingPreamble(true);

    void *queueCpuBase = commandQueue->commandStream.getCpuBase();
    uint64_t queueGpuBase = commandQueue->commandStream.getGpuBase();

    auto usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(2, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    auto usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        queueCpuBase,
        usedSpaceAfter));

    GenCmdList::iterator patchCmdIterator = cmdList.end();
    size_t bbStartIdx = 0;

    auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    size_t sdiSizeHalf = sdiCmds.size() / 2;
    for (uint32_t i = 0; i < sdiSizeHalf; i++) {
        auto &sdiCmd = sdiCmds[i];
        auto &sdiCmd2 = sdiCmds[i + sdiSizeHalf];
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        auto storeDataImm2 = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd2);

        EXPECT_EQ(chainedGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm2->getAddress());

        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        bbStartDwordBuffer2[2 * bbStartIdx] = storeDataImm2->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        if (storeDataImm2->getStoreQword()) {
            bbStartDwordBuffer2[2 * bbStartIdx + 1] = storeDataImm2->getDataDword1();
        }

        bbStartIdx++;
        patchCmdIterator = sdiCmd2;
    }

    MI_BATCH_BUFFER_START *chainLinkBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainLinkBbStartCmd);

    MI_BATCH_BUFFER_START *chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer2);
    ASSERT_NE(nullptr, chainBackBbStartCmd);

    auto bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());

    // single bb start means two command lists are chained directly
    // two bb starts means command lists are chained to queue as SBA tracking makes dynamic preamble to reprogram SBA difference between lists
    if (bbStarts.size() == 1u) {
        EXPECT_EQ(start2GpuAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());

        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());

        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;

        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    } else {
        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());

        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());

        auto dynamicPremableStartingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[1]);
        EXPECT_EQ(start2GpuAddress, dynamicPremableStartingBbStart->getBatchBufferStartAddress());

        offsetToReturn = ptrDiff(dynamicPremableStartingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

        expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    }

    usedSpaceBefore = commandQueue->commandStream.getUsed();
    returnValue = commandQueue->executeCommandLists(2, commandLists, nullptr, true, nullptr, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    usedSpaceAfter = commandQueue->commandStream.getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);

    cmdList.clear();
    bbStartIdx = 0;
    memset(bbStartDwordBuffer, 0, sizeof(bbStartDwordBuffer));
    memset(bbStartDwordBuffer2, 0, sizeof(bbStartDwordBuffer2));

    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        ptrOffset(queueCpuBase, usedSpaceBefore),
        usedSpaceAfter - usedSpaceBefore));

    sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(0u, sdiCmds.size());
    sdiSizeHalf = sdiCmds.size() / 2;
    for (uint32_t i = 0; i < sdiSizeHalf; i++) {
        auto &sdiCmd = sdiCmds[i];
        auto &sdiCmd2 = sdiCmds[i + sdiSizeHalf];
        auto storeDataImm = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd);
        auto storeDataImm2 = reinterpret_cast<MI_STORE_DATA_IMM *>(*sdiCmd2);

        EXPECT_EQ(chainedGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm->getAddress());
        EXPECT_EQ(endGpuAddress + bbStartIdx * sizeof(uint64_t), storeDataImm2->getAddress());

        bbStartDwordBuffer[2 * bbStartIdx] = storeDataImm->getDataDword0();
        bbStartDwordBuffer2[2 * bbStartIdx] = storeDataImm2->getDataDword0();
        if (storeDataImm->getStoreQword()) {
            bbStartDwordBuffer[2 * bbStartIdx + 1] = storeDataImm->getDataDword1();
        }
        if (storeDataImm2->getStoreQword()) {
            bbStartDwordBuffer2[2 * bbStartIdx + 1] = storeDataImm2->getDataDword1();
        }

        bbStartIdx++;
        patchCmdIterator = sdiCmd2;
    }

    chainLinkBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer);
    ASSERT_NE(nullptr, chainLinkBbStartCmd);

    chainBackBbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartDwordBuffer2);
    ASSERT_NE(nullptr, chainBackBbStartCmd);

    bbStarts = findAll<MI_BATCH_BUFFER_START *>(patchCmdIterator, cmdList.end());
    ASSERT_NE(0u, bbStarts.size());

    // single bb start means two command lists are chained directly
    // two bb starts means command lists are chained to queue as SBA tracking makes dynamic preamble to reprogram SBA difference between lists
    if (bbStarts.size() == 1u) {
        EXPECT_EQ(start2GpuAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());

        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());

        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;

        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    } else {
        auto startingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[0]);
        EXPECT_EQ(startGpuAddress, startingBbStart->getBatchBufferStartAddress());

        size_t offsetToReturn = ptrDiff(startingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

        uint64_t expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainLinkBbStartCmd->getBatchBufferStartAddress());

        auto dynamicPremableStartingBbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStarts[1]);
        EXPECT_EQ(start2GpuAddress, dynamicPremableStartingBbStart->getBatchBufferStartAddress());

        offsetToReturn = ptrDiff(dynamicPremableStartingBbStart, queueCpuBase);
        offsetToReturn += sizeof(MI_BATCH_BUFFER_START);

        expectedReturnAddress = queueGpuBase + offsetToReturn;
        EXPECT_EQ(expectedReturnAddress, chainBackBbStartCmd->getBatchBufferStartAddress());
    }

    commandList->destroy();
    commandList2->destroy();
    commandQueue->destroy();
}

} // namespace ult
} // namespace L0