Change level zero command queue internal interface for front end programming

Front end estimation uses an internal loop for browsing command lists and
estimating each command list.
This refactor moves the internal loop into external execution, so the command
list browsing loop can be shared by all state commands.
This refactor - sharing the loop - will improve the performance of each added
state estimator.

Related-To: NEO-5019

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-09-20 16:46:15 +00:00
committed by Compute-Runtime-Automation
parent 43676ed02a
commit 7832195cd8
4 changed files with 84 additions and 56 deletions

View File

@@ -28,6 +28,10 @@ namespace L0 {
CommandQueueAllocatorFn commandQueueFactory[IGFX_MAX_PRODUCT] = {}; CommandQueueAllocatorFn commandQueueFactory[IGFX_MAX_PRODUCT] = {};
// Reports whether front end state is tracked per command list.
// Tracking is on when the AllowPatchingVfeStateInCommandLists debug flag is set,
// or when this queue has multiReturnPointCommandList set.
bool CommandQueue::frontEndTrackingEnabled() const {
    if (NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get()) {
        return true;
    }
    return this->multiReturnPointCommandList;
}
CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc)
: desc(*desc), device(device), csr(csr) { : desc(*desc), device(device), csr(csr) {
int overrideCmdQueueSyncMode = NEO::DebugManager.flags.OverrideCmdQueueSynchronousMode.get(); int overrideCmdQueueSyncMode = NEO::DebugManager.flags.OverrideCmdQueueSynchronousMode.get();

View File

@@ -54,6 +54,8 @@ struct CommandQueue : _ze_command_queue_handle_t {
bool peekIsCopyOnlyCommandQueue() const { return this->isCopyOnlyCommandQueue; } bool peekIsCopyOnlyCommandQueue() const { return this->isCopyOnlyCommandQueue; }
protected: protected:
bool frontEndTrackingEnabled() const;
uint32_t partitionCount = 1; uint32_t partitionCount = 1;
uint32_t activeSubDevices = 1; uint32_t activeSubDevices = 1;
bool preemptionCmdSyncProgramming = true; bool preemptionCmdSyncProgramming = true;

View File

@@ -36,9 +36,13 @@ struct CommandQueueHw : public CommandQueueImp {
size_t estimateStateBaseAddressCmdSize(); size_t estimateStateBaseAddressCmdSize();
MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream, NEO::StreamProperties &streamProperties); MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream, NEO::StreamProperties &streamProperties);
MOCKABLE_VIRTUAL size_t estimateFrontEndCmdSizeForMultipleCommandLists(bool isFrontEndStateDirty, uint32_t numCommandLists, MOCKABLE_VIRTUAL size_t estimateFrontEndCmdSizeForMultipleCommandLists(bool &isFrontEndStateDirty, int32_t engineInstanced, CommandList *commandList,
ze_command_list_handle_t *phCommandLists, int32_t engineInstanced); NEO::StreamProperties &csrStateCopy,
const NEO::StreamProperties &cmdListRequired,
const NEO::StreamProperties &cmdListFinal);
size_t estimateFrontEndCmdSize(); size_t estimateFrontEndCmdSize();
size_t estimateFrontEndCmdSize(bool isFrontEndDirty);
size_t estimatePipelineSelect(); size_t estimatePipelineSelect();
void programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &commandStream); void programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &commandStream);
@@ -141,14 +145,15 @@ struct CommandQueueHw : public CommandQueueImp {
inline void encodeKernelArgsBufferAndMakeItResident(); inline void encodeKernelArgsBufferAndMakeItResident();
inline void writeCsrStreamInlineIfLogicalStateHelperAvailable(NEO::LinearStream &commandStream); inline void writeCsrStreamInlineIfLogicalStateHelperAvailable(NEO::LinearStream &commandStream);
inline void programOneCmdListFrontEndIfDirty(CommandListExecutionContext &ctx, inline void programOneCmdListFrontEndIfDirty(CommandListExecutionContext &ctx,
CommandList *commandList, NEO::LinearStream &commandStream, NEO::StreamProperties &csrState,
NEO::LinearStream &commandStream); const NEO::StreamProperties &cmdListRequired, const NEO::StreamProperties &cmdListFinal);
inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream); inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream);
inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx);
inline void mergeOneCmdListPipelinedState(CommandList *commandList); inline void mergeOneCmdListPipelinedState(CommandList *commandList);
inline void programFrontEndAndClearDirtyFlag(bool shouldFrontEndBeProgrammed, inline void programFrontEndAndClearDirtyFlag(bool shouldFrontEndBeProgrammed,
CommandListExecutionContext &ctx, CommandListExecutionContext &ctx,
NEO::LinearStream &commandStream); NEO::LinearStream &commandStream,
NEO::StreamProperties &csrState);
inline void collectPrintfContentsFromAllCommandsLists(ze_command_list_handle_t *phCommandLists, uint32_t numCommandLists); inline void collectPrintfContentsFromAllCommandsLists(ze_command_list_handle_t *phCommandLists, uint32_t numCommandLists);
inline void migrateSharedAllocationsIfRequested(bool isMigrationRequested, ze_command_list_handle_t hCommandList); inline void migrateSharedAllocationsIfRequested(bool isMigrationRequested, ze_command_list_handle_t hCommandList);
inline void prefetchMemoryIfRequested(bool &isMemoryPrefetchRequested); inline void prefetchMemoryIfRequested(bool &isMemoryPrefetchRequested);

View File

@@ -153,9 +153,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child); this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child);
this->encodeKernelArgsBufferAndMakeItResident(); this->encodeKernelArgsBufferAndMakeItResident();
auto &csrStateProperties = csr->getStreamProperties();
bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty; bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty;
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child); this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties);
this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child); this->writeCsrStreamInlineIfLogicalStateHelperAvailable(child);
@@ -163,9 +163,12 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
for (auto i = 0u; i < numCommandLists; ++i) { for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(phCommandLists[i]); auto commandList = CommandList::fromHandle(phCommandLists[i]);
auto &requiredStreamState = commandList->getRequiredStreamState();
auto &finalStreamState = commandList->getFinalStreamState();
this->updateOneCmdListPreemptionModeAndCtxStatePreemption(ctx, commandList->getCommandListPreemptionMode(), child); this->updateOneCmdListPreemptionModeAndCtxStatePreemption(ctx, commandList->getCommandListPreemptionMode(), child);
this->updatePipelineSelectState(commandList); this->updatePipelineSelectState(commandList);
this->programOneCmdListFrontEndIfDirty(ctx, commandList, child); this->programOneCmdListFrontEndIfDirty(ctx, child, csrStateProperties, requiredStreamState, finalStreamState);
this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress()); this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress());
this->programOneCmdListBatchBufferStart(commandList, child, ctx); this->programOneCmdListBatchBufferStart(commandList, child, ctx);
@@ -278,30 +281,27 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::validateCommandListsParams(
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programOneCmdListFrontEndIfDirty( void CommandQueueHw<gfxCoreFamily>::programOneCmdListFrontEndIfDirty(
CommandListExecutionContext &ctx, CommandListExecutionContext &ctx,
CommandList *commandList, NEO::LinearStream &cmdStream,
NEO::LinearStream &cmdStream) { NEO::StreamProperties &csrState,
const NEO::StreamProperties &cmdListRequired,
const NEO::StreamProperties &cmdListFinal) {
bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get();
auto &streamProperties = this->csr->getStreamProperties();
bool shouldProgramVfe = ctx.frontEndStateDirty; bool shouldProgramVfe = ctx.frontEndStateDirty;
ctx.cmdListBeginState.frontEndState = {}; ctx.cmdListBeginState.frontEndState = {};
if (isPatchingVfeStateAllowed || this->multiReturnPointCommandList) { if (frontEndTrackingEnabled()) {
auto &requiredStreamState = commandList->getRequiredStreamState(); csrState.frontEndState.setProperties(cmdListRequired.frontEndState);
streamProperties.frontEndState.setProperties(requiredStreamState.frontEndState); csrState.frontEndState.setPropertySingleSliceDispatchCcsMode(ctx.engineInstanced, device->getHwInfo());
streamProperties.frontEndState.setPropertySingleSliceDispatchCcsMode(ctx.engineInstanced, device->getHwInfo());
shouldProgramVfe |= streamProperties.frontEndState.isDirty(); shouldProgramVfe |= csrState.frontEndState.isDirty();
} }
ctx.cmdListBeginState.frontEndState.setProperties(streamProperties.frontEndState); ctx.cmdListBeginState.frontEndState.setProperties(csrState.frontEndState);
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, cmdStream, csrState);
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, cmdStream); if (frontEndTrackingEnabled()) {
csrState.frontEndState.setProperties(cmdListFinal.frontEndState);
if (isPatchingVfeStateAllowed || this->multiReturnPointCommandList) {
auto &finalStreamState = commandList->getFinalStreamState();
streamProperties.frontEndState.setProperties(finalStreamState.frontEndState);
} }
} }
@@ -309,7 +309,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programFrontEndAndClearDirtyFlag( void CommandQueueHw<gfxCoreFamily>::programFrontEndAndClearDirtyFlag(
bool shouldFrontEndBeProgrammed, bool shouldFrontEndBeProgrammed,
CommandListExecutionContext &ctx, CommandListExecutionContext &ctx,
NEO::LinearStream &cmdStream) { NEO::LinearStream &cmdStream,
NEO::StreamProperties &csrState) {
if (!shouldFrontEndBeProgrammed) { if (!shouldFrontEndBeProgrammed) {
return; return;
@@ -318,7 +319,7 @@ void CommandQueueHw<gfxCoreFamily>::programFrontEndAndClearDirtyFlag(
programFrontEnd(scratchSpaceController->getScratchPatchAddress(), programFrontEnd(scratchSpaceController->getScratchPatchAddress(),
scratchSpaceController->getPerThreadScratchSpaceSize(), scratchSpaceController->getPerThreadScratchSpaceSize(),
cmdStream, cmdStream,
csr->getStreamProperties()); csrState);
ctx.frontEndStateDirty = false; ctx.frontEndStateDirty = false;
} }
@@ -345,36 +346,40 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSize() {
return NEO::PreambleHelper<GfxFamily>::getVFECommandsSize(); return NEO::PreambleHelper<GfxFamily>::getVFECommandsSize();
} }
// Estimates the front end command size for the case where front end tracking is disabled.
//
// Returns 0 when frontEndTrackingEnabled() is true. Otherwise returns the size of a
// single front end command when isFrontEndDirty is set, and 0 when it is not.
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSize(bool isFrontEndDirty) {
    if (frontEndTrackingEnabled()) {
        return 0;
    }
    const size_t singleFrontEndCmdSize = estimateFrontEndCmdSize();
    return isFrontEndDirty ? singleFrontEndCmdSize : 0u;
}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandLists( size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandLists(
bool isFrontEndStateDirty, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, int32_t engineInstanced) { bool &isFrontEndStateDirty, int32_t engineInstanced, CommandList *commandList,
NEO::StreamProperties &csrStateCopy,
const NEO::StreamProperties &cmdListRequired,
const NEO::StreamProperties &cmdListFinal) {
if (!frontEndTrackingEnabled()) {
return 0;
}
auto singleFrontEndCmdSize = estimateFrontEndCmdSize(); auto singleFrontEndCmdSize = estimateFrontEndCmdSize();
bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get();
if (!isPatchingVfeStateAllowed && !this->multiReturnPointCommandList) {
return isFrontEndStateDirty * singleFrontEndCmdSize;
}
auto streamPropertiesCopy = csr->getStreamProperties();
size_t estimatedSize = 0; size_t estimatedSize = 0;
for (size_t i = 0; i < numCommandLists; i++) { csrStateCopy.frontEndState.setProperties(cmdListRequired.frontEndState);
auto commandList = CommandList::fromHandle(phCommandLists[i]); csrStateCopy.frontEndState.setPropertySingleSliceDispatchCcsMode(engineInstanced, device->getHwInfo());
auto &requiredStreamState = commandList->getRequiredStreamState(); if (isFrontEndStateDirty || csrStateCopy.frontEndState.isDirty()) {
streamPropertiesCopy.frontEndState.setProperties(requiredStreamState.frontEndState); estimatedSize += singleFrontEndCmdSize;
streamPropertiesCopy.frontEndState.setPropertySingleSliceDispatchCcsMode(engineInstanced, device->getHwInfo()); isFrontEndStateDirty = false;
if (isFrontEndStateDirty || streamPropertiesCopy.frontEndState.isDirty()) {
estimatedSize += singleFrontEndCmdSize;
isFrontEndStateDirty = false;
}
if (this->multiReturnPointCommandList) {
uint32_t frontEndChanges = commandList->getReturnPointsSize();
estimatedSize += (frontEndChanges * singleFrontEndCmdSize);
estimatedSize += (frontEndChanges * NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize());
}
auto &finalStreamState = commandList->getFinalStreamState();
streamPropertiesCopy.frontEndState.setProperties(finalStreamState.frontEndState);
} }
if (this->multiReturnPointCommandList) {
uint32_t frontEndChanges = commandList->getReturnPointsSize();
estimatedSize += (frontEndChanges * singleFrontEndCmdSize);
estimatedSize += (frontEndChanges * NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize());
}
csrStateCopy.frontEndState.setProperties(cmdListFinal.frontEndState);
return estimatedSize; return estimatedSize;
} }
@@ -608,8 +613,7 @@ void CommandQueueHw<gfxCoreFamily>::setFrontEndStateProperties(CommandListExecut
auto isEngineInstanced = csr->getOsContext().isEngineInstanced(); auto isEngineInstanced = csr->getOsContext().isEngineInstanced();
auto &streamProperties = this->csr->getStreamProperties(); auto &streamProperties = this->csr->getStreamProperties();
bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); if (!frontEndTrackingEnabled()) {
if (!isPatchingVfeStateAllowed && !this->multiReturnPointCommandList) {
streamProperties.frontEndState.setProperties(ctx.anyCommandListWithCooperativeKernels, ctx.anyCommandListRequiresDisabledEUFusion, streamProperties.frontEndState.setProperties(ctx.anyCommandListWithCooperativeKernels, ctx.anyCommandListRequiresDisabledEUFusion,
disableOverdispatch, isEngineInstanced, hwInfo); disableOverdispatch, isEngineInstanced, hwInfo);
} else { } else {
@@ -641,7 +645,20 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
linearStreamSizeEstimate += estimatePipelineSelect(); linearStreamSizeEstimate += estimatePipelineSelect();
} }
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(ctx.frontEndStateDirty, numCommandLists, phCommandLists, ctx.engineInstanced); linearStreamSizeEstimate += estimateFrontEndCmdSize(ctx.frontEndStateDirty);
if (frontEndTrackingEnabled()) {
bool frontEndStateDirtyCopy = ctx.frontEndStateDirty;
auto streamPropertiesCopy = csr->getStreamProperties();
for (uint32_t i = 0; i < numCommandLists; i++) {
auto cmdList = CommandList::fromHandle(phCommandLists[i]);
auto &requiredStreamState = cmdList->getRequiredStreamState();
auto &finalStreamState = cmdList->getFinalStreamState();
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirtyCopy, ctx.engineInstanced, cmdList,
streamPropertiesCopy, requiredStreamState, finalStreamState);
}
}
if (ctx.gsbaStateDirty) { if (ctx.gsbaStateDirty) {
linearStreamSizeEstimate += estimateStateBaseAddressCmdSize(); linearStreamSizeEstimate += estimateStateBaseAddressCmdSize();
@@ -1031,11 +1048,6 @@ NEO::SubmissionStatus CommandQueueHw<gfxCoreFamily>::prepareAndSubmitBatchBuffer
ctx.anyCommandListWithCooperativeKernels); ctx.anyCommandListWithCooperativeKernels);
} }
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandQueueHw<gfxCoreFamily>::isCleanLeftoverMemoryRequired() {
return false;
}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::cleanLeftoverMemory(NEO::LinearStream &outerCommandStream, NEO::LinearStream &innerCommandStream) { void CommandQueueHw<gfxCoreFamily>::cleanLeftoverMemory(NEO::LinearStream &outerCommandStream, NEO::LinearStream &innerCommandStream) {
@@ -1109,4 +1121,9 @@ void CommandQueueHw<gfxCoreFamily>::updatePipelineSelectState(CommandList *comma
streamProperties.pipelineSelect.setProperties(finalStreamState.pipelineSelect); streamProperties.pipelineSelect.setProperties(finalStreamState.pipelineSelect);
} }
// Generic implementation: always reports that no leftover-memory cleanup is required.
// NOTE(review): presumably specific core families provide their own definition that
// returns true - confirm against the per-family sources.
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandQueueHw<gfxCoreFamily>::isCleanLeftoverMemoryRequired() {
return false;
}
} // namespace L0 } // namespace L0