[perf] add selective properties update for one-time and multi-time properties

Related-To: NEO-5055

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-03-10 21:51:52 +00:00 committed by Compute-Runtime-Automation
parent 38e50007f7
commit ef12312672
7 changed files with 97 additions and 45 deletions

View File

@ -182,6 +182,7 @@ struct CommandQueueHw : public CommandQueueImp {
const NEO::StreamProperties &cmdListFinal);
inline size_t estimateScmCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrStateCopy,
bool &scmStateDirty,
const NEO::StreamProperties &cmdListRequired,
const NEO::StreamProperties &cmdListFinal);
inline void programRequiredStateComputeModeForCommandList(CommandList *commandList,

View File

@ -304,7 +304,11 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListFrontEndIfDirty(
ctx.cmdListBeginState.frontEndState = {};
if (frontEndTrackingEnabled()) {
csrState.frontEndState.copyPropertiesAll(cmdListRequired.frontEndState);
if (shouldProgramVfe) {
csrState.frontEndState.copyPropertiesAll(cmdListRequired.frontEndState);
} else {
csrState.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(cmdListRequired.frontEndState);
}
csrState.frontEndState.setPropertySingleSliceDispatchCcsMode(ctx.engineInstanced);
shouldProgramVfe |= csrState.frontEndState.isDirty();
@ -314,7 +318,11 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListFrontEndIfDirty(
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, cmdStream, csrState);
if (frontEndTrackingEnabled()) {
csrState.frontEndState.copyPropertiesAll(cmdListFinal.frontEndState);
if (shouldProgramVfe) {
csrState.frontEndState.copyPropertiesAll(cmdListFinal.frontEndState);
} else {
csrState.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(cmdListFinal.frontEndState);
}
}
}
@ -381,18 +389,28 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandL
auto singleFrontEndCmdSize = estimateFrontEndCmdSize();
size_t estimatedSize = 0;
csrStateCopy.frontEndState.copyPropertiesAll(cmdListRequired.frontEndState);
if (isFrontEndStateDirty) {
csrStateCopy.frontEndState.copyPropertiesAll(cmdListRequired.frontEndState);
} else {
csrStateCopy.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(cmdListRequired.frontEndState);
}
csrStateCopy.frontEndState.setPropertySingleSliceDispatchCcsMode(engineInstanced);
if (isFrontEndStateDirty || csrStateCopy.frontEndState.isDirty()) {
estimatedSize += singleFrontEndCmdSize;
isFrontEndStateDirty = false;
}
if (this->frontEndStateTracking) {
uint32_t frontEndChanges = commandList->getReturnPointsSize();
estimatedSize += (frontEndChanges * singleFrontEndCmdSize);
estimatedSize += (frontEndChanges * NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize());
}
csrStateCopy.frontEndState.copyPropertiesAll(cmdListFinal.frontEndState);
if (isFrontEndStateDirty) {
csrStateCopy.frontEndState.copyPropertiesAll(cmdListFinal.frontEndState);
isFrontEndStateDirty = false;
} else {
csrStateCopy.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(cmdListFinal.frontEndState);
}
return estimatedSize;
}
@ -662,6 +680,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
bool frontEndStateDirtyCopy = ctx.frontEndStateDirty;
bool gpgpuEnabledCopy = this->csr->getPreambleSetFlag();
bool baseAdresStateDirtyCopy = ctx.gsbaStateDirty;
bool scmStateDirtyCopy = this->csr->getStateComputeModeDirty();
for (uint32_t i = 0; i < numCommandLists; i++) {
auto cmdList = CommandList::fromHandle(phCommandLists[i]);
auto &requiredStreamState = cmdList->getRequiredStreamState();
@ -670,7 +689,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirtyCopy, ctx.engineInstanced, cmdList,
streamPropertiesCopy, requiredStreamState, finalStreamState);
linearStreamSizeEstimate += estimatePipelineSelectCmdSizeForMultipleCommandLists(streamPropertiesCopy, requiredStreamState, finalStreamState, gpgpuEnabledCopy);
linearStreamSizeEstimate += estimateScmCmdSizeForMultipleCommandLists(streamPropertiesCopy, requiredStreamState, finalStreamState);
linearStreamSizeEstimate += estimateScmCmdSizeForMultipleCommandLists(streamPropertiesCopy, scmStateDirtyCopy, requiredStreamState, finalStreamState);
linearStreamSizeEstimate += estimateStateBaseAddressCmdSizeForMultipleCommandLists(baseAdresStateDirtyCopy, cmdList->getCmdListHeapAddressModel(), streamPropertiesCopy, requiredStreamState, finalStreamState);
}
}
@ -1148,16 +1167,24 @@ size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelectCmdSizeForMultipleCo
return 0;
}
size_t singlePipelineSelectSize = NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect(device->getNEODevice()->getRootDeviceEnvironment());
size_t estimatedSize = 0;
csrStateCopy.pipelineSelect.copyPropertiesAll(cmdListRequired.pipelineSelect);
if (!gpgpuEnabled || csrStateCopy.pipelineSelect.isDirty()) {
estimatedSize += singlePipelineSelectSize;
gpgpuEnabled = true;
if (!gpgpuEnabled) {
csrStateCopy.pipelineSelect.copyPropertiesAll(cmdListRequired.pipelineSelect);
} else {
csrStateCopy.pipelineSelect.copyPropertiesSystolicMode(cmdListRequired.pipelineSelect);
}
csrStateCopy.pipelineSelect.copyPropertiesAll(cmdListFinal.pipelineSelect);
if (!gpgpuEnabled || csrStateCopy.pipelineSelect.isDirty()) {
estimatedSize += NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect(device->getNEODevice()->getRootDeviceEnvironment());
}
if (!gpgpuEnabled) {
csrStateCopy.pipelineSelect.copyPropertiesAll(cmdListFinal.pipelineSelect);
gpgpuEnabled = true;
} else {
csrStateCopy.pipelineSelect.copyPropertiesSystolicMode(cmdListFinal.pipelineSelect);
}
return estimatedSize;
}
@ -1170,7 +1197,11 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListPipelineSelect(CommandList
}
bool preambleSet = csr->getPreambleSetFlag();
csrState.pipelineSelect.copyPropertiesAll(cmdListRequired.pipelineSelect);
if (!preambleSet) {
csrState.pipelineSelect.copyPropertiesAll(cmdListRequired.pipelineSelect);
} else {
csrState.pipelineSelect.copyPropertiesSystolicMode(cmdListRequired.pipelineSelect);
}
if (!preambleSet || csrState.pipelineSelect.isDirty()) {
bool systolic = csrState.pipelineSelect.systolicMode.value == 1 ? true : false;
@ -1184,11 +1215,16 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListPipelineSelect(CommandList
csr->setPreambleSetFlag(true);
}
csrState.pipelineSelect.copyPropertiesAll(cmdListFinal.pipelineSelect);
if (!preambleSet) {
csrState.pipelineSelect.copyPropertiesAll(cmdListFinal.pipelineSelect);
} else {
csrState.pipelineSelect.copyPropertiesSystolicMode(cmdListFinal.pipelineSelect);
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateScmCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrStateCopy,
bool &scmStateDirty,
const NEO::StreamProperties &cmdListRequired,
const NEO::StreamProperties &cmdListFinal) {
if (!this->stateComputeModeTracking) {
@ -1199,11 +1235,22 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateScmCmdSizeForMultipleCommandLists(
bool isRcs = this->getCsr()->isRcs();
csrStateCopy.stateComputeMode.copyPropertiesAll(cmdListRequired.stateComputeMode);
if (scmStateDirty) {
csrStateCopy.stateComputeMode.copyPropertiesAll(cmdListRequired.stateComputeMode);
} else {
csrStateCopy.stateComputeMode.copyPropertiesGrfNumberThreadArbitration(cmdListRequired.stateComputeMode);
}
if (csrStateCopy.stateComputeMode.isDirty()) {
estimatedSize = NEO::EncodeComputeMode<GfxFamily>::getCmdSizeForComputeMode(device->getNEODevice()->getRootDeviceEnvironment(), false, isRcs);
}
csrStateCopy.stateComputeMode.copyPropertiesAll(cmdListFinal.stateComputeMode);
if (scmStateDirty) {
csrStateCopy.stateComputeMode.copyPropertiesAll(cmdListFinal.stateComputeMode);
scmStateDirty = false;
} else {
csrStateCopy.stateComputeMode.copyPropertiesGrfNumberThreadArbitration(cmdListFinal.stateComputeMode);
}
return estimatedSize;
}
@ -1218,7 +1265,12 @@ void CommandQueueHw<gfxCoreFamily>::programRequiredStateComputeModeForCommandLis
return;
}
csrState.stateComputeMode.copyPropertiesAll(cmdListRequired.stateComputeMode);
bool scmCsrDirty = this->csr->getStateComputeModeDirty();
if (scmCsrDirty) {
csrState.stateComputeMode.copyPropertiesAll(cmdListRequired.stateComputeMode);
} else {
csrState.stateComputeMode.copyPropertiesGrfNumberThreadArbitration(cmdListRequired.stateComputeMode);
}
if (csrState.stateComputeMode.isDirty()) {
NEO::PipelineSelectArgs pipelineSelectArgs = {
@ -1232,7 +1284,12 @@ void CommandQueueHw<gfxCoreFamily>::programRequiredStateComputeModeForCommandLis
false, device->getNEODevice()->getRootDeviceEnvironment(), isRcs, this->getCsr()->getDcFlushSupport(), nullptr);
this->csr->setStateComputeModeDirty(false);
}
csrState.stateComputeMode.copyPropertiesAll(cmdListFinal.stateComputeMode);
if (scmCsrDirty) {
csrState.stateComputeMode.copyPropertiesAll(cmdListFinal.stateComputeMode);
} else {
csrState.stateComputeMode.copyPropertiesGrfNumberThreadArbitration(cmdListFinal.stateComputeMode);
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -1263,9 +1320,8 @@ void CommandQueueHw<gfxCoreFamily>::programRequiredStateBaseAddressForGlobalStat
const NEO::StreamProperties &cmdListFinal) {
auto globalStatelessHeap = this->csr->getGlobalStatelessHeap();
csrState.stateBaseAddress.copyPropertiesAll(cmdListRequired.stateBaseAddress);
csrState.stateBaseAddress.setPropertiesBindingTableSurfaceState(NEO::StreamProperty64::initValue, NEO::StreamPropertySizeT::initValue,
globalStatelessHeap->getHeapGpuBase(), globalStatelessHeap->getHeapSizeInPages());
csrState.stateBaseAddress.copyPropertiesStatelessMocsIndirectState(cmdListRequired.stateBaseAddress);
csrState.stateBaseAddress.setPropertiesSurfaceState(globalStatelessHeap->getHeapGpuBase(), globalStatelessHeap->getHeapSizeInPages());
if (ctx.gsbaStateDirty || csrState.stateBaseAddress.isDirty()) {
programStateBaseAddress(ctx.scratchGsba,
@ -1277,7 +1333,7 @@ void CommandQueueHw<gfxCoreFamily>::programRequiredStateBaseAddressForGlobalStat
ctx.gsbaStateDirty = false;
}
csrState.stateBaseAddress.copyPropertiesAll(cmdListFinal.stateBaseAddress);
csrState.stateBaseAddress.copyPropertiesStatelessMocs(cmdListFinal.stateBaseAddress);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -1356,15 +1412,14 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStateBaseAddressCmdSizeForGlobalSt
size_t estimatedSize = 0;
csrStateCopy.stateBaseAddress.copyPropertiesAll(cmdListRequired.stateBaseAddress);
csrStateCopy.stateBaseAddress.setPropertiesBindingTableSurfaceState(NEO::StreamProperty64::initValue, NEO::StreamPropertySizeT::initValue,
globalStatelessHeap->getHeapGpuBase(), globalStatelessHeap->getHeapSizeInPages());
csrStateCopy.stateBaseAddress.copyPropertiesStatelessMocsIndirectState(cmdListRequired.stateBaseAddress);
csrStateCopy.stateBaseAddress.setPropertiesSurfaceState(globalStatelessHeap->getHeapGpuBase(), globalStatelessHeap->getHeapSizeInPages());
if (baseAddressStateDirty || csrStateCopy.stateBaseAddress.isDirty()) {
bool useBtiCommand = csrStateCopy.stateBaseAddress.bindingTablePoolBaseAddress.value != NEO::StreamProperty64::initValue;
estimatedSize = estimateStateBaseAddressCmdDispatchSize(useBtiCommand);
baseAddressStateDirty = false;
}
csrStateCopy.stateBaseAddress.copyPropertiesAll(cmdListFinal.stateBaseAddress);
csrStateCopy.stateBaseAddress.copyPropertiesStatelessMocs(cmdListFinal.stateBaseAddress);
return estimatedSize;
}

View File

@ -435,8 +435,8 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
}
programHardwareContext(commandStreamCSR);
programComputeMode(commandStreamCSR, dispatchFlags, hwInfo);
programPipelineSelect(commandStreamCSR, dispatchFlags.pipelineSelectArgs);
programComputeMode(commandStreamCSR, dispatchFlags, hwInfo);
programL3(commandStreamCSR, newL3Config);
programPreamble(commandStreamCSR, device, newL3Config);
programMediaSampler(commandStreamCSR, dispatchFlags);
@ -568,6 +568,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
device.getDeviceInfo().imageSupport);
}
setGSBAStateDirty(false);
this->streamProperties.stateBaseAddress.clearIsDirty();
}
addPipeControlBeforeStateSip(commandStreamCSR, device);
@ -809,6 +810,7 @@ void CommandStreamReceiverHw<GfxFamily>::programComputeMode(LinearStream &stream
stream, this->streamProperties.stateComputeMode, dispatchFlags.pipelineSelectArgs,
hasSharedHandles(), this->peekRootDeviceEnvironment(), isRcs(), this->dcFlushSupport, logicalStateHelper.get());
this->setStateComputeModeDirty(false);
this->streamProperties.stateComputeMode.clearIsDirty();
}
}
@ -1106,7 +1108,7 @@ inline void CommandStreamReceiverHw<GfxFamily>::programVFEState(LinearStream &cs
auto isCooperative = dispatchFlags.kernelExecutionType == KernelExecutionType::Concurrent;
auto disableOverdispatch = (dispatchFlags.additionalKernelExecInfo != AdditionalKernelExecInfo::NotSet);
streamProperties.frontEndState.setPropertiesAll(isCooperative, dispatchFlags.disableEUFusion, disableOverdispatch, osContext->isEngineInstanced());
this->streamProperties.frontEndState.setPropertiesAll(isCooperative, dispatchFlags.disableEUFusion, disableOverdispatch, osContext->isEngineInstanced());
auto &gfxCoreHelper = getGfxCoreHelper();
auto engineGroupType = gfxCoreHelper.getEngineGroupType(getOsContext().getEngineType(), getOsContext().getEngineUsage(), hwInfo);
@ -1120,6 +1122,7 @@ inline void CommandStreamReceiverHw<GfxFamily>::programVFEState(LinearStream &cs
flatBatchBufferHelper->collectScratchSpacePatchInfo(getScratchPatchAddress(), commandOffset, csr);
}
setMediaVFEStateDirty(false);
this->streamProperties.frontEndState.clearIsDirty();
}
}

View File

@ -61,6 +61,7 @@ void CommandStreamReceiverHw<GfxFamily>::programPipelineSelect(LinearStream &com
this->lastMediaSamplerConfig = pipelineSelectArgs.mediaSamplerRequired;
this->lastSystolicPipelineSelectMode = pipelineSelectArgs.systolicPipelineSelectMode;
this->streamProperties.pipelineSelect.setPropertiesAll(true, this->lastMediaSamplerConfig, this->lastSystolicPipelineSelectMode);
this->streamProperties.pipelineSelect.clearIsDirty();
}
}

View File

@ -45,6 +45,7 @@ void CommandStreamReceiverHw<GfxFamily>::programPipelineSelect(LinearStream &com
this->lastMediaSamplerConfig = pipelineSelectArgs.mediaSamplerRequired;
this->lastSystolicPipelineSelectMode = pipelineSelectArgs.systolicPipelineSelectMode;
this->streamProperties.pipelineSelect.setPropertiesAll(true, this->lastMediaSamplerConfig, this->lastSystolicPipelineSelectMode);
this->streamProperties.pipelineSelect.clearIsDirty();
}
}

View File

@ -39,9 +39,9 @@ struct StateComputeModeProperties {
void copyPropertiesGrfNumberThreadArbitration(const StateComputeModeProperties &properties);
bool isDirty() const;
void clearIsDirty();
protected:
void clearIsDirty();
void clearIsDirtyExtraPerContext();
void clearIsDirtyExtraPerKernel();
bool isDirtyExtra() const;
@ -85,10 +85,9 @@ struct FrontEndProperties {
void copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(const FrontEndProperties &properties);
bool isDirty() const;
protected:
void clearIsDirty();
protected:
FrontEndPropertiesSupport frontEndPropertiesSupport = {};
bool propertiesSupportLoaded = false;
};
@ -113,10 +112,9 @@ struct PipelineSelectProperties {
void copyPropertiesSystolicMode(const PipelineSelectProperties &properties);
bool isDirty() const;
protected:
void clearIsDirty();
protected:
PipelineSelectPropertiesSupport pipelineSelectPropertiesSupport = {};
bool propertiesSupportLoaded = false;
};
@ -158,10 +156,9 @@ struct StateBaseAddressProperties {
void copyPropertiesStatelessMocsIndirectState(const StateBaseAddressProperties &properties);
bool isDirty() const;
protected:
void clearIsDirty();
protected:
StateBaseAddressPropertiesSupport stateBaseAddressPropertiesSupport = {};
bool propertiesSupportLoaded = false;
};

View File

@ -215,7 +215,7 @@ GEN12LPTEST_F(Gen12LpCoherencyRequirements, givenCoherencyRequirementWithoutShar
csr->flushTask(stream, 0, &stream, &stream, &stream, 0, flags, *device);
};
auto findCmd = [&](bool expectToBeProgrammed, bool expectCoherent, bool expectPipeControl) {
auto findCmd = [&](bool expectToBeProgrammed, bool expectCoherent) {
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(csr->commandStream, startOffset);
bool foundOne = false;
@ -230,12 +230,6 @@ GEN12LPTEST_F(Gen12LpCoherencyRequirements, givenCoherencyRequirementWithoutShar
EXPECT_EQ(expectedCoherentMask, cmd->getMaskBits());
EXPECT_FALSE(foundOne);
foundOne = true;
auto pc = genCmdCast<PIPE_CONTROL *>(*(++it));
if (!expectPipeControl && !SpecialUltHelperGen12lp::isPipeControlWArequired(device->getHardwareInfo().platform.eProductFamily)) {
EXPECT_EQ(nullptr, pc);
} else {
EXPECT_NE(nullptr, pc);
}
}
}
EXPECT_EQ(expectToBeProgrammed, foundOne);
@ -243,13 +237,13 @@ GEN12LPTEST_F(Gen12LpCoherencyRequirements, givenCoherencyRequirementWithoutShar
flushTask(false);
if (MemorySynchronizationCommands<FamilyType>::isBarrierPriorToPipelineSelectWaRequired(device->getRootDeviceEnvironment())) {
findCmd(true, false, true); // first time
findCmd(true, false); // first time
} else {
findCmd(true, false, false); // first time
findCmd(true, false); // first time
}
flushTask(false);
findCmd(false, false, false); // not changed
findCmd(false, false); // not changed
csr->getMemoryManager()->freeGraphicsMemory(graphicAlloc);
}