[perf] add state compute mode dirty flag to allow selective properties update

- full properties update is a time-intensive task and must be done only once
- selective update can be done after initial update
- dirty flag makes it possible to tell whether the initial update has been done

Related-To: NEO-5055

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-03-10 13:20:23 +00:00 committed by Compute-Runtime-Automation
parent d93f00e075
commit 24c8f089ed
8 changed files with 50 additions and 6 deletions

View File

@ -653,9 +653,9 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
linearStreamSizeEstimate += estimatePipelineSelectCmdSize();
if (this->stateComputeModeTracking || this->pipelineSelectStateTracking || frontEndTrackingEnabled() || this->stateBaseAddressTracking) {
auto streamPropertiesCopy = csr->getStreamProperties();
auto streamPropertiesCopy = this->csr->getStreamProperties();
bool frontEndStateDirtyCopy = ctx.frontEndStateDirty;
bool gpgpuEnabledCopy = csr->getPreambleSetFlag();
bool gpgpuEnabledCopy = this->csr->getPreambleSetFlag();
bool baseAdresStateDirtyCopy = ctx.gsbaStateDirty;
for (uint32_t i = 0; i < numCommandLists; i++) {
auto cmdList = CommandList::fromHandle(phCommandLists[i]);
@ -1194,11 +1194,10 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateScmCmdSizeForMultipleCommandLists(
size_t estimatedSize = 0;
bool isRcs = this->getCsr()->isRcs();
size_t singleScmCmdSize = NEO::EncodeComputeMode<GfxFamily>::getCmdSizeForComputeMode(device->getNEODevice()->getRootDeviceEnvironment(), false, isRcs);
csrStateCopy.stateComputeMode.setProperties(cmdListRequired.stateComputeMode);
if (csrStateCopy.stateComputeMode.isDirty()) {
estimatedSize += singleScmCmdSize;
estimatedSize = NEO::EncodeComputeMode<GfxFamily>::getCmdSizeForComputeMode(device->getNEODevice()->getRootDeviceEnvironment(), false, isRcs);
}
csrStateCopy.stateComputeMode.setProperties(cmdListFinal.stateComputeMode);
@ -1227,6 +1226,7 @@ void CommandQueueHw<gfxCoreFamily>::programRequiredStateComputeModeForCommandLis
bool isRcs = this->getCsr()->isRcs();
NEO::EncodeComputeMode<GfxFamily>::programComputeModeCommandWithSynchronization(commandStream, csrState.stateComputeMode, pipelineSelectArgs,
false, device->getNEODevice()->getRootDeviceEnvironment(), isRcs, this->getCsr()->getDcFlushSupport(), nullptr);
this->csr->setStateComputeModeDirty(false);
}
csrState.stateComputeMode.setProperties(cmdListFinal.stateComputeMode);
}

View File

@ -593,6 +593,7 @@ void CmdListThreadArbitrationFixture::testBody() {
auto &commandListStream = *commandList->commandContainer.getCommandStream();
auto &cmdQueueStream = commandQueue->commandStream;
auto queueCsr = commandQueue->getCsr();
GenCmdList cmdList;
std::vector<GenCmdList::iterator> stateComputeModeList;
@ -622,11 +623,15 @@ void CmdListThreadArbitrationFixture::testBody() {
stateComputeModeList.clear();
commandList->close();
EXPECT_TRUE(queueCsr->getStateComputeModeDirty());
sizeBefore = cmdQueueStream.getUsed();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
sizeAfter = cmdQueueStream.getUsed();
EXPECT_FALSE(queueCsr->getStateComputeModeDirty());
EXPECT_EQ(NEO::ThreadArbitrationPolicy::AgeBased, csrState.stateComputeMode.threadArbitrationPolicy.value);
currentBuffer = ptrOffset(cmdQueueStream.getCpuBase(), sizeBefore);
@ -850,6 +855,7 @@ void CmdListLargeGrfFixture::testBody() {
auto &cmdlistRequiredState = commandList->getRequiredStreamState();
auto &cmdListFinalState = commandList->getFinalStreamState();
auto &csrState = commandQueue->csr->getStreamProperties();
auto queueCsr = commandQueue->getCsr();
auto commandListHandle = commandList->toHandle();
@ -884,11 +890,15 @@ void CmdListLargeGrfFixture::testBody() {
stateComputeModeList.clear();
commandList->close();
EXPECT_TRUE(queueCsr->getStateComputeModeDirty());
sizeBefore = cmdQueueStream.getUsed();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
sizeAfter = cmdQueueStream.getUsed();
EXPECT_FALSE(queueCsr->getStateComputeModeDirty());
EXPECT_EQ(0, csrState.stateComputeMode.largeGrfMode.value);
currentBuffer = ptrOffset(cmdQueueStream.getCpuBase(), sizeBefore);

View File

@ -143,6 +143,7 @@ struct UltCommandStreamReceiverTest
commandStreamReceiver.isStateSipSent = true;
commandStreamReceiver.lastPreemptionMode = pDevice->getPreemptionMode();
commandStreamReceiver.setMediaVFEStateDirty(false);
commandStreamReceiver.stateComputeModeDirty = false;
auto gmmHelper = pDevice->getGmmHelper();
auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
auto mocsIndex = gfxCoreHelper.getMocsIndex(*gmmHelper, true, isL1CacheEnabled);

View File

@ -534,6 +534,7 @@ void CommandStreamReceiver::initProgrammingFlags() {
bindingTableBaseAddressRequired = true;
mediaVfeStateDirty = true;
lastVmeSubslicesConfig = false;
stateComputeModeDirty = true;
lastSentL3Config = 0;
lastMediaSamplerConfig = -1;

View File

@ -159,10 +159,13 @@ class CommandStreamReceiver {
void overrideDispatchPolicy(DispatchMode overrideValue) { this->dispatchMode = overrideValue; }
void setMediaVFEStateDirty(bool dirty) { mediaVfeStateDirty = dirty; }
bool getMediaVFEStateDirty() { return mediaVfeStateDirty; }
bool getMediaVFEStateDirty() const { return mediaVfeStateDirty; }
void setGSBAStateDirty(bool dirty) { GSBAStateDirty = dirty; }
bool getGSBAStateDirty() { return GSBAStateDirty; }
bool getGSBAStateDirty() const { return GSBAStateDirty; }
// Stores the given value in the stateComputeModeDirty flag.
void setStateComputeModeDirty(bool dirty) { stateComputeModeDirty = dirty; }
// Returns the current value of the stateComputeModeDirty flag.
bool getStateComputeModeDirty() const { return stateComputeModeDirty; }
void setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize);
GraphicsAllocation *getScratchAllocation();
@ -507,6 +510,7 @@ class CommandStreamReceiver {
bool bindingTableBaseAddressRequired = false;
bool heapStorageRequiresRecyclingTag = false;
bool mediaVfeStateDirty = true;
bool stateComputeModeDirty = true;
bool lastVmeSubslicesConfig = false;
bool timestampPacketWriteEnabled = false;
bool staticWorkPartitioningEnabled = false;

View File

@ -804,6 +804,7 @@ void CommandStreamReceiverHw<GfxFamily>::programComputeMode(LinearStream &stream
EncodeComputeMode<GfxFamily>::programComputeModeCommandWithSynchronization(
stream, this->streamProperties.stateComputeMode, dispatchFlags.pipelineSelectArgs,
hasSharedHandles(), this->peekRootDeviceEnvironment(), isRcs(), this->dcFlushSupport, logicalStateHelper.get());
this->setStateComputeModeDirty(false);
}
}

View File

@ -124,6 +124,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired;
using BaseClass::CommandStreamReceiver::sbaSupportFlags;
using BaseClass::CommandStreamReceiver::scratchSpaceController;
using BaseClass::CommandStreamReceiver::stateComputeModeDirty;
using BaseClass::CommandStreamReceiver::submissionAggregator;
using BaseClass::CommandStreamReceiver::tagAddress;
using BaseClass::CommandStreamReceiver::taskCount;

View File

@ -198,6 +198,7 @@ HWTEST_F(CommandStreamReceiverTest, WhenCreatingCsrThenFlagsAreSetCorrectly) {
EXPECT_FALSE(csr.isPreambleSent);
EXPECT_FALSE(csr.GSBAFor32BitProgrammed);
EXPECT_TRUE(csr.mediaVfeStateDirty);
EXPECT_TRUE(csr.stateComputeModeDirty);
EXPECT_FALSE(csr.lastVmeSubslicesConfig);
EXPECT_EQ(0u, csr.lastSentL3Config);
EXPECT_EQ(-1, csr.lastMediaSamplerConfig);
@ -2999,3 +3000,28 @@ HWTEST2_F(CommandStreamReceiverHwTest,
ASSERT_NE(nullptr, hwParserCsr.cmdStateBaseAddress);
EXPECT_EQ(nullptr, hwParserCsr.cmdBindingTableBaseAddress);
}
// Verifies that a flushTask issued while the state-compute-mode dirty flag is set
// clears the flag and leaves a STATE_COMPUTE_MODE command in the CSR command stream.
// NOTE(review): IsAtLeastXeHpCore presumably limits the test to Xe HP and newer cores - confirm.
HWTEST2_F(CommandStreamReceiverHwTest,
givenStateComputeModeDirtyWhenFlushingFirstTimeThenCleanDirtyFlagToDispatchStateComputeMode,
IsAtLeastXeHpCore) {
using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE;
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
// Precondition: the flag is expected to be set before the flush.
EXPECT_TRUE(commandStreamReceiver.getStateComputeModeDirty());
commandStreamReceiver.flushTask(commandStream,
0,
&dsh,
&ioh,
&ssh,
taskLevel,
flushTaskFlags,
*pDevice);
// After the flush the flag must have been cleared.
EXPECT_FALSE(commandStreamReceiver.getStateComputeModeDirty());
// Parse the CSR's own command stream from offset 0 and look for the command.
HardwareParse hwParserCsr;
hwParserCsr.parseCommands<FamilyType>(commandStreamReceiver.commandStream, 0);
hwParserCsr.findHardwareCommands<FamilyType>();
auto scmCmd = hwParserCsr.getCommand<STATE_COMPUTE_MODE>();
EXPECT_NE(nullptr, scmCmd);
}