Add cross regular and intermediate command lists base address state transitions

- updates coming from regular list are updated in csr last sent variables
- all per context and per kernel transitions kept in single place
- state updates from intermediate to regular are set in csr properties
- global atomics support duplicates removed

Related-To: NEO-5055

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2023-02-02 18:57:24 +00:00
committed by Compute-Runtime-Automation
parent 140e59810f
commit bf2072c3ea
16 changed files with 363 additions and 337 deletions

View File

@@ -136,8 +136,6 @@ struct EncodeDispatchKernel {
static void adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
static constexpr bool shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent);
static size_t getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount);
static size_t getSizeRequiredSsh(const KernelInfo &kernelInfo);
inline static size_t additionalSizeRequiredDsh(uint32_t iddCount);

View File

@@ -727,9 +727,6 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
// Generic definition: the body is empty, so the interface descriptor and all
// other arguments are left untouched.
template <typename Family>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {}
// Generic definition: always returns false and never modifies currentVal,
// regardless of refVal and updateCurrent.
template <typename Family>
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent) { return false; }
template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;

View File

@@ -220,8 +220,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
}
if (shouldUpdateGlobalAtomics(container.lastSentUseGlobalAtomics, args.useGlobalAtomics, args.partitionCount > 1) ||
container.isAnyHeapDirty() ||
if (container.isAnyHeapDirty() ||
args.requiresUncachedMocs) {
PipeControlArgs syncArgs;

View File

@@ -92,6 +92,7 @@ CommandStreamReceiver::CommandStreamReceiver(ExecutionEnvironment &executionEnvi
auto &productHelper = getProductHelper();
productHelper.fillFrontEndPropertiesSupportStructure(feSupportFlags, hwInfo);
productHelper.fillPipelineSelectPropertiesSupportStructure(pipelineSupportFlags, hwInfo);
productHelper.fillStateBaseAddressPropertiesSupportStructure(sbaSupportFlags);
}
CommandStreamReceiver::~CommandStreamReceiver() {
@@ -530,8 +531,12 @@ void CommandStreamReceiver::initProgrammingFlags() {
lastSentL3Config = 0;
lastMediaSamplerConfig = -1;
lastPreemptionMode = PreemptionMode::Initial;
latestSentStatelessMocsConfig = CacheSettings::unknownMocs;
this->streamProperties.stateBaseAddress.statelessMocs = {};
lastSentUseGlobalAtomics = false;
this->streamProperties.stateBaseAddress.globalAtomics = {};
}
void CommandStreamReceiver::programForAubSubCapture(bool wasActiveInPreviousEnqueue, bool isActive) {

View File

@@ -427,6 +427,7 @@ class CommandStreamReceiver {
StreamProperties streamProperties{};
FrontEndPropertiesSupport feSupportFlags{};
PipelineSelectPropertiesSupport pipelineSupportFlags{};
StateBaseAddressPropertiesSupport sbaSupportFlags{};
uint64_t totalMemoryUsed = 0u;

View File

@@ -185,10 +185,10 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
bool checkPlatformSupportsGpuIdleImplicitFlush() const;
void configurePostSyncWriteOffset();
void unregisterDirectSubmissionFromController();
constexpr bool isGlobalAtomicsProgrammingRequired(bool currentValue) const;
void createKernelArgsBufferAllocation() override;
void handleFrontEndStateTransition(DispatchFlags &dispatchFlags);
void handlePipelineSelectStateTransition(DispatchFlags &dispatchFlags);
void handleFrontEndStateTransition(const DispatchFlags &dispatchFlags);
void handlePipelineSelectStateTransition(const DispatchFlags &dispatchFlags);
void handleStateBaseAddressStateTransition(const DispatchFlags &dispatchFlags, bool &isStateBaseAddressDirty);
HeapDirtyState dshState;
HeapDirtyState iohState;

View File

@@ -480,39 +480,11 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
auto isStateBaseAddressDirty = dshDirty || iohDirty || sshDirty || stateBaseAddressDirty;
auto mocsIndex = latestSentStatelessMocsConfig;
if (dispatchFlags.l3CacheSettings != L3CachingSettings::NotApplicable) {
auto l3On = dispatchFlags.l3CacheSettings != L3CachingSettings::l3CacheOff;
auto l1On = dispatchFlags.l3CacheSettings == L3CachingSettings::l3AndL1On;
mocsIndex = gfxCoreHelper.getMocsIndex(*device.getGmmHelper(), l3On, l1On);
}
if (mocsIndex != latestSentStatelessMocsConfig) {
isStateBaseAddressDirty = true;
latestSentStatelessMocsConfig = mocsIndex;
}
this->streamProperties.stateBaseAddress.setPropertyStatelessMocs(mocsIndex);
if (this->isGlobalAtomicsProgrammingRequired(dispatchFlags.useGlobalAtomics) && (this->isMultiOsContextCapable() || dispatchFlags.areMultipleSubDevicesInContext)) {
isStateBaseAddressDirty = true;
lastSentUseGlobalAtomics = dispatchFlags.useGlobalAtomics;
this->streamProperties.stateBaseAddress.setPropertyGlobalAtomics(lastSentUseGlobalAtomics, rootDeviceEnvironment, false);
}
handleStateBaseAddressStateTransition(dispatchFlags, isStateBaseAddressDirty);
bool debuggingEnabled = device.getDebugger() != nullptr;
bool sourceLevelDebuggerActive = device.getSourceLevelDebugger() != nullptr ? true : false;
auto memoryCompressionState = lastMemoryCompressionState;
if (dispatchFlags.memoryCompressionState != MemoryCompressionState::NotApplicable) {
memoryCompressionState = dispatchFlags.memoryCompressionState;
}
if (memoryCompressionState != lastMemoryCompressionState) {
isStateBaseAddressDirty = true;
lastMemoryCompressionState = memoryCompressionState;
}
// Reprogram state base address if required
if (isStateBaseAddressDirty || sourceLevelDebuggerActive) {
EncodeWA<GfxFamily>::addPipeControlBeforeStateBaseAddress(commandStreamCSR, rootDeviceEnvironment, isRcs(), this->dcFlushSupport);
@@ -546,13 +518,13 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
ssh, // ssh
device.getGmmHelper(), // gmmHelper
&hwInfo, // hwInfo
mocsIndex, // statelessMocsIndex
memoryCompressionState, // memoryCompressionState
this->latestSentStatelessMocsConfig, // statelessMocsIndex
this->lastMemoryCompressionState, // memoryCompressionState
true, // setInstructionStateBaseAddress
true, // setGeneralStateBaseAddress
false, // useGlobalHeapsBaseAddress
isMultiOsContextCapable(), // isMultiOsContextCapable
dispatchFlags.useGlobalAtomics, // useGlobalAtomics
this->lastSentUseGlobalAtomics, // useGlobalAtomics
dispatchFlags.areMultipleSubDevicesInContext, // areMultipleSubDevicesInContext
false, // overrideSurfaceStateBaseAddress
debuggingEnabled || device.isDebuggerActive() // isDebuggerActive
@@ -1625,11 +1597,6 @@ size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForComputeMode() {
return EncodeComputeMode<GfxFamily>::getCmdSizeForComputeMode(this->peekRootDeviceEnvironment(), hasSharedHandles(), isRcs());
}
// Generic definition: always returns false; currentVal is ignored.
template <typename GfxFamily>
constexpr bool CommandStreamReceiverHw<GfxFamily>::isGlobalAtomicsProgrammingRequired(bool currentVal) const {
return false;
}
// Generic definition: the body is empty, so no allocation is created here.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::createKernelArgsBufferAllocation() {
}
@@ -1640,7 +1607,7 @@ SubmissionStatus CommandStreamReceiverHw<GfxFamily>::initializeDeviceWithFirstSu
}
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleFrontEndStateTransition(DispatchFlags &dispatchFlags) {
void CommandStreamReceiverHw<GfxFamily>::handleFrontEndStateTransition(const DispatchFlags &dispatchFlags) {
if (streamProperties.frontEndState.disableOverdispatch.value != -1) {
lastAdditionalKernelExecInfo = streamProperties.frontEndState.disableOverdispatch.value == 1 ? AdditionalKernelExecInfo::DisableOverdispatch : AdditionalKernelExecInfo::NotSet;
}
@@ -1665,7 +1632,7 @@ void CommandStreamReceiverHw<GfxFamily>::handleFrontEndStateTransition(DispatchF
}
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handlePipelineSelectStateTransition(DispatchFlags &dispatchFlags) {
void CommandStreamReceiverHw<GfxFamily>::handlePipelineSelectStateTransition(const DispatchFlags &dispatchFlags) {
if (streamProperties.pipelineSelect.mediaSamplerDopClockGate.value != -1) {
this->lastMediaSamplerConfig = static_cast<int8_t>(streamProperties.pipelineSelect.mediaSamplerDopClockGate.value);
}
@@ -1685,4 +1652,48 @@ bool CommandStreamReceiverHw<GfxFamily>::directSubmissionRelaxedOrderingEnabled(
(blitterDirectSubmission.get() && blitterDirectSubmission->isRelaxedOrderingEnabled()));
}
// Reconciles the state-base-address "last sent" members of this receiver with
// the values recorded in streamProperties.stateBaseAddress and with the
// requests carried by dispatchFlags.
//
//   dispatchFlags           - read for l3CacheSettings, memoryCompressionState,
//                             useGlobalAtomics and areMultipleSubDevicesInContext.
//   isStateBaseAddressDirty - set to true whenever one of the tracked values
//                             changes; it is never cleared by this function.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleStateBaseAddressStateTransition(const DispatchFlags &dispatchFlags, bool &isStateBaseAddressDirty) {
    auto &rootDeviceEnv = this->peekRootDeviceEnvironment();
    auto &sbaProperties = this->streamProperties.stateBaseAddress;

    // --- Stateless MOCS ---
    // A value already recorded in the stream properties (anything but -1)
    // replaces the cached "last sent" value before the comparison below.
    const auto recordedMocs = sbaProperties.statelessMocs.value;
    if (recordedMocs != -1) {
        this->latestSentStatelessMocsConfig = static_cast<uint32_t>(recordedMocs);
    }

    auto requestedMocs = this->latestSentStatelessMocsConfig;
    const auto l3Settings = dispatchFlags.l3CacheSettings;
    if (l3Settings != L3CachingSettings::NotApplicable) {
        const auto l3Enabled = l3Settings != L3CachingSettings::l3CacheOff;
        const auto l1Enabled = l3Settings == L3CachingSettings::l3AndL1On;
        auto &coreHelper = getGfxCoreHelper();
        requestedMocs = coreHelper.getMocsIndex(*rootDeviceEnv.getGmmHelper(), l3Enabled, l1Enabled);
    }

    const bool mocsChanged = (requestedMocs != this->latestSentStatelessMocsConfig);
    if (mocsChanged) {
        isStateBaseAddressDirty = true;
        this->latestSentStatelessMocsConfig = requestedMocs;
    }
    sbaProperties.setPropertyStatelessMocs(requestedMocs);

    // --- Memory compression ---
    // The dispatch request wins unless it is NotApplicable, in which case the
    // previously sent state is kept.
    const auto requestedCompression = (dispatchFlags.memoryCompressionState != MemoryCompressionState::NotApplicable)
                                          ? dispatchFlags.memoryCompressionState
                                          : this->lastMemoryCompressionState;
    if (requestedCompression != this->lastMemoryCompressionState) {
        isStateBaseAddressDirty = true;
        this->lastMemoryCompressionState = requestedCompression;
    }

    // --- Global atomics ---
    // Tracked only when the support flags report the property as supported.
    if (!this->sbaSupportFlags.globalAtomics) {
        return;
    }

    const auto recordedGlobalAtomics = sbaProperties.globalAtomics.value;
    if (recordedGlobalAtomics != -1) {
        this->lastSentUseGlobalAtomics = (recordedGlobalAtomics != 0);
    }

    // Global atomics are requested only for a multi-OS-context capable receiver
    // or when the dispatch reports multiple sub-devices in the context.
    const bool multiDeviceContext = this->isMultiOsContextCapable() || dispatchFlags.areMultipleSubDevicesInContext;
    const bool requestedGlobalAtomics = multiDeviceContext && dispatchFlags.useGlobalAtomics;
    if (requestedGlobalAtomics != this->lastSentUseGlobalAtomics) {
        isStateBaseAddressDirty = true;
        this->lastSentUseGlobalAtomics = requestedGlobalAtomics;
    }
    sbaProperties.setPropertyGlobalAtomics(requestedGlobalAtomics, rootDeviceEnv, false);
}
} // namespace NEO

View File

@@ -73,15 +73,6 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
}
}
// Specialization: when predicate is set and currentVal differs from refVal,
// stores refVal into currentVal and returns true; otherwise leaves currentVal
// unchanged and returns false.
template <>
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool predicate) {
    const bool updateNeeded = predicate && (currentVal != refVal);
    if (!updateNeeded) {
        return false;
    }
    currentVal = refVal;
    return true;
}
// Specialization whose body is a single empty statement: l3ControlBuffer is
// neither read nor written.
template <>
void adjustL3ControlField<Family>(void *l3ControlBuffer) { ; }

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -71,11 +71,6 @@ size_t CommandStreamReceiverHw<Family>::getCmdSizeForPerDssBackedBuffer(const Ha
// Specialization with an empty body: nothing is added to commandStream and
// dispatchFlags is not touched.
template <>
void CommandStreamReceiverHw<Family>::addPipeControlBefore3dState(LinearStream &commandStream, DispatchFlags &dispatchFlags) {}
// Specialization: returns true when currentValue differs from the
// lastSentUseGlobalAtomics member; does not modify any state.
template <>
constexpr bool CommandStreamReceiverHw<Family>::isGlobalAtomicsProgrammingRequired(bool currentValue) const {
return currentValue != this->lastSentUseGlobalAtomics;
}
template <>
void BlitCommandsHelper<Family>::appendClearColor(const BlitProperties &blitProperties, typename Family::XY_BLOCK_COPY_BLT &blitCmd) {
using XY_BLOCK_COPY_BLT = typename Family::XY_BLOCK_COPY_BLT;

View File

@@ -53,12 +53,10 @@ struct XeHpCore {
// Compile-time flags for state base address properties; every flag listed
// here is true.
// NOTE(review): presumably these feed StateBaseAddressPropertiesSupport via the
// product helper - confirm against the code that reads them.
struct StateBaseAddressStateSupport {
static constexpr bool globalAtomics = true;
static constexpr bool statelessMocs = true;
static constexpr bool bindingTablePoolBaseAddress = true;
};
// Compile-time flags for pipeline select properties; every flag listed here
// is true.
// NOTE(review): presumably these feed PipelineSelectPropertiesSupport via the
// product helper - confirm against the code that reads them.
struct PipelineSelectStateSupport {
static constexpr bool modeSelected = true;
static constexpr bool mediaSamplerDopClockGate = true;
static constexpr bool systolicMode = true;
};