Files
compute-runtime/shared/source/command_stream/command_stream_receiver_hw.h
Dominik Dabek 2b964254d6 performance: debug key for adjust ULLS on battery
ULLS controller timeout settings will be adjusted based on ac line
status and lowest queue throttle from submissions.

Lowest queue throttle is reset when controller stops ULLS.

Related-To: NEO-10800

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
2024-03-22 14:24:00 +01:00

355 lines
20 KiB
C++

/*
* Copyright (C) 2018-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/direct_submission/dispatchers/blitter_dispatcher.h"
#include "shared/source/direct_submission/dispatchers/render_dispatcher.h"
#include "shared/source/helpers/dirty_state_helpers.h"
namespace NEO {
class TagNodeBase;
template <typename GfxFamily>
class DeviceCommandStreamReceiver;
struct PipeControlArgs;
template <typename GfxFamily>
class CommandStreamReceiverHw : public CommandStreamReceiver {
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS;
struct ImmediateFlushData {
PipelineSelectArgs pipelineSelectArgs{};
size_t estimatedSize = 0;
void *endPtr = nullptr;
size_t csrStartOffset = 0;
bool pipelineSelectFullConfigurationNeeded = false;
bool pipelineSelectDirty = false;
bool frontEndFullConfigurationNeeded = false;
bool frontEndDirty = false;
bool stateComputeModeFullConfigurationNeeded = false;
bool stateComputeModeDirty = false;
bool stateBaseAddressFullConfigurationNeeded = false;
bool stateBaseAddressDirty = false;
bool contextOneTimeInit = false;
};
public:
static CommandStreamReceiver *create(ExecutionEnvironment &executionEnvironment,
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield) {
return new CommandStreamReceiverHw<GfxFamily>(executionEnvironment, rootDeviceIndex, deviceBitfield);
}
CommandStreamReceiverHw(ExecutionEnvironment &executionEnvironment,
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield);
~CommandStreamReceiverHw() override;
SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
CompletionStamp flushTask(LinearStream &commandStream, size_t commandStreamStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;
CompletionStamp flushTaskStateless(LinearStream &commandStream, size_t commandStreamStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;
void addPipeControlFlushTaskIfNeeded(LinearStream &commandStreamCSR, TaskCountType taskLevel);
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;
CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags, Device &device) override;
void forcePipeControl(NEO::LinearStream &commandStreamCSR);
bool flushBatchedSubmissions() override;
void programHardwareContext(LinearStream &cmdStream) override;
size_t getCmdsSizeForHardwareContext() const override;
static void addBatchBufferEnd(LinearStream &commandStream, void **patchLocation);
void programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool hasRelaxedOrderingDependencies);
void addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress, bool secondary);
size_t getRequiredStateBaseAddressSize(const Device &device) const;
size_t getRequiredCmdStreamSize(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdStreamSize(const DispatchBcsFlags &dispatchBcsFlags);
size_t getRequiredCmdStreamSizeAligned(const DispatchBcsFlags &dispatchBcsFlags);
size_t getRequiredCmdStreamHeaplessSize(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdStreamHeaplessSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdSizeForPreamble(Device &device) const;
size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForEpilogueCommands(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForL3Config() const;
size_t getCmdSizeForPipelineSelect() const;
size_t getCmdSizeForMediaSampler(bool mediaSamplerRequired) const;
size_t getCmdSizeForEngineMode(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForPerDssBackedBuffer(const HardwareInfo &hwInfo);
size_t getCmdSizeForActivePartitionConfig() const;
size_t getCmdSizeForStallingCommands(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForStallingNoPostSyncCommands() const;
size_t getCmdSizeForStallingPostSyncCommands() const;
size_t getCmdSizeForComputeMode();
MOCKABLE_VIRTUAL bool hasSharedHandles();
bool isPipelineSelectAlreadyProgrammed() const;
void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags, const HardwareInfo &hwInfo);
WaitStatus waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) override;
void collectStateBaseAddresPatchInfo(
uint64_t commandBufferAddress,
uint64_t commandOffset,
const LinearStream *dsh,
const LinearStream *ioh,
const LinearStream *ssh,
uint64_t generalStateBase,
bool imagesSupported);
void collectStateBaseAddresIohPatchInfo(uint64_t commandBufferAddress, uint64_t commandOffset, const LinearStream &ioh);
void resetKmdNotifyHelper(KmdNotifyHelper *newHelper);
CommandStreamReceiverType getType() const override {
return CommandStreamReceiverType::CSR_HW;
}
TaskCountType flushBcsTask(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) override;
SubmissionStatus flushTagUpdate() override;
SubmissionStatus flushMiFlushDW();
SubmissionStatus flushPipeControl(bool stateCacheFlush);
SubmissionStatus flushSmallTask(LinearStream &commandStreamTask,
size_t commandStreamStartTask);
SubmissionStatus flushHandler(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
SubmissionStatus sendRenderStateCacheFlush() override;
bool isUpdateTagFromWaitEnabled() override;
void updateTagFromWait() override;
bool isMultiOsContextCapable() const override;
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override;
bool isDirectSubmissionEnabled() const override {
return directSubmission.get() != nullptr;
}
bool isBlitterDirectSubmissionEnabled() const override {
return blitterDirectSubmission.get() != nullptr;
}
bool directSubmissionRelaxedOrderingEnabled() const override;
void stopDirectSubmission(bool blocking) override;
QueueThrottle getLastDirectSubmissionThrottle() override;
virtual bool isKmdWaitModeActive() { return true; }
bool initDirectSubmission() override;
GraphicsAllocation *getClearColorAllocation() override;
TagAllocatorBase *getTimestampPacketAllocator() override;
std::unique_ptr<TagAllocatorBase> createMultiRootDeviceTimestampPacketAllocator(const RootDeviceIndicesContainer rootDeviceIndices) override;
void postInitFlagsSetup() override;
void programActivePartitionConfig(LinearStream &csr);
void programComputeBarrierCommand(LinearStream &cmdStream) override {
programStallingNoPostSyncCommandsForBarrier(cmdStream);
}
size_t getCmdsSizeForComputeBarrierCommand() const override {
return getCmdSizeForStallingNoPostSyncCommands();
}
void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override;
SubmissionStatus initializeDeviceWithFirstSubmission(Device &device) override;
HeapDirtyState &getDshState() {
return dshState;
}
HeapDirtyState &getSshState() {
return sshState;
}
HeapDirtyState &getIohState() {
return iohState;
}
void dispatchRayTracingStateCommand(LinearStream &cmdStream, Device &device);
uint64_t getScratchPatchAddress();
SubmissionStatus programHeaplessProlog(Device &device);
void programHeaplessStateProlog(Device &device, LinearStream &commandStream);
void programStateBaseAddressHeapless(Device &device, LinearStream &commandStream);
void programComputeModeHeapless(Device &device, LinearStream &commandStream);
void handleAllocationsResidencyForflushTaskStateless(const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh);
protected:
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
void programL3(LinearStream &csr, uint32_t &newL3Config);
void programPreamble(LinearStream &csr, Device &device, uint32_t &newL3Config);
void programPipelineSelect(LinearStream &csr, PipelineSelectArgs &pipelineSelectArgs);
void programEpilogue(LinearStream &csr, Device &device, void **batchBufferEndLocation, DispatchFlags &dispatchFlags);
void programEpliogueCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programMediaSampler(LinearStream &csr, DispatchFlags &dispatchFlags);
void programPerDssBackedBuffer(LinearStream &scr, Device &device, DispatchFlags &dispatchFlags);
void programStateSip(LinearStream &cmdStream, Device &device);
void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads);
void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream);
void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired);
void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programActivePartitionConfigFlushTask(LinearStream &csr);
void programEnginePrologue(LinearStream &csr);
size_t getCmdSizeForPrologue() const;
size_t getCmdSizeForHeaplessPrologue(Device &device) const;
void handleAllocationsResidencyForHeaplessProlog(LinearStream &linearStream, Device &device);
void setClearSlmWorkAroundParameter(PipeControlArgs &args);
void addPipeControlBeforeStateSip(LinearStream &commandStream, Device &device);
void addPipeControlBefore3dState(LinearStream &commandStream, DispatchFlags &dispatchFlags);
bool are4GbHeapsAvailable() const;
void createScratchSpaceController();
bool detectInitProgrammingFlagsRequired(const DispatchFlags &dispatchFlags) const;
bool checkPlatformSupportsNewResourceImplicitFlush() const;
bool checkPlatformSupportsGpuIdleImplicitFlush() const;
void configurePostSyncWriteOffset();
void unregisterDirectSubmissionFromController();
void handleFrontEndStateTransition(const DispatchFlags &dispatchFlags);
void handlePipelineSelectStateTransition(const DispatchFlags &dispatchFlags);
void handleStateBaseAddressStateTransition(const DispatchFlags &dispatchFlags, bool &isStateBaseAddressDirty);
void updateStreamTaskCount(LinearStream &stream, TaskCountType newTaskCount);
inline void programStateBaseAddress(const IndirectHeap *dsh,
const IndirectHeap *ioh,
const IndirectHeap *ssh,
DispatchFlags &dispatchFlags,
Device &device, LinearStream &commandStreamCSR,
bool stateBaseAddressDirty);
inline void reprogramStateBaseAddress(const IndirectHeap *dsh,
const IndirectHeap *ioh,
const IndirectHeap *ssh,
DispatchFlags &dispatchFlags,
Device &device, LinearStream &commandStreamCSR,
bool force32BitAllocations, bool sshDirty, bool bindingTablePoolCommandNeeded);
inline void programStateBaseAddressCommon(const IndirectHeap *dsh,
const IndirectHeap *ioh,
const IndirectHeap *ssh,
StateBaseAddressProperties *sbaProperties,
uint64_t generalStateBaseAddress,
uint64_t indirectObjectStateBaseAddress,
PipelineSelectArgs &pipelineSelectArgs,
Device &device,
LinearStream &csrCommandStream,
bool dispatchBindingTableCommand,
bool areMultipleSubDevicesInContext,
bool setGeneralStateBaseAddress);
inline void processBarrierWithPostSync(LinearStream &commandStreamTask,
DispatchFlags &dispatchFlags,
bool &levelClosed,
void *&currentPipeControlForNooping,
void *&epiloguePipeControlLocation,
bool &hasStallingCmdsOnTaskStream,
PipeControlArgs &args);
inline CompletionStamp handleFlushTaskSubmission(BatchBuffer &&batchBuffer,
const DispatchFlags &dispatchFlags,
Device &device,
void *currentPipeControlForNooping,
void *epiloguePipeControlLocation,
PipeControlArgs &args,
bool submitTask,
bool submitCSR,
bool hasStallingCmdsOnTaskStream,
bool levelClosed,
bool implicitFlush);
inline CompletionStamp updateTaskCountAndGetCompletionStamp(bool levelClosed);
inline void programSamplerCacheFlushBetweenRedescribedSurfaceReads(LinearStream &commandStreamCSR);
bool bcsRelaxedOrderingAllowed(const BlitPropertiesContainer &blitPropertiesContainer, bool hasStallingCmds) const;
inline void handleImmediateFlushPipelineSelectState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData);
inline void dispatchImmediateFlushPipelineSelectCommand(ImmediateFlushData &flushData, LinearStream &csrStream);
inline void handleImmediateFlushFrontEndState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData);
inline void dispatchImmediateFlushFrontEndCommand(ImmediateFlushData &flushData, Device &device, LinearStream &csrStream);
inline void handleImmediateFlushStateComputeModeState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData);
inline void dispatchImmediateFlushStateComputeModeCommand(ImmediateFlushData &flushData, LinearStream &csrStream);
inline void handleImmediateFlushStateBaseAddressState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData, Device &device);
inline void dispatchImmediateFlushStateBaseAddressCommand(ImmediateFlushData &flushData, LinearStream &csrStream, Device &device);
inline void handleImmediateFlushOneTimeContextInitState(ImmediateDispatchFlags &dispatchFlags, ImmediateFlushData &flushData, Device &device);
inline void dispatchImmediateFlushOneTimeContextInitCommand(ImmediateFlushData &flushData, LinearStream &csrStream, Device &device);
inline void handleImmediateFlushJumpToImmediate(ImmediateFlushData &flushData);
inline void dispatchImmediateFlushJumpToImmediateCommand(LinearStream &immediateCommandStream,
size_t immediateCommandStreamStart,
ImmediateFlushData &flushData,
LinearStream &csrStream);
inline void dispatchImmediateFlushClientBufferCommands(ImmediateDispatchFlags &dispatchFlags,
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData);
inline void handleImmediateFlushAllocationsResidency(Device &device,
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData,
LinearStream &csrStream);
inline CompletionStamp handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream,
size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags,
ImmediateFlushData &flushData,
LinearStream &csrStream);
inline void handleBatchedDispatchImplicitFlush(uint64_t globalMemorySize, bool implicitFlush);
inline BatchBuffer prepareBatchBufferForSubmission(LinearStream &commandStreamTask,
size_t commandStreamStartTask,
LinearStream &commandStreamCSR,
size_t commandStreamStartCSR,
DispatchFlags &dispatchFlags,
Device &device,
bool submitTask,
bool submitCSR,
bool hasStallingCmdsOnTaskStream);
inline void chainCsrWorkToTask(LinearStream &commandStreamCSR,
LinearStream &commandStreamTask,
size_t commandStreamStartTask,
void *bbEndLocation,
size_t &chainedBatchBufferStartOffset,
GraphicsAllocation *&chainedBatchBuffer);
HeapDirtyState dshState;
HeapDirtyState iohState;
HeapDirtyState sshState;
CsrSizeRequestFlags csrSizeRequestFlags = {};
bool wasSubmittedToSingleSubdevice = false;
std::unique_ptr<DirectSubmissionHw<GfxFamily, RenderDispatcher<GfxFamily>>> directSubmission;
std::unique_ptr<DirectSubmissionHw<GfxFamily, BlitterDispatcher<GfxFamily>>> blitterDirectSubmission;
size_t cmdStreamStart = 0;
uint32_t latestSentBcsWaValue = std::numeric_limits<uint32_t>::max();
bool heaplessPrologueSent = false;
};
} // namespace NEO