performance(ocl): program barrier pc in taskStream

Program the barrier into the task stream immediately.
This reduces the number of batch buffer starts.

Related-To: NEO-8147

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2023-08-30 15:06:48 +00:00
committed by Compute-Runtime-Automation
parent a38ac3557b
commit 839c2d6737
16 changed files with 163 additions and 40 deletions

View File

@@ -359,6 +359,8 @@ class CommandStreamReceiver {
virtual void programComputeBarrierCommand(LinearStream &cmdStream) = 0;
virtual size_t getCmdsSizeForComputeBarrierCommand() const = 0;
virtual void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) = 0;
const HardwareInfo &peekHwInfo() const;
const RootDeviceEnvironment &peekRootDeviceEnvironment() const;

View File

@@ -162,6 +162,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t getCmdsSizeForComputeBarrierCommand() const override {
return getCmdSizeForStallingNoPostSyncCommands();
}
void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override;
SubmissionStatus initializeDeviceWithFirstSubmission() override;
HeapDirtyState &getDshState() {
@@ -187,7 +188,6 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void programPerDssBackedBuffer(LinearStream &scr, Device &device, DispatchFlags &dispatchFlags);
void programStateSip(LinearStream &cmdStream, Device &device);
void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads);
void programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags);
void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream);
void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired);
void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);

View File

@@ -508,9 +508,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
if (dispatchFlags.isStallingCommandsOnNextFlushRequired) {
if (DebugManager.flags.ProgramBarrierInCommandStreamTask.get() == 1) {
programStallingCommandsForBarrier(commandStreamTask, dispatchFlags);
programStallingCommandsForBarrier(commandStreamTask, dispatchFlags.barrierTimestampPacketNodes, dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush);
} else {
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush);
}
}
@@ -744,12 +744,9 @@ void CommandStreamReceiverHw<GfxFamily>::programComputeMode(LinearStream &stream
}
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags) {
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
inline void CommandStreamReceiverHw<GfxFamily>::programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) {
if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) {
programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush);
programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], isDcFlushRequired);
barrierTimestampPacketNodes->makeResident(*this);
} else {
programStallingNoPostSyncCommandsForBarrier(cmdStream);