performance(ocl): program barrier pc in taskStream

Program the barrier directly into the task stream.
This reduces the number of batch buffer starts.

Related-To: NEO-8147

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2023-08-30 15:06:48 +00:00
committed by Compute-Runtime-Automation
parent a38ac3557b
commit 839c2d6737
16 changed files with 163 additions and 40 deletions

View File

@@ -359,6 +359,8 @@ class CommandStreamReceiver {
virtual void programComputeBarrierCommand(LinearStream &cmdStream) = 0;
virtual size_t getCmdsSizeForComputeBarrierCommand() const = 0;
// Programs the stalling commands for a barrier into cmdStream.
// barrierTimestampPacketNodes is passed by pointer; the callers visible in this change also pass nullptr
// or an empty container. isDcFlushRequired is forwarded to the implementation.
// NOTE(review): the exact commands emitted are defined by each implementation - confirm against the overrides.
virtual void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) = 0;
const HardwareInfo &peekHwInfo() const;
const RootDeviceEnvironment &peekRootDeviceEnvironment() const;

View File

@@ -162,6 +162,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
// Returns the size needed for the compute barrier command, which is the same value
// reported by getCmdSizeForStallingNoPostSyncCommands().
size_t getCmdsSizeForComputeBarrierCommand() const override {
return getCmdSizeForStallingNoPostSyncCommands();
}
// Override of the CommandStreamReceiver barrier-programming entry point; takes the barrier
// timestamp packet nodes and the DC-flush request as separate arguments.
void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override;
SubmissionStatus initializeDeviceWithFirstSubmission() override;
HeapDirtyState &getDshState() {
@@ -187,7 +188,6 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void programPerDssBackedBuffer(LinearStream &scr, Device &device, DispatchFlags &dispatchFlags);
void programStateSip(LinearStream &cmdStream, Device &device);
void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads);
void programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags);
void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream);
void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired);
void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);

View File

@@ -508,9 +508,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
if (dispatchFlags.isStallingCommandsOnNextFlushRequired) {
if (DebugManager.flags.ProgramBarrierInCommandStreamTask.get() == 1) {
programStallingCommandsForBarrier(commandStreamTask, dispatchFlags);
programStallingCommandsForBarrier(commandStreamTask, dispatchFlags.barrierTimestampPacketNodes, dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush);
} else {
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush);
}
}
@@ -744,12 +744,9 @@ void CommandStreamReceiverHw<GfxFamily>::programComputeMode(LinearStream &stream
}
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags) {
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
inline void CommandStreamReceiverHw<GfxFamily>::programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) {
if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) {
programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush);
programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], isDcFlushRequired);
barrierTimestampPacketNodes->makeResident(*this);
} else {
programStallingNoPostSyncCommandsForBarrier(cmdStream);

View File

@@ -166,6 +166,9 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
// Mock implementation: always reports a size of zero for the compute barrier command.
size_t getCmdsSizeForComputeBarrierCommand() const override {
return 0;
}
// Mock implementation: writes nothing to cmdStream and ignores its arguments; it only sets
// programStallingCommandsForBarrierCalled so tests can check that the call happened.
void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override {
programStallingCommandsForBarrierCalled = true;
}
bool createPreemptionAllocation() override {
if (createPreemptionAllocationParentCall) {
@@ -214,6 +217,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
bool createPreemptionAllocationReturn = true;
bool createPreemptionAllocationParentCall = false;
bool programComputeBarrierCommandCalled = false;
bool programStallingCommandsForBarrierCalled = false;
std::optional<bool> isGpuHangDetectedReturnValue{};
std::optional<bool> testTaskCountReadyReturnValue{};
WaitStatus waitForCompletionWithTimeoutReturnValue{WaitStatus::Ready};

View File

@@ -9,11 +9,14 @@
#include "shared/source/command_stream/command_stream_receiver_simulated_hw.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/hardware_context_controller.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/memory_pool.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/helpers/gfx_core_helper_tests.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_aub_manager.h"
#include "shared/test/common/mocks/mock_gmm.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
@@ -595,3 +598,98 @@ HWTEST_F(CommandStreamSimulatedTests, givenSpecificMemoryPoolAllocationWhenWrite
}
}
}
// Checks that programStallingCommandsForBarrier(), when given a container holding one barrier node,
// emits a PIPE_CONTROL with a write-immediate-data post-sync operation targeting the node's
// context-end address. The call is made twice: once without and once with a DC flush request.
HWTEST_F(CommandStreamSimulatedTests, givenBarrierNodesWhenProgramStallingCommandsForBarrierCalledThenPostSyncWritePipeControlIsProgrammed) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    auto csr = std::make_unique<MockSimulatedCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor());
    csr->setupContext(osContext);

    TagAllocatorBase *allocator = pDevice->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
    auto barrierNode = allocator->getTag();
    const auto barrierNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*barrierNode);
    TimestampPacketContainer barrierNodes{};
    barrierNodes.add(barrierNode);

    // Case 1: DC flush not requested.
    {
        MockGraphicsAllocation streamAllocation{};
        uint32_t streamBuffer[100] = {};
        LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer));

        csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, false);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(linearStream);
        auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
        // Fail the test cleanly instead of dereferencing the end iterator when nothing was programmed.
        ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
        auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
        if (UnitTestHelper<FamilyType>::isPipeControlWArequired(hardwareInfo)) {
            // With the workaround the command under test is the second PIPE_CONTROL in the stream
            // (presumably the first one is the workaround command - confirm against the encoder).
            auto nextPipeControlItor = find<PIPE_CONTROL *>(++pipeControlItor, hwParser.cmdList.end());
            ASSERT_NE(hwParser.cmdList.end(), nextPipeControlItor);
            pipeControl = genCmdCast<PIPE_CONTROL *>(*nextPipeControlItor);
        }
        ASSERT_NE(nullptr, pipeControl);
        EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
        EXPECT_FALSE(pipeControl->getDcFlushEnable());
        EXPECT_EQ(barrierNodeAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    }

    // Case 2: DC flush requested; the command is expected to enable it only when the CSR reports DC flush support.
    {
        MockGraphicsAllocation streamAllocation{};
        uint32_t streamBuffer[100] = {};
        LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer));

        csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, true);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(linearStream);
        auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
        ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
        auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
        if (UnitTestHelper<FamilyType>::isPipeControlWArequired(hardwareInfo)) {
            auto nextPipeControlItor = find<PIPE_CONTROL *>(++pipeControlItor, hwParser.cmdList.end());
            ASSERT_NE(hwParser.cmdList.end(), nextPipeControlItor);
            pipeControl = genCmdCast<PIPE_CONTROL *>(*nextPipeControlItor);
        }
        ASSERT_NE(nullptr, pipeControl);
        EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
        EXPECT_EQ(csr->getDcFlushSupport(), pipeControl->getDcFlushEnable());
        EXPECT_EQ(barrierNodeAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    }
}
// Checks that programStallingCommandsForBarrier() emits a PIPE_CONTROL without a post-sync write
// (no-write operation, zero post-sync address) when it receives either an empty barrier node
// container or a null container pointer.
HWTEST_F(CommandStreamSimulatedTests, givenEmptyBarrierNodesWhenProgramStallingCommandsForBarrierCalledThenNoWritePipeControlIsProgrammed) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    auto csr = std::make_unique<MockSimulatedCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor());
    csr->setupContext(osContext);

    // Case 1: container is present but holds no nodes.
    {
        TimestampPacketContainer barrierNodes{};
        MockGraphicsAllocation streamAllocation{};
        uint32_t streamBuffer[100] = {};
        LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer));

        csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, false);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(linearStream);
        const auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
        // Fail the test cleanly instead of dereferencing the end iterator when nothing was programmed.
        ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
        const auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
        ASSERT_NE(nullptr, pipeControl);
        EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pipeControl->getPostSyncOperation());
        EXPECT_EQ(0u, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    }

    // Case 2: no container at all (nullptr).
    {
        MockGraphicsAllocation streamAllocation{};
        uint32_t streamBuffer[100] = {};
        LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer));

        csr->programStallingCommandsForBarrier(linearStream, nullptr, false);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(linearStream);
        const auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
        ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
        const auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
        ASSERT_NE(nullptr, pipeControl);
        EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pipeControl->getPostSyncOperation());
        EXPECT_EQ(0u, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    }
}