performance(ocl): program barrier pc in taskStream

Program the barrier into the task stream, before the next enqueued kernel.
This reduces the number of batch buffer starts for sequences of
enqueue, barrier, enqueue, ...

Related-To: NEO-8147

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2023-09-12 14:17:52 +00:00
committed by Compute-Runtime-Automation
parent e08d46085b
commit 1b7e178b25
23 changed files with 224 additions and 98 deletions

View File

@@ -9,11 +9,14 @@
#include "shared/source/command_stream/command_stream_receiver_simulated_hw.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/hardware_context_controller.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/memory_pool.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/helpers/gfx_core_helper_tests.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_aub_manager.h"
#include "shared/test/common/mocks/mock_gmm.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
@@ -595,3 +598,98 @@ HWTEST_F(CommandStreamSimulatedTests, givenSpecificMemoryPoolAllocationWhenWrite
}
}
}
// Checks that programStallingCommandsForBarrier, when given a container holding one barrier node,
// emits a PIPE_CONTROL whose post-sync operation is an immediate-data write to that node's
// context-end GPU address. Both values of the dcFlush argument are exercised.
HWTEST_F(CommandStreamSimulatedTests, givenBarrierNodesWhenProgramStallingCommandsForBarrierCalledThenPostSyncWritePipeControlIsProgrammed) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    auto csr = std::make_unique<MockSimulatedCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor());
    csr->setupContext(osContext);

    TagAllocatorBase *allocator = pDevice->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
    auto barrierNode = allocator->getTag();
    const auto barrierNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*barrierNode);
    TimestampPacketContainer barrierNodes{};
    barrierNodes.add(barrierNode);

    const bool dcFlushRequiredCases[] = {false, true};
    for (const bool dcFlushRequired : dcFlushRequiredCases) {
        MockGraphicsAllocation streamAllocation{};
        uint32_t streamBuffer[100] = {};
        LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer));

        csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, dcFlushRequired);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(linearStream);

        // Verify the iterator before dereferencing it: a missing PIPE_CONTROL must surface as a
        // test failure, not as a dereference of cmdList.end().
        auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
        ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);

        // When the pipe control workaround is required, the second PIPE_CONTROL in the stream
        // is the one inspected.
        if (UnitTestHelper<FamilyType>::isPipeControlWArequired(hardwareInfo)) {
            pipeControlItor = find<PIPE_CONTROL *>(++pipeControlItor, hwParser.cmdList.end());
            ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
        }

        auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
        ASSERT_NE(nullptr, pipeControl);

        EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
        if (dcFlushRequired) {
            // With dcFlush requested, the flag follows what the CSR reports as supported.
            EXPECT_EQ(csr->getDcFlushSupport(), pipeControl->getDcFlushEnable());
        } else {
            EXPECT_FALSE(pipeControl->getDcFlushEnable());
        }
        EXPECT_EQ(barrierNodeAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    }
}
// Checks that programStallingCommandsForBarrier emits a PIPE_CONTROL without a post-sync write
// (NO_WRITE operation, zero post-sync address) when there is no barrier node to signal: once with
// an empty container and once with a null container pointer.
HWTEST_F(CommandStreamSimulatedTests, givenEmptyBarrierNodesWhenProgramStallingCommandsForBarrierCalledThenNoWritePipeControlIsProgrammed) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    auto csr = std::make_unique<MockSimulatedCsrHw<FamilyType>>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor());
    csr->setupContext(osContext);

    TimestampPacketContainer emptyBarrierNodes{};
    TimestampPacketContainer *const barrierNodesCases[] = {&emptyBarrierNodes, nullptr};

    for (auto *barrierNodes : barrierNodesCases) {
        MockGraphicsAllocation streamAllocation{};
        uint32_t streamBuffer[100] = {};
        LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer));

        csr->programStallingCommandsForBarrier(linearStream, barrierNodes, false);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(linearStream);

        // Verify the iterator before dereferencing it: a missing PIPE_CONTROL must surface as a
        // test failure, not as a dereference of cmdList.end().
        const auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
        ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);

        const auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
        ASSERT_NE(nullptr, pipeControl);

        EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pipeControl->getPostSyncOperation());
        EXPECT_EQ(0u, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    }
}

View File

@@ -2665,6 +2665,33 @@ HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushFlagSetWhenGettingCsrFlagValue
EXPECT_EQ(helperValue, csrValue);
}
// Checks which size getCmdSizeForStallingCommands reports depending on the barrier timestamp
// packet nodes set in the dispatch flags:
//  - null pointer or empty container -> size of the no-post-sync stalling commands,
//  - container with one node         -> size of the post-sync stalling commands.
HWTEST_F(CommandStreamReceiverHwTest, givenBarrierTimestampPacketNodesWhenGetCmdSizeForStallingCommandsCalledThenReturnCorrectSize) {
    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

    const auto sizeWithoutPostSync = csr.getCmdSizeForStallingNoPostSyncCommands();

    // Case 1: no container at all.
    {
        auto flags = DispatchFlagsHelper::createDefaultDispatchFlags();
        flags.barrierTimestampPacketNodes = nullptr;

        EXPECT_EQ(sizeWithoutPostSync, csr.getCmdSizeForStallingCommands(flags));
    }

    // Case 2: a container is present but holds no nodes.
    {
        TimestampPacketContainer noNodes;
        auto flags = DispatchFlagsHelper::createDefaultDispatchFlags();
        flags.barrierTimestampPacketNodes = &noNodes;

        EXPECT_EQ(sizeWithoutPostSync, csr.getCmdSizeForStallingCommands(flags));
    }

    const auto sizeWithPostSync = csr.getCmdSizeForStallingPostSyncCommands();

    // Case 3: the container holds a single tag taken from the CSR's timestamp packet allocator.
    {
        auto flags = DispatchFlagsHelper::createDefaultDispatchFlags();
        TimestampPacketContainer singleNode;
        singleNode.add(csr.getTimestampPacketAllocator()->getTag());
        flags.barrierTimestampPacketNodes = &singleNode;

        EXPECT_EQ(sizeWithPostSync, csr.getCmdSizeForStallingCommands(flags));
    }
}
struct MockRequiredScratchSpaceController : public ScratchSpaceControllerBase {
MockRequiredScratchSpaceController(uint32_t rootDeviceIndex,
ExecutionEnvironment &environment,
@@ -4426,39 +4453,6 @@ HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushRequiredFalseWhenProgramStalli
EXPECT_FALSE(pipeControl->getDcFlushEnable());
}
// With the ProgramBarrierInCommandStreamTask debug flag set to 1 and stalling commands requested
// in the dispatch flags, flushTask is expected to leave exactly one PIPE_CONTROL in the task
// command stream that was passed in.
HWTEST_F(CommandStreamReceiverHwTest, givenFlagProgramBarrierInCommandStreamTaskWhenFlushTaskThenPipeControlProgrammedInTaskCommandStream) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    DebugManagerStateRestore restorer;
    DebugManager.flags.ProgramBarrierInCommandStreamTask.set(1);

    auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto *memoryManager = pDevice->getMemoryManager();

    GraphicsAllocation *allocation = memoryManager->allocateGraphicsMemoryWithProperties({ultCsr.getRootDeviceIndex(), MemoryConstants::pageSize, AllocationType::COMMAND_BUFFER, pDevice->getDeviceBitfield()});
    // Nothing has to be released if the allocation failed, so a fatal assertion is safe here.
    ASSERT_NE(nullptr, allocation);
    LinearStream commandStream{allocation};

    auto dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
    dispatchFlags.isStallingCommandsOnNextFlushRequired = true;

    ultCsr.flushTask(commandStream,
                     MemoryConstants::pageSize,
                     &dsh,
                     &ioh,
                     &ssh,
                     0,
                     dispatchFlags,
                     *pDevice);

    // From here on only non-fatal checks are used: a fatal ASSERT_* would return from the test
    // body before the allocation is released below.
    GenCmdList cmdList;
    const bool parsed = FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        commandStream.getCpuBase(),
        commandStream.getUsed());
    EXPECT_TRUE(parsed);

    if (parsed) {
        auto pipeControlIteratorVector = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
        EXPECT_EQ(pipeControlIteratorVector.size(), 1u);

        if (pipeControlIteratorVector.size() == 1u) {
            auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlIteratorVector[0]);
            EXPECT_NE(nullptr, pipeControl);
        }
    }

    memoryManager->freeGraphicsMemory(allocation);
}
HWTEST2_F(CommandStreamReceiverHwTest,
givenImmediateFlushTaskWhenNextDispatchRequiresScratchSpaceAndSshPointerIsNullThenFrontEndCommandIsNotDispatched,
IsAtLeastXeHpCore) {