Add multi tile support for OCL post sync barrier

Related-To: NEO-6262

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2021-12-09 19:31:27 +00:00
committed by Compute-Runtime-Automation
parent 4b589bffd3
commit 56bef79733
7 changed files with 274 additions and 12 deletions

View File

@@ -1424,3 +1424,47 @@ HWCMDTEST_F(IGFX_GEN8_CORE, UltCommandStreamReceiverTest, WhenProgrammingActiveP
size_t usedAfter = commandStreamReceiver.commandStream.getUsed();
EXPECT_EQ(usedBefore, usedAfter);
}
// Checks the barrier path that carries a timestamp-packet node: the size reported by
// getCmdSizeForStallingCommands() must equal the size of a PIPE_CONTROL with post-sync
// operation, the programmed stream must consume exactly that size, and the post-sync
// PIPE_CONTROL must write immediate data 0 to the node's context-end GPU address with
// command streamer stall enabled.
HWCMDTEST_F(IGFX_GEN8_CORE, UltCommandStreamReceiverTest, givenBarrierNodeSetWhenProgrammingBarrierCommandThenExpectPostSyncPipeControl) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    auto &hwInfo = pDevice->getHardwareInfo();
    auto commandStreamReceiver = &pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &commandStreamCSR = commandStreamReceiver->getCS();

    // Attach a single barrier node to the dispatch flags.
    TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag();
    uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode);

    TimestampPacketDependencies timestampPacketDependencies;
    timestampPacketDependencies.barrierNodes.add(tagNode);

    DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
    dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes;

    size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
    size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
    EXPECT_EQ(expectedCmdSize, estimatedCmdSize);

    commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
    EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed());

    parseCommands<FamilyType>(commandStreamCSR, 0);
    findHardwareCommands<FamilyType>();

    // The size checks above are non-fatal, so guard every dereference/advance of the
    // iterator explicitly instead of walking past the end of a short command list.
    auto cmdItor = cmdList.begin();
    ASSERT_NE(cmdList.end(), cmdItor);

    if (MemorySynchronizationCommands<FamilyType>::isPipeControlWArequired(hwInfo)) {
        // Workaround PIPE_CONTROL precedes the post-sync one.
        PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
        ASSERT_NE(nullptr, pipeControl);
        ++cmdItor;
        if (MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) {
            // Skip the additional synchronization command that follows the workaround.
            ASSERT_NE(cmdList.end(), cmdItor);
            ++cmdItor;
        }
        ASSERT_NE(cmdList.end(), cmdItor);
    }

    PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_EQ(0u, pipeControl->getImmediateData());
    EXPECT_EQ(gpuAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
}

View File

@@ -950,3 +950,171 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh
EXPECT_EQ(estimatedCmdSize, offset);
}
// With static work partitioning enabled but only one active partition, the post-sync
// barrier must be estimated and programmed as a plain PIPE_CONTROL with post-sync
// operation: immediate data 0 written to the barrier node's context-end GPU address.
HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWhenSinglePartitionUsedForPostSyncBarrierThenExpectOnlyPostSyncCommands, IsAtLeastXeHpCore) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    auto &hwInfo = pDevice->getHardwareInfo();

    // The mock CSR is handed to the device via resetCommandStreamReceiver().
    auto commandStreamReceiver = new MockCsrHw<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    pDevice->resetCommandStreamReceiver(commandStreamReceiver);
    auto &commandStreamCSR = commandStreamReceiver->getCS();

    TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag();
    uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode);

    TimestampPacketDependencies timestampPacketDependencies;
    timestampPacketDependencies.barrierNodes.add(tagNode);

    DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
    dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes;

    commandStreamReceiver->staticWorkPartitioningEnabled = true;
    commandStreamReceiver->activePartitions = 1;

    size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
    size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
    EXPECT_EQ(expectedCmdSize, estimatedCmdSize);

    commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
    EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed());

    parseCommands<FamilyType>(commandStreamCSR, 0);
    findHardwareCommands<FamilyType>();

    // The size checks above are non-fatal, so guard every dereference/advance of the
    // iterator explicitly instead of walking past the end of a short command list.
    auto cmdItor = cmdList.begin();
    ASSERT_NE(cmdList.end(), cmdItor);

    if (MemorySynchronizationCommands<FamilyType>::isPipeControlWArequired(hwInfo)) {
        // Workaround PIPE_CONTROL precedes the post-sync one.
        PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
        ASSERT_NE(nullptr, pipeControl);
        ++cmdItor;
        if (MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) {
            // Skip the additional synchronization command that follows the workaround.
            ASSERT_NE(cmdList.end(), cmdItor);
            ++cmdItor;
        }
        ASSERT_NE(cmdList.end(), cmdItor);
    }

    PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_EQ(0u, pipeControl->getImmediateData());
    EXPECT_EQ(gpuAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
}
// With two active partitions but static work partitioning disabled, the post-sync
// barrier must still be estimated and programmed as a plain PIPE_CONTROL with post-sync
// operation: immediate data 0 written to the barrier node's context-end GPU address.
HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionDisabledWhenMultiplePartitionsUsedForPostSyncBarrierThenExpectOnlyPostSyncCommands, IsAtLeastXeHpCore) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    auto &hwInfo = pDevice->getHardwareInfo();

    // The mock CSR is handed to the device via resetCommandStreamReceiver().
    auto commandStreamReceiver = new MockCsrHw<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    pDevice->resetCommandStreamReceiver(commandStreamReceiver);
    auto &commandStreamCSR = commandStreamReceiver->getCS();

    TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag();
    uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode);

    TimestampPacketDependencies timestampPacketDependencies;
    timestampPacketDependencies.barrierNodes.add(tagNode);

    DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
    dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes;

    commandStreamReceiver->staticWorkPartitioningEnabled = false;
    commandStreamReceiver->activePartitions = 2;

    size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
    size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
    EXPECT_EQ(expectedCmdSize, estimatedCmdSize);

    commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
    EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed());

    parseCommands<FamilyType>(commandStreamCSR, 0);
    findHardwareCommands<FamilyType>();

    // The size checks above are non-fatal, so guard every dereference/advance of the
    // iterator explicitly instead of walking past the end of a short command list.
    auto cmdItor = cmdList.begin();
    ASSERT_NE(cmdList.end(), cmdItor);

    if (MemorySynchronizationCommands<FamilyType>::isPipeControlWArequired(hwInfo)) {
        // Workaround PIPE_CONTROL precedes the post-sync one.
        PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
        ASSERT_NE(nullptr, pipeControl);
        ++cmdItor;
        if (MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) {
            // Skip the additional synchronization command that follows the workaround.
            ASSERT_NE(cmdList.end(), cmdItor);
            ++cmdItor;
        }
        ASSERT_NE(cmdList.end(), cmdItor);
    }

    PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_EQ(0u, pipeControl->getImmediateData());
    EXPECT_EQ(gpuAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
}
// With static work partitioning enabled and two active partitions, the post-sync barrier
// is expected to be the multi-partition sequence: a post-sync PIPE_CONTROL with workload
// partition id offset enabled, followed by MI_ATOMIC, MI_SEMAPHORE_WAIT and
// MI_BATCH_BUFFER_START. The barrier node must report two packets used afterwards.
HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWhenMultiplePartitionsUsedThenExpectImplicitScalingPostSyncBarrierWithoutSelfCleanup, IsAtLeastXeHpCore) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    auto &hwInfo = pDevice->getHardwareInfo();

    // The mock CSR is handed to the device via resetCommandStreamReceiver().
    auto commandStreamReceiver = new MockCsrHw<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    pDevice->resetCommandStreamReceiver(commandStreamReceiver);
    auto &commandStreamCSR = commandStreamReceiver->getCS();

    TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag();
    uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode);

    TimestampPacketDependencies timestampPacketDependencies;
    timestampPacketDependencies.barrierNodes.add(tagNode);

    DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
    dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes;

    commandStreamReceiver->staticWorkPartitioningEnabled = true;
    commandStreamReceiver->activePartitions = 2;

    // NOTE(review): the trailing two dwords are presumably the barrier's control-section
    // fields - confirm against ImplicitScalingDispatch::getBarrierSize.
    size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo) +
                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
                          sizeof(MI_BATCH_BUFFER_START) +
                          2 * sizeof(uint32_t);
    size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
    EXPECT_EQ(expectedSize, estimatedCmdSize);

    commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
    EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed());
    EXPECT_EQ(2u, tagNode->getPacketsUsed());

    parseCommands<FamilyType>(commandStreamCSR, 0);
    findHardwareCommands<FamilyType>();

    // The size checks above are non-fatal, so guard every dereference/advance of the
    // iterator explicitly instead of walking past the end of a short command list.
    auto cmdItor = cmdList.begin();
    ASSERT_NE(cmdList.end(), cmdItor);

    if (MemorySynchronizationCommands<FamilyType>::isPipeControlWArequired(hwInfo)) {
        // Workaround PIPE_CONTROL precedes the post-sync one.
        PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
        ASSERT_NE(nullptr, pipeControl);
        ++cmdItor;
        if (MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) {
            // Skip the additional synchronization command that follows the workaround.
            ASSERT_NE(cmdList.end(), cmdItor);
            ++cmdItor;
        }
        ASSERT_NE(cmdList.end(), cmdItor);
    }

    PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(*cmdItor);
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation());
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_EQ(0u, pipeControl->getImmediateData());
    EXPECT_EQ(gpuAddress, UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
    EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
    ++cmdItor;

    if (MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) {
        // Skip the additional synchronization command that follows the post-sync PIPE_CONTROL.
        ASSERT_NE(cmdList.end(), cmdItor);
        ++cmdItor;
    }

    ASSERT_NE(cmdList.end(), cmdItor);
    MI_ATOMIC *miAtomic = genCmdCast<MI_ATOMIC *>(*cmdItor);
    ASSERT_NE(nullptr, miAtomic);
    ++cmdItor;

    ASSERT_NE(cmdList.end(), cmdItor);
    MI_SEMAPHORE_WAIT *miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(*cmdItor);
    ASSERT_NE(nullptr, miSemaphore);
    ++cmdItor;

    ASSERT_NE(cmdList.end(), cmdItor);
    MI_BATCH_BUFFER_START *bbStart = genCmdCast<MI_BATCH_BUFFER_START *>(*cmdItor);
    ASSERT_NE(nullptr, bbStart);
}

View File

@@ -70,6 +70,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t getCmdSizeForActivePartitionConfig() const;
size_t getCmdSizeForStallingCommands(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForStallingNoPostSyncCommands() const;
size_t getCmdSizeForStallingPostSyncCommands() const;
bool isComputeModeNeeded() const;
bool isPipelineSelectAlreadyProgrammed() const;
@@ -149,6 +150,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads);
void programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags);
void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream);
void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode);
void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags);
void programActivePartitionConfigFlushTask(LinearStream &csr);

View File

@@ -660,17 +660,8 @@ inline void CommandStreamReceiverHw<GfxFamily>::programStallingCommandsForBarrie
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) {
auto barrierTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*dispatchFlags.barrierTimestampPacketNodes->peekNodes()[0]);
PipeControlArgs args(true);
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
cmdStream,
PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
barrierTimestampPacketGpuAddress,
0,
peekHwInfo(),
args);
dispatchFlags.barrierTimestampPacketNodes->makeResident(*this);
programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0]);
barrierTimestampPacketNodes->makeResident(*this);
} else {
programStallingNoPostSyncCommandsForBarrier(cmdStream);
}
@@ -1474,7 +1465,7 @@ template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingCommands(const DispatchFlags &dispatchFlags) const {
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() > 0) {
return MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(peekHwInfo());
return getCmdSizeForStallingPostSyncCommands();
} else {
return getCmdSizeForStallingNoPostSyncCommands();
}

View File

@@ -159,12 +159,30 @@ inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingNoPostSyn
return sizeof(typename GfxFamily::PIPE_CONTROL);
}
// Returns the command-stream space needed for a stalling barrier that carries a
// post-sync write: the size of a PIPE_CONTROL with post-sync operation for the
// hardware info of this command stream receiver.
template <typename GfxFamily>
inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingPostSyncCommands() const {
    const auto &hwInfo = peekHwInfo();
    const size_t postSyncPipeControlSize = MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
    return postSyncPipeControlSize;
}
// Programs the barrier variant without a post-sync write: a single pipe control built
// from default-constructed PipeControlArgs is appended to cmdStream.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream) {
    PipeControlArgs defaultPipeControlArgs;
    MemorySynchronizationCommands<GfxFamily>::addPipeControl(cmdStream,
                                                             defaultPipeControlArgs);
}
// Programs the barrier variant with a post-sync write: a pipe control with a
// write-immediate-data post-sync operation is appended to cmdStream, writing 0 to the
// context-end GPU address of tagNode.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode) {
    PipeControlArgs pipeControlArgs(true);
    const auto postSyncGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(tagNode);
    const auto postSyncOperation = PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA;

    MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(cmdStream, postSyncOperation, postSyncGpuAddress, 0, peekHwInfo(), pipeControlArgs);
}
// Post-sync write offset configuration hook. This variant has an empty body and
// therefore leaves the command stream receiver state untouched.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::configurePostSyncWriteOffset() {
}

View File

@@ -206,6 +206,17 @@ inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingNoPostSyn
}
}
// Returns the command-stream space needed for a stalling barrier that carries a
// post-sync write. With a single partition, or with static work partitioning disabled,
// this is the size of a PIPE_CONTROL with post-sync operation; otherwise it is the
// barrier size reported by ImplicitScalingDispatch.
template <typename GfxFamily>
inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingPostSyncCommands() const {
    const bool multiPartitionBarrier = this->activePartitions > 1 && this->staticWorkPartitioningEnabled;
    if (!multiPartitionBarrier) {
        return MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(peekHwInfo());
    }

    return ImplicitScalingDispatch<GfxFamily>::getBarrierSize(peekHwInfo(), false, true);
}
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream) {
PipeControlArgs args;
@@ -223,6 +234,32 @@ inline void CommandStreamReceiverHw<GfxFamily>::programStallingNoPostSyncCommand
}
}
// Programs the barrier variant with a post-sync write of 0 to the context-end GPU
// address of tagNode.
//  - Single partition, or static work partitioning disabled: one pipe control with a
//    write-immediate-data post-sync operation.
//  - Multiple partitions with static work partitioning enabled: the barrier sequence is
//    emitted through ImplicitScalingDispatch with workloadPartitionOffset set on the
//    pipe control args, and tagNode is marked as using one packet per active partition.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode) {
    const auto postSyncGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(tagNode);
    PipeControlArgs pipeControlArgs(true);

    const bool multiPartitionBarrier = this->activePartitions > 1 && this->staticWorkPartitioningEnabled;
    if (!multiPartitionBarrier) {
        MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, postSyncGpuAddress, 0, peekHwInfo(), pipeControlArgs);
        return;
    }

    pipeControlArgs.workloadPartitionOffset = true;
    ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(cmdStream, this->deviceBitfield, pipeControlArgs, peekHwInfo(), postSyncGpuAddress, 0, false, false);

    // Record on the node that one packet per active partition is in use.
    tagNode.setPacketsUsed(this->activePartitions);
}
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::configurePostSyncWriteOffset() {
this->postSyncWriteOffset = ImplicitScalingDispatch<GfxFamily>::getPostSyncOffset();

View File

@@ -48,7 +48,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::programEnginePrologue;
using BaseClass::programPerDssBackedBuffer;
using BaseClass::programPreamble;
using BaseClass::programStallingCommandsForBarrier;
using BaseClass::programStallingNoPostSyncCommandsForBarrier;
using BaseClass::programStallingPostSyncCommandsForBarrier;
using BaseClass::programStateSip;
using BaseClass::programVFEState;
using BaseClass::requiresInstructionCacheFlush;