Add implicit scaling barrier implementation

Related-To: NEO-6262

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2021-10-29 10:12:13 +00:00
committed by Compute-Runtime-Automation
parent 95610188af
commit b2124f43b8
6 changed files with 555 additions and 4 deletions

View File

@ -1350,3 +1350,300 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDynamicPartitioningWhenP
parsedOffset += sizeof(BatchBufferControlData);
EXPECT_EQ(parsedOffset, cleanupSectionOffset);
}
// Verifies the minimal (no self-cleanup) barrier layout for 4 tiles:
// PIPE_CONTROL -> MI_ATOMIC -> MI_SEMAPHORE_WAIT -> BATCH_BUFFER_START -> control section.
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenBarrierProgrammingWhenDoNotEmitSelfCleanupThenExpectNoCleanupSection) {
testArgs.tileCount = 4u;
testArgs.emitSelfCleanup = false;
uint32_t totalBytesProgrammed = 0u;
uint64_t gpuVirtualAddress = 0xFF0000;
// Size of all commands emitted before the control section.
auto expectedOffsetSectionSize = sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
auto expectedCommandUsedSize = expectedOffsetSectionSize +
sizeof(BarrierControlSection);
// Size estimators must match the layout actually constructed below.
EXPECT_EQ(expectedOffsetSectionSize, computeBarrierControlSectionOffset<FamilyType>(testArgs));
EXPECT_EQ(expectedCommandUsedSize, estimateBarrierSpaceRequiredInCommandBuffer<FamilyType>(testArgs));
WalkerPartition::constructBarrierCommandBuffer<FamilyType>(cmdBuffer,
gpuVirtualAddress,
totalBytesProgrammed,
testArgs);
EXPECT_EQ(expectedCommandUsedSize, totalBytesProgrammed);
// Parse the emitted commands in order; parsedOffset tracks the byte position.
// 1. PIPE_CONTROL: CS stall, no DC flush requested by this test.
auto pipeControl = genCmdCast<WalkerPartition::PIPE_CONTROL<FamilyType> *>(cmdBufferAddress);
ASSERT_NE(nullptr, pipeControl);
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
EXPECT_EQ(false, pipeControl->getDcFlushEnable());
auto parsedOffset = sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>);
// 2. MI_ATOMIC: each tile increments the cross-tile sync counter in the control section.
auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
auto crossTileSyncAddress = gpuVirtualAddress + expectedOffsetSectionSize + offsetof(BarrierControlSection, crossTileSyncCount);
auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(crossTileSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
// 3. MI_SEMAPHORE_WAIT: wait until all tiles (tileCount) have incremented the counter.
auto miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(crossTileSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
// 4. BATCH_BUFFER_START: jump over the data-only control section; first-level
// batch since secondaryBatchBuffer was not requested.
auto batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, batchBufferStart);
EXPECT_EQ(gpuVirtualAddress + expectedOffsetSectionSize + sizeof(BarrierControlSection), batchBufferStart->getBatchBufferStartAddress());
EXPECT_EQ(BATCH_BUFFER_START<FamilyType>::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, batchBufferStart->getSecondLevelBatchBuffer());
parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
// 5. Control section data: both sync counters initialized to zero.
auto controlSection = reinterpret_cast<BarrierControlSection *>(ptrOffset(cmdBuffer, parsedOffset));
EXPECT_EQ(0u, controlSection->crossTileSyncCount);
EXPECT_EQ(0u, controlSection->finalSyncTileCount);
parsedOffset += sizeof(BarrierControlSection);
// No cleanup section follows: the buffer ends right after the control section.
EXPECT_EQ(parsedOffset, expectedCommandUsedSize);
}
// Verifies the barrier layout with self-cleanup using MI_STORE_DATA_IMM:
// SDI(prologue) -> PIPE_CONTROL -> MI_ATOMIC/MI_SEMAPHORE_WAIT -> BB_START ->
// control section -> atomic/semaphore final sync -> SDI cleanup -> atomic/semaphore final sync.
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenBarrierProgrammingWhenEmitsSelfCleanupThenExpectStoreDataImmCommandCleanupSection) {
testArgs.tileCount = 4u;
testArgs.emitSelfCleanup = true;
testArgs.secondaryBatchBuffer = true;
testArgs.dcFlush = true;
uint32_t totalBytesProgrammed = 0u;
uint64_t gpuVirtualAddress = 0xFF0000;
// Commands before the control section, including the SDI self-cleanup prologue.
auto expectedOffsetSectionSize = sizeof(WalkerPartition::MI_STORE_DATA_IMM<FamilyType>) +
sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
// Total adds the control section plus the cleanup end section
// (sync, SDI store, sync again).
auto expectedCommandUsedSize = expectedOffsetSectionSize +
sizeof(BarrierControlSection) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::MI_STORE_DATA_IMM<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
EXPECT_EQ(expectedOffsetSectionSize, computeBarrierControlSectionOffset<FamilyType>(testArgs));
EXPECT_EQ(expectedCommandUsedSize, estimateBarrierSpaceRequiredInCommandBuffer<FamilyType>(testArgs));
WalkerPartition::constructBarrierCommandBuffer<FamilyType>(cmdBuffer,
gpuVirtualAddress,
totalBytesProgrammed,
testArgs);
EXPECT_EQ(expectedCommandUsedSize, totalBytesProgrammed);
// Parse emitted commands in order; parsedOffset tracks the byte position.
size_t parsedOffset = 0;
uint64_t finalSyncTileCountAddress = gpuVirtualAddress + expectedOffsetSectionSize + offsetof(BarrierControlSection, finalSyncTileCount);
constexpr uint32_t expectedData = 0u;
// 1. Cleanup prologue: SDI zeroes the final sync counter.
auto finalSyncTileCountFieldStore = genCmdCast<WalkerPartition::MI_STORE_DATA_IMM<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, finalSyncTileCountFieldStore);
EXPECT_EQ(finalSyncTileCountAddress, finalSyncTileCountFieldStore->getAddress());
EXPECT_EQ(expectedData, finalSyncTileCountFieldStore->getDataDword0());
parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM<FamilyType>);
// 2. PIPE_CONTROL: DC flush follows the platform policy since dcFlush was requested.
auto pipeControl = genCmdCast<WalkerPartition::PIPE_CONTROL<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, pipeControl);
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>);
// 3. Cross-tile barrier: atomic increment + wait for all tiles.
auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
auto crossTileSyncAddress = gpuVirtualAddress + expectedOffsetSectionSize + offsetof(BarrierControlSection, crossTileSyncCount);
auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(crossTileSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
auto miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(crossTileSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
// 4. BB_START jumps over the control section; second-level batch was requested.
auto batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, batchBufferStart);
EXPECT_EQ(gpuVirtualAddress + expectedOffsetSectionSize + sizeof(BarrierControlSection), batchBufferStart->getBatchBufferStartAddress());
EXPECT_EQ(BATCH_BUFFER_START<FamilyType>::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, batchBufferStart->getSecondLevelBatchBuffer());
parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
// 5. Control section data: counters initialized to zero.
auto controlSection = reinterpret_cast<BarrierControlSection *>(ptrOffset(cmdBuffer, parsedOffset));
EXPECT_EQ(0u, controlSection->crossTileSyncCount);
EXPECT_EQ(0u, controlSection->finalSyncTileCount);
parsedOffset += sizeof(BarrierControlSection);
// 6. Cleanup end section, first sync point: all tiles increment and wait on
// the final sync counter (compare value tileCount).
miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(finalSyncTileCountAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(finalSyncTileCountAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
// 7. SDI zeroes the cross-tile counter so the barrier buffer can be reused.
auto crossTileFieldStore = genCmdCast<WalkerPartition::MI_STORE_DATA_IMM<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, crossTileFieldStore);
EXPECT_EQ(crossTileSyncAddress, crossTileFieldStore->getAddress());
EXPECT_EQ(expectedData, crossTileFieldStore->getDataDword0());
parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM<FamilyType>);
// 8. Second final sync point: compare value doubles to 2 * tileCount.
miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(finalSyncTileCountAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(finalSyncTileCountAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(2u * testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
EXPECT_EQ(parsedOffset, expectedCommandUsedSize);
}
// Same as the SDI variant above, but with useAtomicsForSelfCleanup = true the
// cleanup stores are emitted as MI_ATOMIC (4B MOVE with inline data) instead of
// MI_STORE_DATA_IMM.
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenBarrierProgrammingWhenEmitsSelfCleanupUsingAtomicThenExpectMiAtomicCommandCleanupSection) {
testArgs.tileCount = 4u;
testArgs.emitSelfCleanup = true;
testArgs.secondaryBatchBuffer = true;
testArgs.dcFlush = true;
testArgs.useAtomicsForSelfCleanup = true;
uint32_t totalBytesProgrammed = 0u;
uint64_t gpuVirtualAddress = 0xFF0000;
// Cleanup prologue is now an MI_ATOMIC rather than an SDI.
auto expectedOffsetSectionSize = sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) +
sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
auto expectedCommandUsedSize = expectedOffsetSectionSize +
sizeof(BarrierControlSection) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
EXPECT_EQ(expectedOffsetSectionSize, computeBarrierControlSectionOffset<FamilyType>(testArgs));
EXPECT_EQ(expectedCommandUsedSize, estimateBarrierSpaceRequiredInCommandBuffer<FamilyType>(testArgs));
WalkerPartition::constructBarrierCommandBuffer<FamilyType>(cmdBuffer,
gpuVirtualAddress,
totalBytesProgrammed,
testArgs);
EXPECT_EQ(expectedCommandUsedSize, totalBytesProgrammed);
// Parse emitted commands in order; parsedOffset tracks the byte position.
size_t parsedOffset = 0;
uint64_t finalSyncTileCountAddress = gpuVirtualAddress + expectedOffsetSectionSize + offsetof(BarrierControlSection, finalSyncTileCount);
constexpr uint32_t expectedData = 0u;
// 1. Cleanup prologue: atomic 4B MOVE with inline data zeroes the final sync counter.
auto finalSyncTileCountFieldAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, finalSyncTileCountFieldAtomic);
auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*finalSyncTileCountFieldAtomic);
EXPECT_EQ(finalSyncTileCountAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(finalSyncTileCountFieldAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_MOVE, finalSyncTileCountFieldAtomic->getAtomicOpcode());
EXPECT_EQ(MI_ATOMIC<FamilyType>::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1, finalSyncTileCountFieldAtomic->getDwordLength());
EXPECT_TRUE(finalSyncTileCountFieldAtomic->getInlineData());
EXPECT_EQ(expectedData, finalSyncTileCountFieldAtomic->getOperand1DataDword0());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
// 2. PIPE_CONTROL with platform-policy DC flush.
auto pipeControl = genCmdCast<WalkerPartition::PIPE_CONTROL<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, pipeControl);
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>);
// 3. Cross-tile barrier: atomic increment + wait for all tiles.
auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
auto crossTileSyncAddress = gpuVirtualAddress + expectedOffsetSectionSize + offsetof(BarrierControlSection, crossTileSyncCount);
miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(crossTileSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
auto miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(crossTileSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
// 4. BB_START jumps over the control section; second-level batch was requested.
auto batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, batchBufferStart);
EXPECT_EQ(gpuVirtualAddress + expectedOffsetSectionSize + sizeof(BarrierControlSection), batchBufferStart->getBatchBufferStartAddress());
EXPECT_EQ(BATCH_BUFFER_START<FamilyType>::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, batchBufferStart->getSecondLevelBatchBuffer());
parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
// 5. Control section data: counters initialized to zero.
auto controlSection = reinterpret_cast<BarrierControlSection *>(ptrOffset(cmdBuffer, parsedOffset));
EXPECT_EQ(0u, controlSection->crossTileSyncCount);
EXPECT_EQ(0u, controlSection->finalSyncTileCount);
parsedOffset += sizeof(BarrierControlSection);
// 6. Cleanup end section, first sync point on the final sync counter.
miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(finalSyncTileCountAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(finalSyncTileCountAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
// 7. Atomic 4B MOVE (instead of SDI) zeroes the cross-tile counter for reuse.
auto crossTileFieldAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, crossTileFieldAtomic);
miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*crossTileFieldAtomic);
EXPECT_EQ(crossTileSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(crossTileFieldAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_MOVE, crossTileFieldAtomic->getAtomicOpcode());
EXPECT_EQ(MI_ATOMIC<FamilyType>::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1, crossTileFieldAtomic->getDwordLength());
EXPECT_TRUE(crossTileFieldAtomic->getInlineData());
EXPECT_EQ(expectedData, crossTileFieldAtomic->getOperand1DataDword0());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
// 8. Second final sync point: compare value doubles to 2 * tileCount.
miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(finalSyncTileCountAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphoreWait);
EXPECT_EQ(finalSyncTileCountAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
EXPECT_EQ(2u * testArgs.tileCount, miSemaphoreWait->getSemaphoreDataDword());
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
EXPECT_EQ(parsedOffset, expectedCommandUsedSize);
}

View File

@ -53,6 +53,13 @@ struct ImplicitScalingDispatch {
static bool &getPipeControlStallRequired();
static size_t getBarrierSize(bool apiSelfCleanup);
static void dispatchBarrierCommands(LinearStream &commandStream,
const DeviceBitfield &devices,
bool apiSelfCleanup,
bool dcFlush,
bool useSecondaryBatchBuffer);
private:
static bool pipeControlStallRequired;
};

View File

@ -96,10 +96,12 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
staticPartitioning,
useSecondaryBatchBuffer);
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
void *commandBuffer = commandStream.getSpace(0u);
if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args);
@ -112,8 +114,8 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
args.partitionCount = partitionCount;
}
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args);
@ -126,4 +128,38 @@ bool &ImplicitScalingDispatch<GfxFamily>::getPipeControlStallRequired() {
return ImplicitScalingDispatch<GfxFamily>::pipeControlStallRequired;
}
template <typename GfxFamily>
size_t ImplicitScalingDispatch<GfxFamily>::getBarrierSize(bool apiSelfCleanup) {
    // Build the minimal argument set that affects barrier sizing: whether the
    // self-cleanup section is emitted and which command type performs it.
    WalkerPartition::WalkerPartitionArgs barrierArgs = {};
    barrierArgs.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
    barrierArgs.emitSelfCleanup = apiSelfCleanup;

    const auto requiredSize = WalkerPartition::estimateBarrierSpaceRequiredInCommandBuffer<GfxFamily>(barrierArgs);
    return static_cast<size_t>(requiredSize);
}
template <typename GfxFamily>
void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &commandStream,
                                                                 const DeviceBitfield &devices,
                                                                 bool apiSelfCleanup,
                                                                 bool dcFlush,
                                                                 bool useSecondaryBatchBuffer) {
    // Translate the API-level request into walker-partition barrier arguments.
    WalkerPartition::WalkerPartitionArgs barrierArgs = {};
    barrierArgs.tileCount = static_cast<uint32_t>(devices.count());
    barrierArgs.emitSelfCleanup = apiSelfCleanup;
    barrierArgs.dcFlush = dcFlush;
    barrierArgs.secondaryBatchBuffer = useSecondaryBatchBuffer;
    barrierArgs.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();

    // Capture the GPU VA and CPU pointer of the current stream tail, emit the
    // barrier there, then advance the stream by the bytes actually written.
    const uint64_t barrierGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
    void *barrierCpuPointer = commandStream.getSpace(0u);
    uint32_t bytesProgrammed = 0u;
    WalkerPartition::constructBarrierCommandBuffer<GfxFamily>(barrierCpuPointer,
                                                              barrierGpuAddress,
                                                              bytesProgrammed,
                                                              barrierArgs);
    commandStream.getSpace(bytesProgrammed);
}
} // namespace NEO

View File

@ -26,6 +26,7 @@ struct WalkerPartitionArgs {
bool initializeWparidRegister = false;
bool emitPipeControlStall = false;
bool preferredStaticPartitioning = false;
bool dcFlush = false;
};
constexpr uint32_t wparidCCSOffset = 0x221C;
@ -54,4 +55,10 @@ struct StaticPartitioningControlSection {
uint32_t finalSyncTileCounter = 0;
};
constexpr size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;
// Data section embedded in the barrier command buffer; tiles synchronize by
// atomically incrementing these counters and waiting on them.
struct BarrierControlSection {
// Incremented once per tile at the barrier point; waited on until it reaches tileCount.
uint32_t crossTileSyncCount = 0u;
// Incremented per tile during the self-cleanup end section.
uint32_t finalSyncTileCount = 0;
};
// Fields zeroed by the self-cleanup end section; one field (the -1) is excluded —
// presumably finalSyncTileCount, which is still in use while cleanup runs (mirrors
// staticPartitioningFieldsForCleanupCount above).
constexpr size_t barrierControlSectionFieldsForCleanupCount = sizeof(BarrierControlSection) / sizeof(uint32_t) - 1;
} // namespace WalkerPartition

View File

@ -720,4 +720,64 @@ uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
return size;
}
template <typename GfxFamily>
uint64_t computeBarrierControlSectionOffset(WalkerPartitionArgs &args) {
    // Fixed commands preceding the control section: pipe control, the cross-tile
    // atomic synchronization block, and the batch-buffer-start jump over the data.
    uint64_t sizeBeforeControlSection = sizeof(PIPE_CONTROL<GfxFamily>) +
                                        computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() +
                                        sizeof(BATCH_BUFFER_START<GfxFamily>);
    // The optional self-cleanup prologue is emitted first and shifts the section further.
    if (args.emitSelfCleanup) {
        sizeBeforeControlSection += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
    }
    return sizeBeforeControlSection;
}
template <typename GfxFamily>
uint64_t estimateBarrierSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
    // Total footprint = commands ahead of the control section + the control
    // section payload + (optionally) the trailing self-cleanup end section.
    const uint64_t cleanupSize = args.emitSelfCleanup
                                     ? computeSelfCleanupEndSectionSize<GfxFamily>(barrierControlSectionFieldsForCleanupCount, args.useAtomicsForSelfCleanup)
                                     : 0u;
    return computeBarrierControlSectionOffset<GfxFamily>(args) +
           sizeof(BarrierControlSection) +
           cleanupSize;
}
// Emits a multi-tile barrier into cpuPointer. Each program* helper advances both
// currentBatchBufferPointer and totalBytesProgrammed, so emission order must match
// the offsets computed by computeBarrierControlSectionOffset.
// cpuPointer            - CPU-visible destination for the commands
// gpuAddressOfAllocation - GPU VA of the same buffer (used for in-buffer addresses)
// totalBytesProgrammed  - out: bytes written
template <typename GfxFamily>
void constructBarrierCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
uint32_t &totalBytesProgrammed,
WalkerPartitionArgs &args) {
void *currentBatchBufferPointer = cpuPointer;
const auto controlSectionOffset = computeBarrierControlSectionOffset<GfxFamily>(args);
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, finalSyncTileCount);
// Optional prologue: reset the final sync counter (SDI or MI_ATOMIC per args).
if (args.emitSelfCleanup) {
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
}
// Flush/stall before tiles synchronize.
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.dcFlush);
// Cross-tile rendezvous: each tile increments the counter, then waits until all tileCount tiles arrived.
const auto crossTileSyncCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, crossTileSyncCount);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, crossTileSyncCountField, args.tileCount);
// Jump over the data-only control section so it is never executed as commands.
const auto afterControlSectionOffset = controlSectionOffset + sizeof(BarrierControlSection);
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
// Place the zero-initialized control section data inline in the buffer.
BarrierControlSection *controlSection = putCommand<BarrierControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
controlSection->crossTileSyncCount = 0u;
controlSection->finalSyncTileCount = 0u;
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
// Optional epilogue: synchronize on the final counter, zero the section fields
// for buffer reuse, then synchronize again.
if (args.emitSelfCleanup) {
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountField,
gpuAddressOfAllocation + controlSectionOffset,
barrierControlSectionFieldsForCleanupCount,
args.tileCount,
args.useAtomicsForSelfCleanup);
}
}
} // namespace WalkerPartition

View File

@ -736,3 +736,147 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
EXPECT_EQ(3u, miSemaphoreList.size());
}
// Dispatch-level test: without API self-cleanup the barrier is the minimal
// PIPE_CONTROL + MI_ATOMIC + MI_SEMAPHORE_WAIT + BB_START + control section.
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
givenBarrierDispatchWhenApiNotRequiresSelfCleanupThenExpectMinimalCommandBuffer) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
size_t expectedSize = sizeof(PIPE_CONTROL) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_BATCH_BUFFER_START) +
sizeof(WalkerPartition::BarrierControlSection);
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
// Size estimate and actual stream consumption must agree with the layout.
estimatedSize = ImplicitScalingDispatch<FamilyType>::getBarrierSize(false);
EXPECT_EQ(expectedSize, estimatedSize);
// apiSelfCleanup=false, dcFlush=false, secondary batch buffer=false.
ImplicitScalingDispatch<FamilyType>::dispatchBarrierCommands(commandStream, twoTile, false, false, false);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
// Count the emitted commands: exactly one of each barrier command type.
HardwareParse hwParser;
hwParser.parsePipeControl = true;
hwParser.parseCommands<FamilyType>(commandStream, 0);
hwParser.findHardwareCommands<FamilyType>();
EXPECT_EQ(1u, hwParser.pipeControlList.size());
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*hwParser.pipeControlList.begin());
EXPECT_EQ(false, pipeControl->getDcFlushEnable());
auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
EXPECT_EQ(1u, miAtomicList.size());
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
EXPECT_EQ(1u, miSemaphoreList.size());
auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
EXPECT_EQ(1u, bbStartList.size());
// First-level batch because useSecondaryBatchBuffer was false.
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
}
// Dispatch-level test: with API self-cleanup (default SDI flavor) the barrier adds
// the SDI prologue plus the cleanup end section (sync, SDI store, sync).
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
givenBarrierDispatchWhenApiRequiresSelfCleanupThenExpectDefaultSelfCleanupSection) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
size_t expectedSize = sizeof(MI_STORE_DATA_IMM) +
sizeof(PIPE_CONTROL) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_BATCH_BUFFER_START) +
sizeof(WalkerPartition::BarrierControlSection) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_STORE_DATA_IMM) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
// Size estimate and actual stream consumption must agree with the layout.
estimatedSize = ImplicitScalingDispatch<FamilyType>::getBarrierSize(true);
EXPECT_EQ(expectedSize, estimatedSize);
// apiSelfCleanup=true, dcFlush=true, secondary batch buffer=true.
ImplicitScalingDispatch<FamilyType>::dispatchBarrierCommands(commandStream, twoTile, true, true, true);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
// Count commands: 2 SDIs (prologue + cleanup), 1 pipe control, 3 atomic/semaphore pairs.
HardwareParse hwParser;
hwParser.parsePipeControl = true;
hwParser.parseCommands<FamilyType>(commandStream, 0);
hwParser.findHardwareCommands<FamilyType>();
auto storeDataImmList = hwParser.getCommandsList<MI_STORE_DATA_IMM>();
EXPECT_EQ(2u, storeDataImmList.size());
EXPECT_EQ(1u, hwParser.pipeControlList.size());
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*hwParser.pipeControlList.begin());
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
EXPECT_EQ(3u, miAtomicList.size());
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
EXPECT_EQ(3u, miSemaphoreList.size());
auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
EXPECT_EQ(1u, bbStartList.size());
// Second-level batch because useSecondaryBatchBuffer was true.
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
}
// Dispatch-level test: with UseAtomicsForSelfCleanupSection forced on, the two
// cleanup stores become MI_ATOMICs (5 atomics total) instead of MI_STORE_DATA_IMMs.
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
givenBarrierDispatchWhenApiRequiresSelfCleanupForcedUseAtomicThenExpectUseAtomicForSelfCleanupSection) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
size_t expectedSize = sizeof(MI_ATOMIC) +
sizeof(PIPE_CONTROL) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_BATCH_BUFFER_START) +
sizeof(WalkerPartition::BarrierControlSection) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_ATOMIC) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
// Force atomic-based cleanup via the debug flag.
DebugManager.flags.UseAtomicsForSelfCleanupSection.set(1);
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
// Size estimate and actual stream consumption must agree with the layout.
estimatedSize = ImplicitScalingDispatch<FamilyType>::getBarrierSize(true);
EXPECT_EQ(expectedSize, estimatedSize);
// apiSelfCleanup=true, dcFlush=true, secondary batch buffer=true.
ImplicitScalingDispatch<FamilyType>::dispatchBarrierCommands(commandStream, twoTile, true, true, true);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
// Count commands: 5 atomics (2 cleanup MOVEs + 3 sync INCREMENTs), 3 semaphores, no SDIs.
HardwareParse hwParser;
hwParser.parsePipeControl = true;
hwParser.parseCommands<FamilyType>(commandStream, 0);
hwParser.findHardwareCommands<FamilyType>();
EXPECT_EQ(1u, hwParser.pipeControlList.size());
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*hwParser.pipeControlList.begin());
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
EXPECT_EQ(5u, miAtomicList.size());
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
EXPECT_EQ(3u, miSemaphoreList.size());
auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
EXPECT_EQ(1u, bbStartList.size());
// Second-level batch because useSecondaryBatchBuffer was true.
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
}