Pass active partitions from dispatched kernel to context

Related-To: NEO-6244

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2021-10-04 16:37:12 +00:00 committed by Compute-Runtime-Automation
parent 85a52b7702
commit 0b64ecba3f
6 changed files with 198 additions and 11 deletions

View File

@ -138,6 +138,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
false,
kernel.usesImages(),
workPartitionAllocationGpuVa);
if (queueCsr.isStaticWorkPartitioningEnabled()) {
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
}
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
timestampPacket->setPacketsUsed(partitionCount);
} else {

View File

@ -1459,7 +1459,7 @@ struct XeHPAndLaterDispatchWalkerBasicTestStaticPartition : public XeHPAndLaterD
}
};
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenNoMultipleActivePartitionsAreSetInCsr) {
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenMultipleActivePartitionsAreSetInCsr) {
if (!OSInterface::osEnableLocalMemory) {
GTEST_SKIP();
}
@ -1472,7 +1472,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
}
EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
EXPECT_EQ(2u, commandStreamReceiver.activePartitions);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(*cmdQ);
@ -1482,6 +1482,50 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
EXPECT_EQ(8u, computeWalker->getPartitionSize());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
            givenStaticPartitioningWhenEnqueueingNonUnifromKernelThenMultipleActivePartitionsAreSetInCsrAndWparidRegisterIsReconfiguredToStatic) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
    if (!OSInterface::osEnableLocalMemory) {
        GTEST_SKIP();
    }
    // GWS 129 with LWS 8 is non-uniform, so the enqueue emits a main walker
    // plus a remainder walker.
    auto commandQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
    size_t globalWorkSize[] = {129, 1, 1};
    size_t localWorkSize[] = {8, 1, 1};
    auto &csr = commandQueue->getUltCommandStreamReceiver();
    if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
        csr.createPreemptionAllocation();
    }
    EXPECT_EQ(1u, csr.activePartitions);

    kernel->mockProgram->allowNonUniform = true;
    commandQueue->enqueueKernel(kernel->mockKernel, 1, nullptr, globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    // The partitioned dispatch must propagate its partition count to the CSR.
    EXPECT_EQ(2u, csr.activePartitions);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(*commandQueue->commandStream);

    // Main walker: statically partitioned along X with partition size 8.
    auto mainWalkerItor = find<COMPUTE_WALKER *>(parser.cmdList.begin(), parser.cmdList.end());
    ASSERT_NE(parser.cmdList.end(), mainWalkerItor);
    auto parsedWalker = reinterpret_cast<COMPUTE_WALKER *>(*mainWalkerItor);
    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, parsedWalker->getPartitionType());
    EXPECT_EQ(8u, parsedWalker->getPartitionSize());

    // Remainder walker: dispatched without partitioning.
    auto searchStart = mainWalkerItor;
    ++searchStart;
    auto remainderWalkerItor = find<COMPUTE_WALKER *>(searchStart, parser.cmdList.end());
    ASSERT_NE(parser.cmdList.end(), remainderWalkerItor);
    parsedWalker = reinterpret_cast<COMPUTE_WALKER *>(*remainderWalkerItor);
    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, parsedWalker->getPartitionType());

    // Between the two walkers the WPARID register (0x221C) must be reloaded
    // from the work partition allocation to restore static partitioning.
    auto expectedWparidSourceGpuVa = csr.getWorkPartitionAllocationGpuAddress();
    auto expectedWparidRegister = 0x221Cu;
    auto wparidReload = parser.getCommand<MI_LOAD_REGISTER_MEM>(mainWalkerItor, remainderWalkerItor);
    ASSERT_NE(nullptr, wparidReload);
    EXPECT_EQ(expectedWparidSourceGpuVa, wparidReload->getMemoryAddress());
    EXPECT_EQ(expectedWparidRegister, wparidReload->getRegisterAddress());
}
using NonDefaultPlatformGpuWalkerTest = XeHPAndLaterDispatchWalkerBasicTest;
HWCMDTEST_F(IGFX_XE_HP_CORE, NonDefaultPlatformGpuWalkerTest, givenNonDefaultPlatformWhenSetupTimestampPacketThenGmmHelperIsTakenFromNonDefaultPlatform) {

View File

@ -1276,3 +1276,135 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe
}
EXPECT_EQ(parsedOffset, totalBytesProgrammed);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitionIsPreferredAndWalkerWithNonUniformStartWhenDynamicPartitionSelectedThenExpectReconfigureWparidToStatic) {
    // A non-zero thread-group start disqualifies static partitioning even when
    // it is preferred, so the dynamic partitioning path is taken. In that case
    // the dynamically partitioned command buffer must end with an extra
    // LOAD_REGISTER_MEM that reloads WPARID from the work partition allocation,
    // so that later statically partitioned walkers see the per-tile id again.
    WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
    walker = FamilyType::cmdInitGpgpuWalker;
    walker.setThreadGroupIdStartingX(1u);

    checkForProperCmdBufferAddressOffset = false;
    bool preferredStaticPartitioning = true;
    bool staticPartitioning = false;
    auto partitionCount = computePartitionCountAndSetPartitionType<FamilyType>(&walker, 4u, preferredStaticPartitioning, false, &staticPartitioning);
    EXPECT_FALSE(staticPartitioning);
    EXPECT_EQ(1u, partitionCount);
    EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType());

    testArgs.partitionCount = partitionCount;
    testArgs.staticPartitioning = staticPartitioning;
    testArgs.preferredStaticPartitioning = preferredStaticPartitioning;
    testArgs.workPartitionAllocationGpuVa = 0x800BADA55000;

    // Expected size of the command section; the trailing LOAD_REGISTER_MEM is
    // the WPARID reconfiguration added because static partitioning is preferred.
    auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>) +
                                   sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) * 2 +
                                   sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>) +
                                   sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>) * 2 +
                                   sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) * 3 +
                                   sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
                                   sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>) +
                                   sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
                                   sizeof(WalkerPartition::LOAD_REGISTER_MEM<FamilyType>);
    EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset<FamilyType>(testArgs));

    auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) +
                                 sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);
    auto totalProgrammedSize = expectedCommandUsedSize + sizeof(BatchBufferControlData);

    testArgs.tileCount = 2;
    uint64_t gpuVirtualAddress = 0x8000123000;
    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(cmdBuffer,
                                                                              gpuVirtualAddress,
                                                                              &walker,
                                                                              totalBytesProgrammed,
                                                                              testArgs);
    EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed);

    // Walk the buffer command by command, accumulating parsedOffset.
    auto expectedMask = 0xFFFFu;
    auto expectedRegister = 0x21FCu;
    auto loadRegisterImmediate = genCmdCast<WalkerPartition::LOAD_REGISTER_IMM<FamilyType> *>(cmdBufferAddress);
    ASSERT_NE(nullptr, loadRegisterImmediate);
    EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset());
    EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword());
    auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>);

    auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, miAtomic);
    auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize;
    auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
    EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress);
    EXPECT_TRUE(miAtomic->getReturnDataControl());
    EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
    parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);

    auto loadRegisterReg = genCmdCast<WalkerPartition::LOAD_REGISTER_REG<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, loadRegisterReg);
    EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination());
    EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource());
    EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress());
    EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress());
    parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>);

    auto miSetPredicate = genCmdCast<WalkerPartition::MI_SET_PREDICATE<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, miSetPredicate);
    EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE, miSetPredicate->getPredicateEnableWparid());
    parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>);

    auto batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, batchBufferStart);
    EXPECT_TRUE(batchBufferStart->getPredicationEnable());
    // address routes to the WALKER section, which is placed just before the control section
    auto address = batchBufferStart->getBatchBufferStartAddress();
    EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands);
    parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);

    miSetPredicate = genCmdCast<WalkerPartition::MI_SET_PREDICATE<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, miSetPredicate);
    EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER, miSetPredicate->getPredicateEnableWparid());
    EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE, miSetPredicate->getPredicateEnable());
    parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>);

    auto pipeControl = genCmdCast<WalkerPartition::PIPE_CONTROL<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, pipeControl); // guard was missing: a failed cast would be dereferenced below
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
    parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>);

    miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, miAtomic);
    auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t);
    auto miAtomicTileProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
    EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress);
    EXPECT_FALSE(miAtomic->getReturnDataControl());
    EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
    parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);

    auto miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, miSemaphoreWait);
    EXPECT_EQ(miAtomicTileAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
    EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
    EXPECT_EQ(2u, miSemaphoreWait->getSemaphoreDataDword());
    parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);

    // WPARID reconfiguration back to the static work partition allocation.
    auto loadRegisterMem = genCmdCast<WalkerPartition::LOAD_REGISTER_MEM<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, loadRegisterMem);
    EXPECT_EQ(testArgs.workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress());
    EXPECT_EQ(wparidCCSOffset, loadRegisterMem->getRegisterAddress());
    parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM<FamilyType>);

    // final batch buffer start that routes at the end of the batch buffer
    auto batchBufferStartFinal = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, batchBufferStartFinal); // was EXPECT_NE: non-fatal check followed by a dereference
    EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + totalProgrammedSize);
    parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);

    auto computeWalker = genCmdCast<WalkerPartition::COMPUTE_WALKER<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, computeWalker);
    parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);

    batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
    ASSERT_NE(nullptr, batchBufferStart);
    EXPECT_FALSE(batchBufferStart->getPredicationEnable());
    EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress());
    parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
}

View File

@ -28,35 +28,35 @@ ClDeviceVector toClDeviceVector(ClDevice &clDevice);
////////////////////////////////////////////////////////////////////////////////
class MockProgram : public Program {
public:
using Program::createProgramFromBinary;
using Program::deviceBuildInfos;
using Program::internalOptionsToExtract;
using Program::kernelDebugEnabled;
using Program::linkBinary;
using Program::separateBlockKernels;
using Program::setBuildStatus;
using Program::updateNonUniformFlag;
using Program::allowNonUniform;
using Program::applyAdditionalOptions;
using Program::areSpecializationConstantsInitialized;
using Program::blockKernelManager;
using Program::buildInfos;
using Program::context;
using Program::createdFrom;
using Program::createProgramFromBinary;
using Program::debugData;
using Program::debugDataSize;
using Program::deviceBuildInfos;
using Program::extractInternalOptions;
using Program::getKernelInfo;
using Program::internalOptionsToExtract;
using Program::irBinary;
using Program::irBinarySize;
using Program::isSpirV;
using Program::kernelDebugEnabled;
using Program::linkBinary;
using Program::options;
using Program::packDeviceBinary;
using Program::Program;
using Program::separateBlockKernels;
using Program::setBuildStatus;
using Program::sourceCode;
using Program::specConstantsIds;
using Program::specConstantsSizes;
using Program::specConstantsValues;
using Program::updateNonUniformFlag;
MockProgram(const ClDeviceVector &deviceVector) : Program(nullptr, false, deviceVector) {
}

View File

@ -42,6 +42,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool emitSelfCleanup,
args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired();
args.emitBatchBufferEnd = false;
args.staticPartitioning = staticPartitioning;
args.preferredStaticPartitioning = preferStaticPartitioning;
return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
}
@ -76,6 +77,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
args.emitBatchBufferEnd = false;
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
args.staticPartitioning = staticPartitioning;
args.preferredStaticPartitioning = preferStaticPartitioning;
if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);

View File

@ -32,6 +32,7 @@ struct WalkerPartitionArgs {
bool useAtomicsForSelfCleanup = false;
bool initializeWparidRegister = false;
bool emitPipeControlStall = false;
bool preferredStaticPartitioning = false;
};
template <typename GfxFamily>
@ -457,6 +458,7 @@ uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
if (args.emitSelfCleanup) {
size += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
}
size += args.preferredStaticPartitioning ? sizeof(LOAD_REGISTER_MEM<GfxFamily>) : 0u;
return size;
}
@ -587,6 +589,10 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}
if (args.preferredStaticPartitioning) {
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
}
//this bb start goes to the end of partitioned command buffer
programMiBatchBufferStart<GfxFamily>(
currentBatchBufferPointer,