Pass active partitions from dispatched kernel to context
Related-To: NEO-6244

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
parent 85a52b7702
commit 0b64ecba3f
@@ -138,6 +138,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
             false,
             kernel.usesImages(),
             workPartitionAllocationGpuVa);
+        if (queueCsr.isStaticWorkPartitioningEnabled()) {
+            queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
+        }
         auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
         timestampPacket->setPacketsUsed(partitionCount);
     } else {
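Note on the hunk above: the queue CSR only ever ratchets its active-partition count upward, so a later single-partition dispatch cannot hide an earlier multi-partition one from the context. A minimal self-contained sketch of that rule, using a stand-in MockCsr rather than NEO's CommandStreamReceiver:

// Sketch only: illustrates the std::max accumulation, not the real CSR class.
#include <algorithm>
#include <cstdint>

struct MockCsr {
    uint32_t activePartitions = 1; // starts at a single partition
    uint32_t getActivePartitions() const { return activePartitions; }
    void setActivePartitions(uint32_t count) { activePartitions = count; }
};

int main() {
    MockCsr queueCsr;
    for (uint32_t partitionCount : {2u, 1u}) { // a 2-partition walker, then a 1-partition one
        queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
    }
    return queueCsr.getActivePartitions() == 2u ? 0 : 1; // the maximum sticks
}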
@@ -1459,7 +1459,7 @@ struct XeHPAndLaterDispatchWalkerBasicTestStaticPartition : public XeHPAndLaterD
     }
 };

-HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenNoMultipleActivePartitionsAreSetInCsr) {
+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenMultipleActivePartitionsAreSetInCsr) {
     if (!OSInterface::osEnableLocalMemory) {
         GTEST_SKIP();
     }
@@ -1472,7 +1472,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
     }
     EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
     cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
-    EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
+    EXPECT_EQ(2u, commandStreamReceiver.activePartitions);

     HardwareParse hwParser;
     hwParser.parseCommands<FamilyType>(*cmdQ);
@@ -1482,6 +1482,50 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
     EXPECT_EQ(8u, computeWalker->getPartitionSize());
 }

+HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
+            givenStaticPartitioningWhenEnqueueingNonUnifromKernelThenMultipleActivePartitionsAreSetInCsrAndWparidRegisterIsReconfiguredToStatic) {
+    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
+    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
+    if (!OSInterface::osEnableLocalMemory) {
+        GTEST_SKIP();
+    }
+    auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
+    size_t gws[] = {129, 1, 1};
+    size_t lws[] = {8, 1, 1};
+    auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver();
+    if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
+        commandStreamReceiver.createPreemptionAllocation();
+    }
+    EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
+    kernel->mockProgram->allowNonUniform = true;
+    cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
+    EXPECT_EQ(2u, commandStreamReceiver.activePartitions);
+
+    HardwareParse hwParser;
+    hwParser.parseCommands<FamilyType>(*cmdQ->commandStream);
+
+    auto firstComputeWalkerItor = find<COMPUTE_WALKER *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
+    ASSERT_NE(hwParser.cmdList.end(), firstComputeWalkerItor);
+    auto computeWalker = reinterpret_cast<COMPUTE_WALKER *>(*firstComputeWalkerItor);
+    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
+    EXPECT_EQ(8u, computeWalker->getPartitionSize());
+
+    auto nextCmdItor = firstComputeWalkerItor;
+    ++nextCmdItor;
+
+    auto secondComputeWalkerItor = find<COMPUTE_WALKER *>(nextCmdItor, hwParser.cmdList.end());
+    ASSERT_NE(hwParser.cmdList.end(), secondComputeWalkerItor);
+    computeWalker = reinterpret_cast<COMPUTE_WALKER *>(*secondComputeWalkerItor);
+    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType());
+
+    auto workPartitionAllocationGpuVa = commandStreamReceiver.getWorkPartitionAllocationGpuAddress();
+    auto expectedRegister = 0x221Cu;
+    auto loadRegisterMem = hwParser.getCommand<MI_LOAD_REGISTER_MEM>(firstComputeWalkerItor, secondComputeWalkerItor);
+    ASSERT_NE(nullptr, loadRegisterMem);
+    EXPECT_EQ(workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress());
+    EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress());
+}
+
 using NonDefaultPlatformGpuWalkerTest = XeHPAndLaterDispatchWalkerBasicTest;

 HWCMDTEST_F(IGFX_XE_HP_CORE, NonDefaultPlatformGpuWalkerTest, givenNonDefaultPlatformWhenSetupTimestampPacketThenGmmHelperIsTakenFromNonDefaultPlatform) {
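Why the new test expects two walkers and two active partitions: with gws = {129, 1, 1}, lws = {8, 1, 1} and allowNonUniform set, the enqueue splits into a uniform region of 16 work-groups (partitioned along X with partition size 8) plus a remainder walker dispatched with partitioning disabled, after which WPARID (register 0x221C) is reloaded from the work-partition allocation. A sketch of just the split arithmetic, in plain C++ with no NEO types:

#include <cstddef>

int main() {
    std::size_t gws = 129, lws = 8;
    std::size_t uniformItems = (gws / lws) * lws; // 128 items in 16 full work-groups
    std::size_t remainderItems = gws % lws;       // 1 leftover item gets its own walker
    return (uniformItems == 128 && remainderItems == 1) ? 0 : 1;
}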
@@ -1276,3 +1276,135 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe
     }
     EXPECT_EQ(parsedOffset, totalBytesProgrammed);
 }
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitionIsPreferredAndWalkerWithNonUniformStartWhenDynamicPartitionSelectedThenExpectReconfigureWparidToStatic) {
+    WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
+    walker = FamilyType::cmdInitGpgpuWalker;
+    walker.setThreadGroupIdStartingX(1u);
+
+    checkForProperCmdBufferAddressOffset = false;
+    bool preferredStaticPartitioning = true;
+    bool staticPartitioning = false;
+    auto partitionCount = computePartitionCountAndSetPartitionType<FamilyType>(&walker, 4u, preferredStaticPartitioning, false, &staticPartitioning);
+    EXPECT_FALSE(staticPartitioning);
+    EXPECT_EQ(1u, partitionCount);
+    EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType());
+
+    testArgs.partitionCount = partitionCount;
+    testArgs.staticPartitioning = staticPartitioning;
+    testArgs.preferredStaticPartitioning = preferredStaticPartitioning;
+    testArgs.workPartitionAllocationGpuVa = 0x800BADA55000;
+
+    auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>) +
+                                   sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) * 2 +
+                                   sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>) +
+                                   sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>) * 2 +
+                                   sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) * 3 +
+                                   sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
+                                   sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>) +
+                                   sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
+                                   sizeof(WalkerPartition::LOAD_REGISTER_MEM<FamilyType>);
+
+    EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset<FamilyType>(testArgs));
+
+    auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) +
+                                 sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);
+    auto totalProgrammedSize = expectedCommandUsedSize + sizeof(BatchBufferControlData);
+
+    testArgs.tileCount = 2;
+    uint64_t gpuVirtualAddress = 0x8000123000;
+    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(cmdBuffer,
+                                                                              gpuVirtualAddress,
+                                                                              &walker,
+                                                                              totalBytesProgrammed,
+                                                                              testArgs);
+
+    EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed);
+
+    auto expectedMask = 0xFFFFu;
+    auto expectedRegister = 0x21FCu;
+    auto loadRegisterImmediate = genCmdCast<WalkerPartition::LOAD_REGISTER_IMM<FamilyType> *>(cmdBufferAddress);
+    ASSERT_NE(nullptr, loadRegisterImmediate);
+    EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset());
+    EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword());
+    auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>);
+
+    auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, miAtomic);
+    auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize;
+    auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
+    EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress);
+    EXPECT_TRUE(miAtomic->getReturnDataControl());
+    EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
+    parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
+
+    auto loadRegisterReg = genCmdCast<WalkerPartition::LOAD_REGISTER_REG<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, loadRegisterReg);
+    EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination());
+    EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource());
+    EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress());
+    EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress());
+    parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>);
+
+    auto miSetPredicate = genCmdCast<WalkerPartition::MI_SET_PREDICATE<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, miSetPredicate);
+    EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE, miSetPredicate->getPredicateEnableWparid());
+    parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>);
+
+    auto batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, batchBufferStart);
+    EXPECT_TRUE(batchBufferStart->getPredicationEnable());
+    //address routes to WALKER section which is before control section
+    auto address = batchBufferStart->getBatchBufferStartAddress();
+    EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands);
+    parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
+
+    miSetPredicate = genCmdCast<WalkerPartition::MI_SET_PREDICATE<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, miSetPredicate);
+    EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER, miSetPredicate->getPredicateEnableWparid());
+    EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE, miSetPredicate->getPredicateEnable());
+    parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>);
+
+    auto pipeControl = genCmdCast<WalkerPartition::PIPE_CONTROL<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
+    EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
+    parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>);
+
+    miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, miAtomic);
+    auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t);
+    auto miAtomicTileProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
+    EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress);
+    EXPECT_FALSE(miAtomic->getReturnDataControl());
+    EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
+    parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
+
+    auto miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, miSemaphoreWait);
+    EXPECT_EQ(miAtomicTileAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
+    EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
+    EXPECT_EQ(2u, miSemaphoreWait->getSemaphoreDataDword());
+    parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
+
+    auto loadRegisterMem = genCmdCast<WalkerPartition::LOAD_REGISTER_MEM<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, loadRegisterMem);
+    EXPECT_EQ(testArgs.workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress());
+    EXPECT_EQ(wparidCCSOffset, loadRegisterMem->getRegisterAddress());
+    parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM<FamilyType>);
+
+    //final batch buffer start that routes at the end of the batch buffer
+    auto batchBufferStartFinal = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    EXPECT_NE(nullptr, batchBufferStartFinal);
+    EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + totalProgrammedSize);
+    parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
+
+    auto computeWalker = genCmdCast<WalkerPartition::COMPUTE_WALKER<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, computeWalker);
+    parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);
+
+    batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, batchBufferStart);
+    EXPECT_FALSE(batchBufferStart->getPredicationEnable());
+    EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress());
+    parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
+}
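The test above validates the buffer with a cursor walk: cast the bytes at the current offset to the expected command, check its fields, advance by sizeof, repeat in emission order. A stripped-down sketch of that pattern, with a fake command struct standing in for the FamilyType definitions:

#include <cstddef>
#include <cstdint>
#include <cstring>

struct FakeLoadRegisterImm { // stand-in; real command structs come from FamilyType
    uint32_t registerOffset;
    uint32_t dataDword;
};

template <typename Cmd>
Cmd *cmdCastAt(void *buffer, std::size_t offset) {
    // mirrors the genCmdCast + ptrOffset pairing used throughout the test
    return reinterpret_cast<Cmd *>(static_cast<uint8_t *>(buffer) + offset);
}

int main() {
    uint8_t cmdBuffer[64] = {};
    FakeLoadRegisterImm lri{0x21FCu, 0xFFFFu}; // mirrors the predicate-mask LRI checked first
    std::memcpy(cmdBuffer, &lri, sizeof(lri));

    std::size_t parsedOffset = 0;
    auto parsed = cmdCastAt<FakeLoadRegisterImm>(cmdBuffer, parsedOffset);
    parsedOffset += sizeof(FakeLoadRegisterImm); // cursor advances past each validated command
    return (parsed->registerOffset == 0x21FCu && parsedOffset == sizeof(FakeLoadRegisterImm)) ? 0 : 1;
}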
@@ -28,35 +28,35 @@ ClDeviceVector toClDeviceVector(ClDevice &clDevice);
 ////////////////////////////////////////////////////////////////////////////////
 class MockProgram : public Program {
   public:
-    using Program::createProgramFromBinary;
-    using Program::deviceBuildInfos;
-    using Program::internalOptionsToExtract;
-    using Program::kernelDebugEnabled;
-    using Program::linkBinary;
-    using Program::separateBlockKernels;
-    using Program::setBuildStatus;
-    using Program::updateNonUniformFlag;
-
+    using Program::allowNonUniform;
     using Program::applyAdditionalOptions;
     using Program::areSpecializationConstantsInitialized;
     using Program::blockKernelManager;
     using Program::buildInfos;
     using Program::context;
     using Program::createdFrom;
+    using Program::createProgramFromBinary;
     using Program::debugData;
     using Program::debugDataSize;
+    using Program::deviceBuildInfos;
     using Program::extractInternalOptions;
     using Program::getKernelInfo;
+    using Program::internalOptionsToExtract;
     using Program::irBinary;
     using Program::irBinarySize;
     using Program::isSpirV;
+    using Program::kernelDebugEnabled;
+    using Program::linkBinary;
     using Program::options;
     using Program::packDeviceBinary;
     using Program::Program;
+    using Program::separateBlockKernels;
+    using Program::setBuildStatus;
     using Program::sourceCode;
     using Program::specConstantsIds;
     using Program::specConstantsSizes;
     using Program::specConstantsValues;
+    using Program::updateNonUniformFlag;

     MockProgram(const ClDeviceVector &deviceVector) : Program(nullptr, false, deviceVector) {
     }
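The MockProgram change is mechanical: the using-declarations that re-export protected Program members are alphabetized, and allowNonUniform is newly exposed so the tests above can toggle it directly. A minimal illustration of this mock idiom, where Base is hypothetical rather than the real Program class:

// A mock widens access to protected members with using-declarations.
class Base {
  protected:
    bool allowNonUniform = false;
    void setFlag(bool value) { allowNonUniform = value; }
};

class MockBase : public Base {
  public:
    using Base::allowNonUniform; // member variable becomes publicly writable
    using Base::setFlag;         // protected method becomes publicly callable
};

int main() {
    MockBase mock;
    mock.allowNonUniform = true; // legal only through the mock
    mock.setFlag(false);
    return mock.allowNonUniform ? 1 : 0;
}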
@@ -42,6 +42,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool emitSelfCleanup,
     args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired();
     args.emitBatchBufferEnd = false;
     args.staticPartitioning = staticPartitioning;
+    args.preferredStaticPartitioning = preferStaticPartitioning;

     return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
 }

@@ -76,6 +77,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
     args.emitBatchBufferEnd = false;
     args.secondaryBatchBuffer = useSecondaryBatchBuffer;
     args.staticPartitioning = staticPartitioning;
+    args.preferredStaticPartitioning = preferStaticPartitioning;

     if (staticPartitioning) {
         UNRECOVERABLE_IF(tileCount != partitionCount);
@@ -32,6 +32,7 @@ struct WalkerPartitionArgs {
     bool useAtomicsForSelfCleanup = false;
     bool initializeWparidRegister = false;
     bool emitPipeControlStall = false;
+    bool preferredStaticPartitioning = false;
 };

 template <typename GfxFamily>
@@ -457,6 +458,7 @@ uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
     if (args.emitSelfCleanup) {
         size += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
     }
+    size += args.preferredStaticPartitioning ? sizeof(LOAD_REGISTER_MEM<GfxFamily>) : 0u;
     return size;
 }
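The size estimate and the emission path have to stay in lock-step: every byte constructDynamicallyPartitionedCommandBuffer can emit must already be counted by computeControlSectionOffset, which is why the conditional LOAD_REGISTER_MEM appears in both hunks. A toy illustration of that pairing, with hypothetical sizes rather than the GfxFamily commands:

#include <cassert>
#include <cstddef>

constexpr std::size_t loadRegisterMemSize = 16; // stand-in for sizeof(LOAD_REGISTER_MEM<GfxFamily>)

std::size_t estimateSize(bool preferredStaticPartitioning, std::size_t baseSize) {
    return baseSize + (preferredStaticPartitioning ? loadRegisterMemSize : 0u);
}

std::size_t emitCommands(bool preferredStaticPartitioning, std::size_t baseSize) {
    std::size_t bytesProgrammed = baseSize; // pretend the unconditional commands were emitted
    if (preferredStaticPartitioning) {
        bytesProgrammed += loadRegisterMemSize; // the WPARID reload command
    }
    return bytesProgrammed;
}

int main() {
    for (bool preferred : {false, true}) {
        // if these ever diverge, the command buffer over- or under-allocates
        assert(estimateSize(preferred, 128) == emitCommands(preferred, 128));
    }
    return 0;
}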
@@ -587,6 +589,10 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
         programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
     }

+    if (args.preferredStaticPartitioning) {
+        programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
+    }
+
     //this bb start goes to the end of partitioned command buffer
     programMiBatchBufferStart<GfxFamily>(
         currentBatchBufferPointer,
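This is the emission side of the reload the tests check for: when static partitioning is preferred but a dynamically partitioned section was constructed, an MI_LOAD_REGISTER_MEM restores WPARID (offset 0x221C in these tests) from workPartitionAllocationGpuVa. A minimal sketch of what such a command carries, with illustrative structs rather than the NEO definitions:

#include <cstdint>

constexpr uint32_t wparidCCSOffset = 0x221C; // register offset the tests assert on

struct MiLoadRegisterMemSketch { // illustrative, not a GfxFamily command
    uint32_t registerAddress;
    uint64_t memoryAddress;
};

MiLoadRegisterMemSketch programWparidReload(uint64_t workPartitionAllocationGpuVa) {
    // register <- *memoryAddress: the tile reads its partition id back from
    // the work-partition allocation, undoing the dynamic WPARID assignment
    return {wparidCCSOffset, workPartitionAllocationGpuVa};
}

int main() {
    auto cmd = programWparidReload(0x800BADA55000ull);
    return cmd.registerAddress == wparidCCSOffset ? 0 : 1;
}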