// Patch summary (unified diff over OpenCL command-queue code, shared walker-partition code and unit tests):
//  - HardwareInterface::programWalker: when queueCsr.isStaticWorkPartitioningEnabled(), the CSR's active
//    partition count is raised to max(current value, partitionCount).
//  - WalkerPartitionArgs gains preferredStaticPartitioning (default false); ImplicitScalingDispatch::getSize and
//    ImplicitScalingDispatch::dispatchCommands copy preferStaticPartitioning into it.
//  - computeControlSectionOffset adds sizeof(LOAD_REGISTER_MEM) when that flag is set, and
//    constructDynamicallyPartitionedCommandBuffer then calls programMiLoadRegisterMem with
//    args.workPartitionAllocationGpuVa and wparidCCSOffset right before the final batch buffer start.
//  - MockProgram additionally exposes Program::allowNonUniform; its using-declarations are regrouped into one list.
// Review changes applied to the submitted patch (test code only):
//  - batchBufferStartFinal is now checked with ASSERT_NE, because it is dereferenced on the next statement.
//  - pipeControl gets an ASSERT_NE(nullptr, ...) before its first dereference; the hunk's added-line count
//    is bumped from 135 to 136 to account for the extra line.
//  - Test name typo "NonUnifrom" corrected to "NonUniform".
// NOTE(review): line breaks are collapsed and template argument lists look stripped in this copy
// (e.g. "std::make_unique>(", "genCmdCast *>("); they are left untouched here - confirm against the upstream patch.
diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index 9a2f5ceb2a..57bbf419c9 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -138,6 +138,9 @@ inline void HardwareInterface::programWalker( false, kernel.usesImages(), workPartitionAllocationGpuVa); + if (queueCsr.isStaticWorkPartitioningEnabled()) { + queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount)); + } auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex); timestampPacket->setPacketsUsed(partitionCount); } else { diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index d5840c35ad..6cf8516927 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -1459,7 +1459,7 @@ struct XeHPAndLaterDispatchWalkerBasicTestStaticPartition : public XeHPAndLaterD } }; -HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenNoMultipleActivePartitionsAreSetInCsr) { +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenMultipleActivePartitionsAreSetInCsr) { if (!OSInterface::osEnableLocalMemory) { GTEST_SKIP(); } @@ -1472,7 +1472,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, } EXPECT_EQ(1u, commandStreamReceiver.activePartitions); cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); - EXPECT_EQ(1u, commandStreamReceiver.activePartitions); + EXPECT_EQ(2u, commandStreamReceiver.activePartitions); HardwareParse 
hwParser; hwParser.parseCommands(*cmdQ); @@ -1482,6 +1482,50 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, EXPECT_EQ(8u, computeWalker->getPartitionSize()); } +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, + givenStaticPartitioningWhenEnqueueingNonUniformKernelThenMultipleActivePartitionsAreSetInCsrAndWparidRegisterIsReconfiguredToStatic) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + size_t gws[] = {129, 1, 1}; + size_t lws[] = {8, 1, 1}; + auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver(); + if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + EXPECT_EQ(1u, commandStreamReceiver.activePartitions); + kernel->mockProgram->allowNonUniform = true; + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + EXPECT_EQ(2u, commandStreamReceiver.activePartitions); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ->commandStream); + + auto firstComputeWalkerItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), firstComputeWalkerItor); + auto computeWalker = reinterpret_cast(*firstComputeWalkerItor); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType()); + EXPECT_EQ(8u, computeWalker->getPartitionSize()); + + auto nextCmdItor = firstComputeWalkerItor; + ++nextCmdItor; + + auto secondComputeWalkerItor = find(nextCmdItor, hwParser.cmdList.end()); + ASSERT_NE(hwParser.cmdList.end(), secondComputeWalkerItor); + computeWalker = reinterpret_cast(*secondComputeWalkerItor); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, 
computeWalker->getPartitionType()); + + auto workPartitionAllocationGpuVa = commandStreamReceiver.getWorkPartitionAllocationGpuAddress(); + auto expectedRegister = 0x221Cu; + auto loadRegisterMem = hwParser.getCommand(firstComputeWalkerItor, secondComputeWalkerItor); + ASSERT_NE(nullptr, loadRegisterMem); + EXPECT_EQ(workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); +} + using NonDefaultPlatformGpuWalkerTest = XeHPAndLaterDispatchWalkerBasicTest; HWCMDTEST_F(IGFX_XE_HP_CORE, NonDefaultPlatformGpuWalkerTest, givenNonDefaultPlatformWhenSetupTimestampPacketThenGmmHelperIsTakenFromNonDefaultPlatform) { diff --git a/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp b/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp index 5ff6aa1238..fb13d97366 100644 --- a/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp +++ b/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp @@ -1276,3 +1276,136 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe } EXPECT_EQ(parsedOffset, totalBytesProgrammed); } + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitionIsPreferredAndWalkerWithNonUniformStartWhenDynamicPartitionSelectedThenExpectReconfigureWparidToStatic) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdStartingX(1u); + + checkForProperCmdBufferAddressOffset = false; + bool preferredStaticPartitioning = true; + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, preferredStaticPartitioning, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(1u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType()); + + 
testArgs.partitionCount = partitionCount; + testArgs.staticPartitioning = staticPartitioning; + testArgs.preferredStaticPartitioning = preferredStaticPartitioning; + testArgs.workPartitionAllocationGpuVa = 0x800BADA55000; + + auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::LOAD_REGISTER_MEM); + + EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset(testArgs)); + + auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START) + + sizeof(WalkerPartition::COMPUTE_WALKER); + auto totalProgrammedSize = expectedCommandUsedSize + sizeof(BatchBufferControlData); + + testArgs.tileCount = 2; + uint64_t gpuVirtualAddress = 0x8000123000; + WalkerPartition::constructDynamicallyPartitionedCommandBuffer(cmdBuffer, + gpuVirtualAddress, + &walker, + totalBytesProgrammed, + testArgs); + + EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed); + + auto expectedMask = 0xFFFFu; + auto expectedRegister = 0x21FCu; + auto loadRegisterImmediate = genCmdCast *>(cmdBufferAddress); + ASSERT_NE(nullptr, loadRegisterImmediate); + EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset()); + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM); + + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize; + auto miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress); + EXPECT_TRUE(miAtomic->getReturnDataControl()); + 
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto loadRegisterReg = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterReg); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination()); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource()); + EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress()); + EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress()); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG); + + auto miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE, miSetPredicate->getPredicateEnableWparid()); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_TRUE(batchBufferStart->getPredicationEnable()); + //address routes to WALKER section which is before control section + auto address = batchBufferStart->getBatchBufferStartAddress(); + EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER, miSetPredicate->getPredicateEnableWparid()); + EXPECT_EQ(MI_SET_PREDICATE::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE, miSetPredicate->getPredicateEnable()); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + 
EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t); + auto miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miAtomicTileAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(2u, miSemaphoreWait->getSemaphoreDataDword()); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + EXPECT_EQ(testArgs.workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress()); + EXPECT_EQ(wparidCCSOffset, loadRegisterMem->getRegisterAddress()); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + + //final batch buffer start that routes at the end of the batch buffer + auto batchBufferStartFinal = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStartFinal); + EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + totalProgrammedSize); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += 
sizeof(WalkerPartition::COMPUTE_WALKER); + + batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress()); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); +} diff --git a/opencl/test/unit_test/mocks/mock_program.h b/opencl/test/unit_test/mocks/mock_program.h index 57deef4529..1f2c0aaa5a 100644 --- a/opencl/test/unit_test/mocks/mock_program.h +++ b/opencl/test/unit_test/mocks/mock_program.h @@ -28,35 +28,35 @@ ClDeviceVector toClDeviceVector(ClDevice &clDevice); //////////////////////////////////////////////////////////////////////////////// class MockProgram : public Program { public: - using Program::createProgramFromBinary; - using Program::deviceBuildInfos; - using Program::internalOptionsToExtract; - using Program::kernelDebugEnabled; - using Program::linkBinary; - using Program::separateBlockKernels; - using Program::setBuildStatus; - using Program::updateNonUniformFlag; - + using Program::allowNonUniform; using Program::applyAdditionalOptions; using Program::areSpecializationConstantsInitialized; using Program::blockKernelManager; using Program::buildInfos; using Program::context; using Program::createdFrom; + using Program::createProgramFromBinary; using Program::debugData; using Program::debugDataSize; + using Program::deviceBuildInfos; using Program::extractInternalOptions; using Program::getKernelInfo; + using Program::internalOptionsToExtract; using Program::irBinary; using Program::irBinarySize; using Program::isSpirV; + using Program::kernelDebugEnabled; + using Program::linkBinary; using Program::options; using Program::packDeviceBinary; using Program::Program; + using Program::separateBlockKernels; + using Program::setBuildStatus; using Program::sourceCode; using Program::specConstantsIds; using Program::specConstantsSizes; using 
Program::specConstantsValues; + using Program::updateNonUniformFlag; MockProgram(const ClDeviceVector &deviceVector) : Program(nullptr, false, deviceVector) { } diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index 6941c74ecc..79229b648c 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -42,6 +42,7 @@ size_t ImplicitScalingDispatch::getSize(bool emitSelfCleanup, args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired(); args.emitBatchBufferEnd = false; args.staticPartitioning = staticPartitioning; + args.preferredStaticPartitioning = preferStaticPartitioning; return static_cast(WalkerPartition::estimateSpaceRequiredInCommandBuffer(args)); } @@ -76,6 +77,7 @@ void ImplicitScalingDispatch::dispatchCommands(LinearStream &commandS args.emitBatchBufferEnd = false; args.secondaryBatchBuffer = useSecondaryBatchBuffer; args.staticPartitioning = staticPartitioning; + args.preferredStaticPartitioning = preferStaticPartitioning; if (staticPartitioning) { UNRECOVERABLE_IF(tileCount != partitionCount); diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h index 62ee920218..e89d8fd938 100644 --- a/shared/source/command_container/walker_partition_xehp_and_later.h +++ b/shared/source/command_container/walker_partition_xehp_and_later.h @@ -32,6 +32,7 @@ struct WalkerPartitionArgs { bool useAtomicsForSelfCleanup = false; bool initializeWparidRegister = false; bool emitPipeControlStall = false; + bool preferredStaticPartitioning = false; }; template @@ -457,6 +458,7 @@ uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) { if (args.emitSelfCleanup) { size += computeSelfCleanupSectionSize(args.useAtomicsForSelfCleanup); } + size += 
args.preferredStaticPartitioning ? sizeof(LOAD_REGISTER_MEM) : 0u; return size; } @@ -587,6 +589,10 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer, programTilesSynchronizationWithAtomics(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount); } + if (args.preferredStaticPartitioning) { + programMiLoadRegisterMem(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset); + } + //this bb start goes to the end of partitioned command buffer programMiBatchBufferStart( currentBatchBufferPointer,