diff --git a/level_zero/core/source/helpers/implicit_scaling_l0.cpp b/level_zero/core/source/helpers/implicit_scaling_l0.cpp index 5bfb2e15b8..a31a0deed3 100644 --- a/level_zero/core/source/helpers/implicit_scaling_l0.cpp +++ b/level_zero/core/source/helpers/implicit_scaling_l0.cpp @@ -10,7 +10,5 @@ namespace NEO { namespace ImplicitScaling { bool apiSupport = false; -bool semaphoreProgrammingRequired = false; -bool crossTileAtomicSynchronization = true; } // namespace ImplicitScaling } // namespace NEO diff --git a/opencl/source/helpers/implicit_scaling_ocl.cpp b/opencl/source/helpers/implicit_scaling_ocl.cpp index c2571e8302..41e6950ada 100644 --- a/opencl/source/helpers/implicit_scaling_ocl.cpp +++ b/opencl/source/helpers/implicit_scaling_ocl.cpp @@ -10,7 +10,5 @@ namespace NEO { namespace ImplicitScaling { bool apiSupport = true; -bool semaphoreProgrammingRequired = false; -bool crossTileAtomicSynchronization = true; } // namespace ImplicitScaling } // namespace NEO diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index d5f0ca5085..ca6440e8fe 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_container/walker_partition_xehp_and_later.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/gmm_helper/gmm_helper.h" @@ -1066,8 +1067,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation); } -HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPartitionIsOnThenSizeIsProperlyEstimated) { +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, 
GivenPipeControlIsRequiredWhenWalkerPartitionIsOnThenSizeIsProperlyEstimated) { DebugManager.flags.EnableWalkerPartition.set(1u); + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), true); UltClDeviceFactory deviceFactory{1, 2}; MockClDevice *device = deviceFactory.rootDevices[0]; MockContext context{device}; @@ -1122,6 +1124,63 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPart EXPECT_EQ(returnedSize, partitionSize + baseSize); } +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, GivenPipeControlIsNotRequiredWhenWalkerPartitionIsOnThenSizeIsProperlyEstimated) { + DebugManager.flags.EnableWalkerPartition.set(1u); + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), false); + UltClDeviceFactory deviceFactory{1, 2}; + MockClDevice *device = deviceFactory.rootDevices[0]; + MockContext context{device}; + + auto cmdQ = std::make_unique>(&context, device, nullptr); + auto &csr = cmdQ->getUltCommandStreamReceiver(); + + size_t numPipeControls = MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo()) ? 
2 : 1; + + auto baseSize = sizeof(typename FamilyType::COMPUTE_WALKER) + + (sizeof(typename FamilyType::PIPE_CONTROL) * numPipeControls) + + HardwareCommandsHelper::getSizeRequiredCS() + + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize); + + DispatchInfo dispatchInfo{}; + dispatchInfo.setNumberOfWorkgroups({32, 1, 1}); + + WalkerPartition::WalkerPartitionArgs testArgs = {}; + testArgs.initializeWparidRegister = true; + testArgs.crossTileAtomicSynchronization = false; + testArgs.emitPipeControlStall = false; + testArgs.partitionCount = 2u; + testArgs.tileCount = static_cast(device->getDeviceBitfield().count()); + + DebugManager.flags.SynchronizeWalkerInWparidMode.set(0); + testArgs.staticPartitioning = false; + testArgs.synchronizeBeforeExecution = false; + csr.staticWorkPartitioningEnabled = false; + auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(testArgs); + auto returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); + + testArgs.staticPartitioning = true; + csr.staticWorkPartitioningEnabled = true; + partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(testArgs); + returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); + + DebugManager.flags.SynchronizeWalkerInWparidMode.set(1); + testArgs.synchronizeBeforeExecution = true; + testArgs.staticPartitioning = false; + csr.staticWorkPartitioningEnabled = false; + partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(testArgs); + returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); + + 
testArgs.synchronizeBeforeExecution = true; + testArgs.staticPartitioning = true; + csr.staticWorkPartitioningEnabled = true; + partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(testArgs); + returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); +} + HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPartitionIsDisabledThenSizeIsProperlyEstimated) { DebugManager.flags.EnableWalkerPartition.set(0u); auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); @@ -1157,8 +1216,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenPipeContro EXPECT_EQ(returnedSize, baseSize); } -HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenQueueIsMultiEngineCapableThenWalkerPartitionsAreEstimated) { +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, GivenPipeControlIsRequiredWhenQueueIsMultiEngineCapableThenWalkerPartitionsAreEstimated) { DebugManager.flags.EnableWalkerPartition.set(1u); + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), true); auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); @@ -1185,6 +1245,35 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenQueueIsMul EXPECT_EQ(returnedSize, partitionSize + baseSize); } +HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, GivenPipeControlIsNotRequiredWhenQueueIsMultiEngineCapableThenWalkerPartitionsAreEstimated) { + DebugManager.flags.EnableWalkerPartition.set(1u); + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), false); + + auto cmdQ = std::make_unique>(context.get(), device.get(), nullptr); + + size_t numPipeControls = MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo()) ? 
2 : 1; + + auto baseSize = sizeof(typename FamilyType::COMPUTE_WALKER) + + (sizeof(typename FamilyType::PIPE_CONTROL) * numPipeControls) + + HardwareCommandsHelper::getSizeRequiredCS() + + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize); + + WalkerPartition::WalkerPartitionArgs testArgs = {}; + testArgs.initializeWparidRegister = true; + testArgs.emitPipeControlStall = false; + testArgs.crossTileAtomicSynchronization = false; + testArgs.partitionCount = 16u; + testArgs.tileCount = static_cast(device->getDeviceBitfield().count()); + + auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer(testArgs); + + DispatchInfo dispatchInfo{}; + dispatchInfo.setNumberOfWorkgroups({32, 1, 1}); + + auto returnedSize = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo); + EXPECT_EQ(returnedSize, partitionSize + baseSize); +} + HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWalkerIsCalledThenWalkerPartitionLogicIsExecuted) { if (!OSInterface::osEnableLocalMemory) { GTEST_SKIP(); diff --git a/opencl/test/unit_test/command_queue/enqueue_with_walker_partition_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_with_walker_partition_tests.cpp index d21dd64a8e..53217c7be2 100644 --- a/opencl/test/unit_test/command_queue/enqueue_with_walker_partition_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_with_walker_partition_tests.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/implicit_scaling.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" @@ -39,9 +40,12 @@ struct EnqueueWithWalkerPartitionTests : public ::testing::Test { std::unique_ptr context; }; -HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueWithWalkerPartitionTests, 
givenCsrWithSpecificNumberOfTilesWhenDispatchingThenConstructCmdBufferForAllSupportedTiles) { +HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueWithWalkerPartitionTests, + givenCsrWithSpecificNumberOfTilesAndPipeControlWithStallRequiredWhenDispatchingThenConstructCmdBufferForAllSupportedTiles) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), true); + MockCommandQueueHw commandQueue(context.get(), rootDevice.get(), nullptr); commandQueue.gpgpuEngine = &engineControlForFusedQueue; rootDevice->setPreemptionMode(PreemptionMode::Disabled); diff --git a/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp b/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp index fb13d97366..cd9d2d42e5 100644 --- a/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp +++ b/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp @@ -339,7 +339,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionAnd &walker, totalBytesProgrammed, testArgs); - EXPECT_EQ(controlSectionOffset + sizeof(StaticPartitioningControlSection), totalBytesProgrammed); + EXPECT_EQ(controlSectionOffset, totalBytesProgrammed); auto parsedOffset = 0u; { @@ -399,20 +399,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionAnd EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); EXPECT_EQ(1u, miSemaphoreWait->getSemaphoreDataDword()); } - { - auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); - ASSERT_NE(nullptr, batchBufferStart); - parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); - EXPECT_FALSE(batchBufferStart->getPredicationEnable()); - const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + 
sizeof(StaticPartitioningControlSection); - EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); - } - { - auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); - parsedOffset += sizeof(StaticPartitioningControlSection); - StaticPartitioningControlSection expectedControlSection = {}; - EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); - } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); } @@ -1162,6 +1149,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe testArgs.emitSelfCleanup = false; testArgs.crossTileAtomicSynchronization = false; testArgs.useAtomicsForSelfCleanup = false; + testArgs.emitPipeControlStall = false; testArgs.staticPartitioning = true; checkForProperCmdBufferAddressOffset = false; @@ -1170,9 +1158,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe testArgs.workPartitionAllocationGpuVa = 0x8000444000; auto walker = createWalker(postSyncAddress); - uint64_t expectedControlSectionOffset = sizeof(WalkerPartition::COMPUTE_WALKER) + - sizeof(WalkerPartition::PIPE_CONTROL) + - sizeof(WalkerPartition::BATCH_BUFFER_START); + uint64_t expectedControlSectionOffset = sizeof(WalkerPartition::COMPUTE_WALKER); uint32_t totalBytesProgrammed{}; const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(testArgs); @@ -1191,27 +1177,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe ASSERT_NE(nullptr, computeWalker); parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); } - { - auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); - ASSERT_NE(nullptr, pipeControl); - parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); - EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); - EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); - } - { - auto batchBufferStart = 
genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); - ASSERT_NE(nullptr, batchBufferStart); - parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); - EXPECT_FALSE(batchBufferStart->getPredicationEnable()); - const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); - EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); - } - { - auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); - parsedOffset += sizeof(StaticPartitioningControlSection); - StaticPartitioningControlSection expectedControlSection = {}; - EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); - } EXPECT_EQ(parsedOffset, totalBytesProgrammed); } @@ -1231,8 +1196,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe auto walker = createWalker(postSyncAddress); uint64_t expectedControlSectionOffset = sizeof(WalkerPartition::LOAD_REGISTER_MEM) + - sizeof(WalkerPartition::COMPUTE_WALKER) + - sizeof(WalkerPartition::BATCH_BUFFER_START); + sizeof(WalkerPartition::COMPUTE_WALKER); uint32_t totalBytesProgrammed{}; const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(testArgs); @@ -1260,20 +1224,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe ASSERT_NE(nullptr, computeWalker); parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); } - { - auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); - ASSERT_NE(nullptr, batchBufferStart); - parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); - EXPECT_FALSE(batchBufferStart->getPredicationEnable()); - const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); - EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); - } - { - auto controlSection = 
reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); - parsedOffset += sizeof(StaticPartitioningControlSection); - StaticPartitioningControlSection expectedControlSection = {}; - EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); - } EXPECT_EQ(parsedOffset, totalBytesProgrammed); } diff --git a/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp b/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp index 6a6f609188..e18ef56c89 100644 --- a/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp +++ b/opencl/test/unit_test/command_stream/implicit_scaling_ocl_tests.cpp @@ -14,11 +14,3 @@ using namespace NEO; TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSupportEnabled) { EXPECT_TRUE(ImplicitScaling::apiSupport); } - -TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSemaphoreProgrammingRequiredIsFalse) { - EXPECT_FALSE(ImplicitScaling::semaphoreProgrammingRequired); -} - -TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenCrossTileAtomicSynchronization) { - EXPECT_TRUE(ImplicitScaling::crossTileAtomicSynchronization); -} diff --git a/shared/source/command_container/implicit_scaling.cpp b/shared/source/command_container/implicit_scaling.cpp index 7c48982658..d6234d8441 100644 --- a/shared/source/command_container/implicit_scaling.cpp +++ b/shared/source/command_container/implicit_scaling.cpp @@ -7,6 +7,7 @@ #include "shared/source/command_container/implicit_scaling.h" +#include "shared/source/command_container/walker_partition_interface.h" #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/os_interface/os_interface.h" @@ -36,17 +37,17 @@ bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() { } bool ImplicitScalingHelper::isSemaphoreProgrammingRequired() { - auto semaphoreProgrammingRequired = ImplicitScaling::semaphoreProgrammingRequired; - int overrideSemaphoreProgrammingRequired = 
NEO::DebugManager.flags.SynchronizeWithSemaphores.get(); + auto semaphoreProgrammingRequired = false; + int overrideSemaphoreProgrammingRequired = DebugManager.flags.SynchronizeWithSemaphores.get(); if (overrideSemaphoreProgrammingRequired != -1) { semaphoreProgrammingRequired = !!overrideSemaphoreProgrammingRequired; } return semaphoreProgrammingRequired; } -bool ImplicitScalingHelper::isCrossTileAtomicRequired() { - auto crossTileAtomicSynchronization = ImplicitScaling::crossTileAtomicSynchronization; - int overrideCrossTileAtomicSynchronization = NEO::DebugManager.flags.UseCrossAtomicSynchronization.get(); +bool ImplicitScalingHelper::isCrossTileAtomicRequired(bool defaultCrossTileRequirement) { + auto crossTileAtomicSynchronization = defaultCrossTileRequirement; + int overrideCrossTileAtomicSynchronization = DebugManager.flags.UseCrossAtomicSynchronization.get(); if (overrideCrossTileAtomicSynchronization != -1) { crossTileAtomicSynchronization = !!overrideCrossTileAtomicSynchronization; } @@ -62,7 +63,12 @@ bool ImplicitScalingHelper::isAtomicsUsedForSelfCleanup() { return useAtomics; } -bool ImplicitScalingHelper::isSelfCleanupRequired(bool defaultSelfCleanup) { +bool ImplicitScalingHelper::isSelfCleanupRequired(const WalkerPartition::WalkerPartitionArgs &args, bool apiSelfCleanup) { + bool defaultSelfCleanup = apiSelfCleanup && + (args.crossTileAtomicSynchronization || + args.synchronizeBeforeExecution || + !args.staticPartitioning); + int overrideProgramSelfCleanup = DebugManager.flags.ProgramWalkerPartitionSelfCleanup.get(); if (overrideProgramSelfCleanup != -1) { defaultSelfCleanup = !!(overrideProgramSelfCleanup); @@ -79,13 +85,12 @@ bool ImplicitScalingHelper::isWparidRegisterInitializationRequired() { return initWparidRegister; } -bool ImplicitScalingHelper::isPipeControlStallRequired() { - bool emitPipeControl = true; +bool ImplicitScalingHelper::isPipeControlStallRequired(bool defaultEmitPipeControl) { int overrideUsePipeControl = 
DebugManager.flags.UsePipeControlAfterPartitionedWalker.get(); if (overrideUsePipeControl != -1) { - emitPipeControl = !!(overrideUsePipeControl); + defaultEmitPipeControl = !!(overrideUsePipeControl); } - return emitPipeControl; + return defaultEmitPipeControl; } } // namespace NEO diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h index c5b6fe43e8..26d8ac9cb1 100644 --- a/shared/source/command_container/implicit_scaling.h +++ b/shared/source/command_container/implicit_scaling.h @@ -10,43 +10,51 @@ #include "shared/source/helpers/common_types.h" #include "shared/source/helpers/vec.h" +namespace WalkerPartition { +struct WalkerPartitionArgs; +} + namespace NEO { class LinearStream; namespace ImplicitScaling { extern bool apiSupport; -extern bool semaphoreProgrammingRequired; -extern bool crossTileAtomicSynchronization; } // namespace ImplicitScaling struct ImplicitScalingHelper { static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition); static bool isSemaphoreProgrammingRequired(); - static bool isCrossTileAtomicRequired(); + static bool isCrossTileAtomicRequired(bool defaultCrossTileRequirement); static bool isSynchronizeBeforeExecutionRequired(); static bool isAtomicsUsedForSelfCleanup(); - static bool isSelfCleanupRequired(bool defaultSelfCleanup); + static bool isSelfCleanupRequired(const WalkerPartition::WalkerPartitionArgs &args, bool apiSelfCleanup); static bool isWparidRegisterInitializationRequired(); - static bool isPipeControlStallRequired(); + static bool isPipeControlStallRequired(bool defaultEmitPipeControl); }; template struct ImplicitScalingDispatch { using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; - static size_t getSize(bool emitSelfCleanup, + static size_t getSize(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3 &groupStart, const Vec3 &groupCount); + static void dispatchCommands(LinearStream 
&commandStream, WALKER_TYPE &walkerCmd, const DeviceBitfield &devices, uint32_t &partitionCount, bool useSecondaryBatchBuffer, - bool emitSelfCleanup, + bool apiSelfCleanup, bool usesImages, uint64_t workPartitionAllocationGpuVa); + + static bool &getPipeControlStallRequired(); + + private: + static bool pipeControlStallRequired; }; template diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index 79229b648c..7239cdbf30 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -12,7 +12,39 @@ namespace NEO { template -size_t ImplicitScalingDispatch::getSize(bool emitSelfCleanup, +WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPartitionAllocationGpuVa, + uint32_t tileCount, + uint32_t partitionCount, + bool emitSelfCleanup, + bool preferStaticPartitioning, + bool staticPartitioning, + bool useSecondaryBatchBuffer) { + WalkerPartition::WalkerPartitionArgs args = {}; + + args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa; + args.partitionCount = partitionCount; + args.tileCount = tileCount; + args.staticPartitioning = staticPartitioning; + args.preferredStaticPartitioning = preferStaticPartitioning; + + args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup(); + args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired(); + + args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired(ImplicitScalingDispatch::getPipeControlStallRequired()); + + args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired(); + args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired(args.emitPipeControlStall); + args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired(); 
+ + args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, emitSelfCleanup); + args.emitBatchBufferEnd = false; + args.secondaryBatchBuffer = useSecondaryBatchBuffer; + + return args; +} + +template +size_t ImplicitScalingDispatch::getSize(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3 &groupStart, @@ -29,20 +61,13 @@ size_t ImplicitScalingDispatch::getSize(bool emitSelfCleanup, &partitionType, &staticPartitioning); UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount)); - WalkerPartition::WalkerPartitionArgs args = {}; - - args.partitionCount = partitionCount; - args.tileCount = tileCount; - args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired(); - args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup(); - args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(emitSelfCleanup); - args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired(); - args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired(); - args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired(); - args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired(); - args.emitBatchBufferEnd = false; - args.staticPartitioning = staticPartitioning; - args.preferredStaticPartitioning = preferStaticPartitioning; + WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs(0u, + tileCount, + partitionCount, + apiSelfCleanup, + preferStaticPartitioning, + staticPartitioning, + false); return static_cast(WalkerPartition::estimateSpaceRequiredInCommandBuffer(args)); } @@ -53,7 +78,7 @@ void ImplicitScalingDispatch::dispatchCommands(LinearStream &commandS const DeviceBitfield &devices, uint32_t &partitionCount, bool useSecondaryBatchBuffer, - bool emitSelfCleanup, + bool apiSelfCleanup, bool usesImages, uint64_t 
workPartitionAllocationGpuVa) { uint32_t totalProgrammedSize = 0u; @@ -63,21 +88,13 @@ void ImplicitScalingDispatch::dispatchCommands(LinearStream &commandS bool staticPartitioning = false; partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning); - WalkerPartition::WalkerPartitionArgs args = {}; - args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa; - args.partitionCount = partitionCount; - args.tileCount = tileCount; - args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired(); - args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup(); - args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(emitSelfCleanup); - args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired(); - args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired(); - args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired(); - args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired(); - args.emitBatchBufferEnd = false; - args.secondaryBatchBuffer = useSecondaryBatchBuffer; - args.staticPartitioning = staticPartitioning; - args.preferredStaticPartitioning = preferStaticPartitioning; + WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs(workPartitionAllocationGpuVa, + tileCount, + partitionCount, + apiSelfCleanup, + preferStaticPartitioning, + staticPartitioning, + useSecondaryBatchBuffer); if (staticPartitioning) { UNRECOVERABLE_IF(tileCount != partitionCount); @@ -104,4 +121,9 @@ void ImplicitScalingDispatch::dispatchCommands(LinearStream &commandS commandStream.getSpace(totalProgrammedSize); } +template +bool &ImplicitScalingDispatch::getPipeControlStallRequired() { + return ImplicitScalingDispatch::pipeControlStallRequired; +} + } // namespace NEO diff --git 
a/shared/source/command_container/walker_partition_interface.h b/shared/source/command_container/walker_partition_interface.h index dc08744cdb..e19b7d7b64 100644 --- a/shared/source/command_container/walker_partition_interface.h +++ b/shared/source/command_container/walker_partition_interface.h @@ -46,12 +46,12 @@ struct BatchBufferControlData { uint32_t inTileCount = 0u; uint32_t finalSyncTileCount = 0u; }; -static constexpr inline size_t dynamicPartitioningFieldsForCleanupCount = sizeof(BatchBufferControlData) / sizeof(uint32_t) - 1; +constexpr size_t dynamicPartitioningFieldsForCleanupCount = sizeof(BatchBufferControlData) / sizeof(uint32_t) - 1; struct StaticPartitioningControlSection { uint32_t synchronizeBeforeWalkerCounter = 0; uint32_t synchronizeAfterWalkerCounter = 0; uint32_t finalSyncTileCounter = 0; }; -static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1; +constexpr size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1; } // namespace WalkerPartition diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h index 340521c776..3e08e4afdc 100644 --- a/shared/source/command_container/walker_partition_xehp_and_later.h +++ b/shared/source/command_container/walker_partition_xehp_and_later.h @@ -595,6 +595,11 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer, } } +template +bool isStartAndControlSectionRequired(WalkerPartitionArgs &args) { + return args.synchronizeBeforeExecution || args.crossTileAtomicSynchronization || args.emitSelfCleanup; +} + template uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) { const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution @@ -615,6 +620,9 @@ uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args 
const auto pipeControlSize = args.emitPipeControlStall ? sizeof(PIPE_CONTROL) : 0u; + const auto bbStartSize = isStartAndControlSectionRequired(args) + ? sizeof(BATCH_BUFFER_START) + : 0u; return beforeExecutionSyncAtomicSize + wparidRegisterSize + pipeControlSize + @@ -622,7 +630,7 @@ uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args selfCleanupSectionSize + afterExecutionSyncAtomicSize + afterExecutionSyncPostSyncSize + - sizeof(BATCH_BUFFER_START); + bbStartSize; } template @@ -670,16 +678,18 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, programTilesSynchronizationWithAtomics(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount); } - // Jump over the control section - programMiBatchBufferStart(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer); + // Jump over the control section only when needed + if (isStartAndControlSectionRequired(args)) { + programMiBatchBufferStart(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer); - // Control section - DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset); - StaticPartitioningControlSection *controlSection = putCommand(currentBatchBufferPointer, totalBytesProgrammed); - controlSection->synchronizeBeforeWalkerCounter = 0u; - controlSection->synchronizeAfterWalkerCounter = 0u; - controlSection->finalSyncTileCounter = 0u; - DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset); + // Control section + DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset); + StaticPartitioningControlSection *controlSection = putCommand(currentBatchBufferPointer, totalBytesProgrammed); + controlSection->synchronizeBeforeWalkerCounter = 0u; + controlSection->synchronizeAfterWalkerCounter = 0u; + controlSection->finalSyncTileCounter = 0u; + DEBUG_BREAK_IF(totalBytesProgrammed 
!= afterControlSectionOffset); + } // Cleanup section if (args.emitSelfCleanup) { @@ -696,11 +706,10 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, template uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) { - uint64_t size = {}; if (args.staticPartitioning) { size += computeStaticPartitioningControlSectionOffset(args); - size += sizeof(StaticPartitioningControlSection); + size += isStartAndControlSectionRequired(args) ? sizeof(StaticPartitioningControlSection) : 0u; size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize(staticPartitioningFieldsForCleanupCount, args.useAtomicsForSelfCleanup) : 0u; } else { size += computeControlSectionOffset(args); diff --git a/shared/source/xe_hp_core/implicit_scaling_xe_hp_core.cpp b/shared/source/xe_hp_core/implicit_scaling_xe_hp_core.cpp index 0916836854..eca12e87dc 100644 --- a/shared/source/xe_hp_core/implicit_scaling_xe_hp_core.cpp +++ b/shared/source/xe_hp_core/implicit_scaling_xe_hp_core.cpp @@ -12,5 +12,8 @@ namespace NEO { using Family = XeHpFamily; +template <> +bool ImplicitScalingDispatch::pipeControlStallRequired = true; + template struct ImplicitScalingDispatch; } // namespace NEO diff --git a/shared/test/common/fixtures/CMakeLists.txt b/shared/test/common/fixtures/CMakeLists.txt index 75f9c836ed..2b4a290ca1 100644 --- a/shared/test/common/fixtures/CMakeLists.txt +++ b/shared/test/common/fixtures/CMakeLists.txt @@ -10,6 +10,7 @@ target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/front_window_fixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/front_window_fixture.h + ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling_fixture.cpp ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/linear_stream_fixture.h ${CMAKE_CURRENT_SOURCE_DIR}/preemption_fixture.cpp diff --git a/shared/test/common/fixtures/implicit_scaling_fixture.cpp 
b/shared/test/common/fixtures/implicit_scaling_fixture.cpp new file mode 100644 index 0000000000..d54126cf00 --- /dev/null +++ b/shared/test/common/fixtures/implicit_scaling_fixture.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/test/common/fixtures/implicit_scaling_fixture.h" + +#include "shared/source/helpers/aligned_memory.h" +#include "shared/source/os_interface/os_interface.h" + +void ImplicitScalingFixture::SetUp() { + CommandEncodeStatesFixture::SetUp(); + apiSupportBackup = std::make_unique>(&ImplicitScaling::apiSupport, true); + osLocalMemoryBackup = std::make_unique>(&OSInterface::osEnableLocalMemory, true); + + singleTile = DeviceBitfield(static_cast(maxNBitValue(1))); + twoTile = DeviceBitfield(static_cast(maxNBitValue(2))); + + alignedMemory = alignedMalloc(bufferSize, 4096); + + cmdBufferAlloc.setCpuPtrAndGpuAddress(alignedMemory, gpuVa); + + commandStream.replaceBuffer(alignedMemory, bufferSize); + commandStream.replaceGraphicsAllocation(&cmdBufferAlloc); +} + +void ImplicitScalingFixture::TearDown() { + alignedFree(alignedMemory); + CommandEncodeStatesFixture::TearDown(); +} diff --git a/shared/test/common/fixtures/implicit_scaling_fixture.h b/shared/test/common/fixtures/implicit_scaling_fixture.h index 1aea384198..d584273760 100644 --- a/shared/test/common/fixtures/implicit_scaling_fixture.h +++ b/shared/test/common/fixtures/implicit_scaling_fixture.h @@ -9,10 +9,6 @@ #include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_stream/linear_stream.h" -#include "shared/source/helpers/aligned_memory.h" -#include "shared/source/os_interface/os_interface.h" -#include "shared/test/common/cmd_parse/gen_cmd_parse.h" -#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/fixtures/command_container_fixture.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include 
"shared/test/common/helpers/variable_backup.h" @@ -27,26 +23,8 @@ using namespace NEO; struct ImplicitScalingFixture : public CommandEncodeStatesFixture { - void SetUp() { - CommandEncodeStatesFixture::SetUp(); - apiSupportBackup = std::make_unique>(&ImplicitScaling::apiSupport, true); - osLocalMemoryBackup = std::make_unique>(&OSInterface::osEnableLocalMemory, true); - - singleTile = DeviceBitfield(static_cast(maxNBitValue(1))); - twoTile = DeviceBitfield(static_cast(maxNBitValue(2))); - - alignedMemory = alignedMalloc(bufferSize, 4096); - - cmdBufferAlloc.setCpuPtrAndGpuAddress(alignedMemory, gpuVa); - - commandStream.replaceBuffer(alignedMemory, bufferSize); - commandStream.replaceGraphicsAllocation(&cmdBufferAlloc); - } - - void TearDown() { - alignedFree(alignedMemory); - CommandEncodeStatesFixture::TearDown(); - } + void SetUp(); + void TearDown(); static constexpr uint64_t gpuVa = (1ull << 48); static constexpr size_t bufferSize = 1024u; diff --git a/shared/test/unit_test/encoders/CMakeLists.txt b/shared/test/unit_test/encoders/CMakeLists.txt index 96c8c58b88..f4d4669ba8 100644 --- a/shared/test/unit_test/encoders/CMakeLists.txt +++ b/shared/test/unit_test/encoders/CMakeLists.txt @@ -32,4 +32,10 @@ if(TESTS_XEHP_AND_LATER) ) endif() +if(TESTS_XE_HP_CORE) + target_sources(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/test_implicit_scaling_xe_hp.cpp + ) +endif() + add_subdirectories() diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp index 8cb48cb5ac..0cddde4c61 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp @@ -979,11 +979,16 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize()); } 
-HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImplicitScalingWhenEncodingDispatchKernelThenExpectSelfCleanupSection) { +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImplicitScalingRequiresPipeControlStallWhenEncodingDispatchKernelThenExpectCrossTileSyncAndSelfCleanupSection) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + VariableBackup backup(&ImplicitScaling::apiSupport, true); + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), true); uint32_t dims[] = {16, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); @@ -1069,6 +1074,88 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp auto inTileCountFieldImm = static_cast(*storeCmd); EXPECT_EQ(expectedCleanupGpuVa, inTileCountFieldImm->getAddress()); EXPECT_EQ(expectedData, inTileCountFieldImm->getDataDword0()); + + GenCmdList pipeControlList = hwParser.getCommandsList(); + EXPECT_EQ(1u, pipeControlList.size()); + + GenCmdList miAtomicList = hwParser.getCommandsList(); + EXPECT_EQ(4u, miAtomicList.size()); + + GenCmdList miSemaphoreWaitList = hwParser.getCommandsList(); + EXPECT_EQ(3u, miSemaphoreWaitList.size()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, + givenImplicitScalingRequiresNoPipeControlStallWhenEncodingDispatchKernelThenExpectCrossTileSyncAndSelfCleanupSection) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using PIPE_CONTROL = typename 
FamilyType::PIPE_CONTROL; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + VariableBackup backup(&ImplicitScaling::apiSupport, true); + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), false); + + uint32_t dims[] = {16, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + DebugManager.flags.EnableWalkerPartition.set(0); + bool isInternal = false; + size_t baseEstimateSize = EncodeDispatchKernel::estimateEncodeDispatchKernelCmdsSize( + pDevice, Vec3(0, 0, 0), Vec3(16, 1, 1), isInternal, false, false, dispatchInterface.get()); + + DebugManager.flags.EnableWalkerPartition.set(1); + + uint32_t partitionCount = 0; + bool requiresUncachedMocs = false; + + size_t partitionEstimateSize = EncodeDispatchKernel::estimateEncodeDispatchKernelCmdsSize( + pDevice, Vec3(0, 0, 0), Vec3(16, 1, 1), isInternal, false, false, dispatchInterface.get()); + EncodeDispatchKernel::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, false, false, pDevice, + NEO::PreemptionMode::Disabled, requiresUncachedMocs, false, partitionCount, isInternal, false); + + EXPECT_EQ(2u, partitionCount); + size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed(); + + size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch::getSize(true, false, pDevice->getDeviceBitfield(), Vec3(0, 0, 0), Vec3(16, 1, 1)); + EXPECT_EQ(partitionEstimateSize, baseEstimateSize + expectedPartitionedWalkerSize); + EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize); + + GenCmdList partitionedWalkerList; + CmdParse::parseCommandBuffer( + partitionedWalkerList, + cmdContainer->getCommandStream()->getCpuBase(), + partitionedWalkerSize); + auto startCmdList = findAll(partitionedWalkerList.begin(), partitionedWalkerList.end()); + EXPECT_EQ(3u, startCmdList.size()); + bool secondary = true; + for (auto &ptr : startCmdList) { + 
BATCH_BUFFER_START *startCmd = reinterpret_cast(*ptr); + secondary &= static_cast(startCmd->getSecondLevelBatchBuffer()); + } + EXPECT_TRUE(secondary); + + auto itor = find(partitionedWalkerList.begin(), partitionedWalkerList.end()); + ASSERT_NE(itor, partitionedWalkerList.end()); + auto partitionWalkerCmd = genCmdCast(*itor); + EXPECT_EQ(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X, partitionWalkerCmd->getPartitionType()); + uint32_t expectedPartitionSize = (dims[0] + partitionCount - 1u) / partitionCount; + EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize()); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdContainer->getCommandStream()); + GenCmdList storeDataImmList = hwParser.getCommandsList(); + EXPECT_EQ(4u, storeDataImmList.size()); + + GenCmdList pipeControlList = hwParser.getCommandsList(); + EXPECT_EQ(0u, pipeControlList.size()); + + GenCmdList miAtomicList = hwParser.getCommandsList(); + EXPECT_EQ(4u, miAtomicList.size()); + + GenCmdList miSemaphoreWaitList = hwParser.getCommandsList(); + EXPECT_EQ(3u, miSemaphoreWaitList.size()); } HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImplicitScalingWhenEncodingDispatchKernelOnInternalEngineThenExpectNoWalkerPartitioning) { diff --git a/shared/test/unit_test/encoders/test_implicit_scaling.cpp b/shared/test/unit_test/encoders/test_implicit_scaling.cpp index 6124b70ff5..8255321ee0 100644 --- a/shared/test/unit_test/encoders/test_implicit_scaling.cpp +++ b/shared/test/unit_test/encoders/test_implicit_scaling.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/walker_partition_interface.h" #include "shared/test/common/fixtures/implicit_scaling_fixture.h" TEST_F(ImplicitScalingTests, givenMultiTileDeviceWhenApiAndOsSupportThenFeatureEnabled) { @@ -61,21 +62,70 @@ TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForSelfClean } TEST_F(ImplicitScalingTests, 
givenDefaultSettingsIsFalseWhenCheckingProgramSelfCleanupThenExpectFalse) { - EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(false)); + WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = true; + args.synchronizeBeforeExecution = true; + args.staticPartitioning = false; + + EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(args, false)); } -TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramSelfCleanupThenExpectTrue) { - EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(true)); +TEST_F(ImplicitScalingTests, + givenDefaultSettingsAndCrossTileSyncBeforeAndStaticPartititionIsTrueAndCrossTileSyncAfterFalseWhenCheckingProgramSelfCleanupThenExpectTrue) { + WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = false; + args.synchronizeBeforeExecution = true; + args.staticPartitioning = true; + + EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(args, true)); +} + +TEST_F(ImplicitScalingTests, + givenDefaultSettingsAndCrossTileSyncAfterAndStaticPartitionIsTrueAndCrossTileSyncBeforeExecFalseWhenCheckingProgramSelfCleanupThenExpectTrue) { + WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = true; + args.synchronizeBeforeExecution = false; + args.staticPartitioning = true; + + EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(args, true)); +} + +TEST_F(ImplicitScalingTests, givenDefaultSettingsAndStaticPartititionIsTrueAndAllCrossTileSyncTrueWhenCheckingProgramSelfCleanupThenExpectTrue) { + WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = true; + args.synchronizeBeforeExecution = true; + args.staticPartitioning = true; + + EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(args, true)); +} + +TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueAndStaticPartititionAndAllCrossTileSyncFalseWhenCheckingProgramSelfCleanupThenExpectTrue) { + 
WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = false; + args.synchronizeBeforeExecution = false; + args.staticPartitioning = true; + + EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(args, true)); } TEST_F(ImplicitScalingTests, givenForceNotProgramSelfCleanupWhenDefaultSelfCleanupIsTrueThenExpectFalse) { + WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = true; + args.synchronizeBeforeExecution = true; + args.staticPartitioning = false; + DebugManager.flags.ProgramWalkerPartitionSelfCleanup.set(0); - EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(true)); + EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(args, true)); } TEST_F(ImplicitScalingTests, givenForceProgramSelfCleanupWhenDefaultSelfCleanupIsFalseThenExpectTrue) { + WalkerPartition::WalkerPartitionArgs args = {}; + args.crossTileAtomicSynchronization = false; + args.synchronizeBeforeExecution = false; + args.staticPartitioning = true; + DebugManager.flags.ProgramWalkerPartitionSelfCleanup.set(1); - EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(false)); + EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(args, false)); } TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToProgramWparidRegisterThenExpectTrue) { @@ -93,17 +143,19 @@ TEST_F(ImplicitScalingTests, givenForceProgramWparidRegisterWhenCheckingRegister } TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToUsePipeControlThenExpectTrue) { - EXPECT_TRUE(ImplicitScalingHelper::isPipeControlStallRequired()); + EXPECT_TRUE(ImplicitScalingHelper::isPipeControlStallRequired(true)); + + EXPECT_FALSE(ImplicitScalingHelper::isPipeControlStallRequired(false)); } TEST_F(ImplicitScalingTests, givenForceNotUsePipeControlWhenCheckingPipeControlUseThenExpectFalse) { DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0); - EXPECT_FALSE(ImplicitScalingHelper::isPipeControlStallRequired()); + 
EXPECT_FALSE(ImplicitScalingHelper::isPipeControlStallRequired(true)); } TEST_F(ImplicitScalingTests, givenForceUsePipeControlWhenCheckingPipeControlUseThenExpectTrue) { DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(1); - EXPECT_TRUE(ImplicitScalingHelper::isPipeControlStallRequired()); + EXPECT_TRUE(ImplicitScalingHelper::isPipeControlStallRequired(false)); } TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingSemaphoreUseThenExpectFalse) { @@ -121,15 +173,17 @@ TEST_F(ImplicitScalingTests, givenForceSemaphoreUseWhenCheckingSemaphoreUseThenE } TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingCrossTileAtomicSyncThenExpectDefaultDefined) { - EXPECT_EQ(ImplicitScaling::crossTileAtomicSynchronization, ImplicitScalingHelper::isCrossTileAtomicRequired()); + EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired(false)); + + EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired(true)); } TEST_F(ImplicitScalingTests, givenForceDisableWhenCheckingCrossTileAtomicSyncThenExpectFalse) { DebugManager.flags.UseCrossAtomicSynchronization.set(0); - EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired()); + EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired(true)); } TEST_F(ImplicitScalingTests, givenForceEnableWhenCheckingCrossTileAtomicSyncThenExpectTrue) { DebugManager.flags.UseCrossAtomicSynchronization.set(1); - EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired()); + EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired(false)); } diff --git a/shared/test/unit_test/encoders/test_implicit_scaling_xe_hp.cpp b/shared/test/unit_test/encoders/test_implicit_scaling_xe_hp.cpp new file mode 100644 index 0000000000..d353d8d66c --- /dev/null +++ b/shared/test/unit_test/encoders/test_implicit_scaling_xe_hp.cpp @@ -0,0 +1,12 @@ +/* + * Copyright (C) 2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/test/common/fixtures/implicit_scaling_fixture.h" + 
+HWTEST2_F(ImplicitScalingTests, GivenXeHpCoreWhenCheckingPipeControlStallRequiredThenExpectTrue, IsXeHpCore) { + EXPECT_TRUE(ImplicitScalingDispatch::getPipeControlStallRequired()); +} diff --git a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp index 98de45c13a..0cd8c29f33 100644 --- a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp @@ -5,6 +5,9 @@ * */ +#include "shared/source/command_container/walker_partition_interface.h" +#include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/fixtures/implicit_scaling_fixture.h" HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCmdBufferThenConsumedSizeMatchEstimatedAndCmdBufferHasCorrectCmds) { @@ -140,6 +143,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + DebugManager.flags.UseCrossAtomicSynchronization.set(1); + uint64_t workPartitionAllocationAddress = 0x987654; uint64_t postSyncAddress = (1ull << 48) | (1ull << 24); @@ -191,6 +196,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + DebugManager.flags.UseCrossAtomicSynchronization.set(1); + uint64_t workPartitionAllocationAddress = 0x987654; uint64_t postSyncAddress = (1ull << 48) | (1ull << 24); @@ -340,3 +347,392 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe auto itorPipeControl = find(pipeControlList.begin(), pipeControlList.end()); EXPECT_EQ(itorPipeControl, 
pipeControlList.end()); } + +HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, + givenPipeControlIsRequiredWhenApiRequiresCleanupSectionThenDoAddPipeControlCrossTileSyncAndCleanupSection) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), true); + + uint64_t workPartitionAllocationAddress = 0x1000; + + WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(32); + + size_t expectedSize = sizeof(MI_LOAD_REGISTER_MEM) + + sizeof(WALKER_TYPE) + + sizeof(MI_STORE_DATA_IMM) + + sizeof(PIPE_CONTROL) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_BATCH_BUFFER_START) + + sizeof(WalkerPartition::StaticPartitioningControlSection) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_STORE_DATA_IMM) * 2 + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT); + + size_t estimatedSize = 0; + size_t totalBytesProgrammed = 0; + + estimatedSize = ImplicitScalingDispatch::getSize(true, true, twoTile, Vec3(0, 0, 0), Vec3(32, 1, 1)); + EXPECT_EQ(expectedSize, estimatedSize); + + uint32_t partitionCount = 0; + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, workPartitionAllocationAddress); + totalBytesProgrammed = commandStream.getUsed(); + EXPECT_EQ(expectedSize, totalBytesProgrammed); + EXPECT_EQ(twoTile.count(), partitionCount); + + HardwareParse hwParser; + hwParser.parsePipeControl = true; + hwParser.parseCommands(commandStream, 0); + hwParser.findHardwareCommands(); + + 
auto loadRegisterMemList = hwParser.getCommandsList(); + EXPECT_EQ(1u, loadRegisterMemList.size()); + + auto computeWalkerList = hwParser.getCommandsList(); + EXPECT_EQ(1u, computeWalkerList.size()); + + auto bbStartList = hwParser.getCommandsList(); + EXPECT_EQ(1u, bbStartList.size()); + + auto storeDataImmList = hwParser.getCommandsList(); + EXPECT_EQ(3u, storeDataImmList.size()); + + EXPECT_EQ(1u, hwParser.pipeControlList.size()); + + auto miAtomicList = hwParser.getCommandsList(); + EXPECT_EQ(3u, miAtomicList.size()); + + auto miSemaphoreList = hwParser.getCommandsList(); + EXPECT_EQ(3u, miSemaphoreList.size()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, + givenPipeControlIsNotRequiredWhenApiRequiresCleanupSectionThenDoNotAddPipeControlCrossTileSyncAndCleanupSection) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), false); + + uint64_t workPartitionAllocationAddress = 0x1000; + + WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(32); + + size_t expectedSize = sizeof(MI_LOAD_REGISTER_MEM) + + sizeof(WALKER_TYPE); + + size_t estimatedSize = 0; + size_t totalBytesProgrammed = 0; + + estimatedSize = ImplicitScalingDispatch::getSize(true, true, twoTile, Vec3(0, 0, 0), Vec3(32, 1, 1)); + EXPECT_EQ(expectedSize, estimatedSize); + + uint32_t partitionCount = 0; + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, workPartitionAllocationAddress); + 
totalBytesProgrammed = commandStream.getUsed(); + EXPECT_EQ(expectedSize, totalBytesProgrammed); + EXPECT_EQ(twoTile.count(), partitionCount); + + HardwareParse hwParser; + hwParser.parsePipeControl = true; + hwParser.parseCommands(commandStream, 0); + hwParser.findHardwareCommands(); + + auto loadRegisterMemList = hwParser.getCommandsList(); + EXPECT_EQ(1u, loadRegisterMemList.size()); + + auto computeWalkerList = hwParser.getCommandsList(); + EXPECT_EQ(1u, computeWalkerList.size()); + + auto bbStartList = hwParser.getCommandsList(); + EXPECT_EQ(0u, bbStartList.size()); + + auto storeDataImmList = hwParser.getCommandsList(); + EXPECT_EQ(0u, storeDataImmList.size()); + + EXPECT_EQ(0u, hwParser.pipeControlList.size()); + + auto miAtomicList = hwParser.getCommandsList(); + EXPECT_EQ(0u, miAtomicList.size()); + + auto miSemaphoreList = hwParser.getCommandsList(); + EXPECT_EQ(0u, miSemaphoreList.size()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, + givenPipeControlIsNotRequiredAndForcedCrossTileSyncWhenApiRequiresCleanupSectionThenDoNotAddPipeControlAndAddCrossTileSyncAndCleanupSection) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + DebugManager.flags.UseCrossAtomicSynchronization.set(1); + + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), false); + + uint64_t workPartitionAllocationAddress = 0x1000; + + WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(32); + + size_t expectedSize = sizeof(MI_LOAD_REGISTER_MEM) + + sizeof(WALKER_TYPE) + + sizeof(MI_STORE_DATA_IMM) + 
+ sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_BATCH_BUFFER_START) + + sizeof(WalkerPartition::StaticPartitioningControlSection) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_STORE_DATA_IMM) * 2 + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT); + + size_t estimatedSize = 0; + size_t totalBytesProgrammed = 0; + + estimatedSize = ImplicitScalingDispatch::getSize(true, true, twoTile, Vec3(0, 0, 0), Vec3(32, 1, 1)); + EXPECT_EQ(expectedSize, estimatedSize); + + uint32_t partitionCount = 0; + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, workPartitionAllocationAddress); + totalBytesProgrammed = commandStream.getUsed(); + EXPECT_EQ(expectedSize, totalBytesProgrammed); + EXPECT_EQ(twoTile.count(), partitionCount); + + HardwareParse hwParser; + hwParser.parsePipeControl = true; + hwParser.parseCommands(commandStream, 0); + hwParser.findHardwareCommands(); + + auto loadRegisterMemList = hwParser.getCommandsList(); + EXPECT_EQ(1u, loadRegisterMemList.size()); + + auto computeWalkerList = hwParser.getCommandsList(); + EXPECT_EQ(1u, computeWalkerList.size()); + + auto bbStartList = hwParser.getCommandsList(); + EXPECT_EQ(1u, bbStartList.size()); + + auto storeDataImmList = hwParser.getCommandsList(); + EXPECT_EQ(3u, storeDataImmList.size()); + + EXPECT_EQ(0u, hwParser.pipeControlList.size()); + + auto miAtomicList = hwParser.getCommandsList(); + EXPECT_EQ(3u, miAtomicList.size()); + + auto miSemaphoreList = hwParser.getCommandsList(); + EXPECT_EQ(3u, miSemaphoreList.size()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, + givenPipeControlIsNotRequiredAndForcedCrossTileSyncWhenApiRequiresNoCleanupSectionThenDoNotAddPipeControlAndCleanupSectionAndAddCrossTileSync) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using MI_ATOMIC = typename 
FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + DebugManager.flags.UseCrossAtomicSynchronization.set(1); + + VariableBackup pipeControlConfigBackup(&ImplicitScalingDispatch::getPipeControlStallRequired(), false); + + uint64_t workPartitionAllocationAddress = 0x1000; + + WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(32); + + size_t expectedSize = sizeof(MI_LOAD_REGISTER_MEM) + + sizeof(WALKER_TYPE) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_BATCH_BUFFER_START) + + sizeof(WalkerPartition::StaticPartitioningControlSection); + + size_t estimatedSize = 0; + size_t totalBytesProgrammed = 0; + + estimatedSize = ImplicitScalingDispatch::getSize(false, true, twoTile, Vec3(0, 0, 0), Vec3(32, 1, 1)); + EXPECT_EQ(expectedSize, estimatedSize); + + uint32_t partitionCount = 0; + ImplicitScalingDispatch::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress); + totalBytesProgrammed = commandStream.getUsed(); + EXPECT_EQ(expectedSize, totalBytesProgrammed); + EXPECT_EQ(twoTile.count(), partitionCount); + + HardwareParse hwParser; + hwParser.parsePipeControl = true; + hwParser.parseCommands(commandStream, 0); + hwParser.findHardwareCommands(); + + auto loadRegisterMemList = hwParser.getCommandsList(); + EXPECT_EQ(1u, loadRegisterMemList.size()); + + auto computeWalkerList = hwParser.getCommandsList(); + EXPECT_EQ(1u, computeWalkerList.size()); + + auto bbStartList = hwParser.getCommandsList(); + EXPECT_EQ(1u, bbStartList.size()); + + auto storeDataImmList = hwParser.getCommandsList(); + EXPECT_EQ(0u, storeDataImmList.size()); + + EXPECT_EQ(0u, hwParser.pipeControlList.size()); + + auto miAtomicList = hwParser.getCommandsList(); + EXPECT_EQ(1u, miAtomicList.size()); 
+
+    auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
+    EXPECT_EQ(1u, miSemaphoreList.size());
+}
+
+// Verifies that with the pipe-control stall disabled but cross-tile sync forced
+// before execution, the dispatched walker-partition stream contains the cross-tile
+// atomic/semaphore synchronization and the self-cleanup section, and no PIPE_CONTROL.
+HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
+            givenPipeControlIsNotRequiredAndForcedCrossTileSyncBeforeExecWhenApiRequiresCleanupSectionThenDoNotAddPipeControlAndAddCrossTileSyncAndCleanupSection) {
+    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
+    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
+    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
+    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
+    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+
+    // Force cross-tile synchronization before the walker executes.
+    DebugManager.flags.SynchronizeWalkerInWparidMode.set(1);
+
+    // NOTE(review): template arguments in this hunk were reconstructed (stripped
+    // during extraction) - confirm against the upstream repository.
+    VariableBackup<bool> pipeControlConfigBackup(&ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired(), false);
+
+    uint64_t workPartitionAllocationAddress = 0x1000;
+
+    WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
+    walker.setThreadGroupIdXDimension(32);
+
+    size_t expectedSize = sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
+                          sizeof(MI_LOAD_REGISTER_MEM) +
+                          sizeof(WALKER_TYPE) +
+                          sizeof(MI_STORE_DATA_IMM) +
+                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
+                          sizeof(MI_BATCH_BUFFER_START) +
+                          sizeof(WalkerPartition::StaticPartitioningControlSection) +
+                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
+                          sizeof(MI_STORE_DATA_IMM) * 2 +
+                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
+
+    size_t estimatedSize = 0;
+    size_t totalBytesProgrammed = 0;
+
+    estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
+    EXPECT_EQ(expectedSize, estimatedSize);
+
+    uint32_t partitionCount = 0;
+    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, true, false, workPartitionAllocationAddress);
+    totalBytesProgrammed = commandStream.getUsed();
+    EXPECT_EQ(expectedSize, totalBytesProgrammed);
+    EXPECT_EQ(twoTile.count(), partitionCount);
+
+    HardwareParse hwParser;
+    hwParser.parsePipeControl = true;
+    hwParser.parseCommands<FamilyType>(commandStream, 0);
+    hwParser.findHardwareCommands<FamilyType>();
+
+    auto loadRegisterMemList = hwParser.getCommandsList<MI_LOAD_REGISTER_MEM>();
+    EXPECT_EQ(1u, loadRegisterMemList.size());
+
+    auto computeWalkerList = hwParser.getCommandsList<WALKER_TYPE>();
+    EXPECT_EQ(1u, computeWalkerList.size());
+
+    auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
+    EXPECT_EQ(1u, bbStartList.size());
+
+    auto storeDataImmList = hwParser.getCommandsList<MI_STORE_DATA_IMM>();
+    EXPECT_EQ(3u, storeDataImmList.size());
+
+    EXPECT_EQ(0u, hwParser.pipeControlList.size());
+
+    auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
+    EXPECT_EQ(4u, miAtomicList.size());
+
+    auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
+    EXPECT_EQ(4u, miSemaphoreList.size());
+}
+
+// Verifies that with the pipe-control stall disabled and self-cleanup forced via
+// debug flag (while the API does not request it), the stream contains the cleanup
+// section but no PIPE_CONTROL and no extra cross-tile synchronization.
+HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
+            givenPipeControlIsNotRequiredAndForcedCleanupSectionWhenApiNotRequiresCleanupSectionThenDoNotAddPipeControlAndCrossTileSyncAndAddCleanupSection) {
+    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
+    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
+    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
+    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
+    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+
+    // Force the walker-partition self-cleanup section via debug flag.
+    DebugManager.flags.ProgramWalkerPartitionSelfCleanup.set(1);
+
+    VariableBackup<bool> pipeControlConfigBackup(&ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired(), false);
+
+    uint64_t workPartitionAllocationAddress = 0x1000;
+
+    WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
+    walker.setThreadGroupIdXDimension(32);
+
+    size_t expectedSize = sizeof(MI_LOAD_REGISTER_MEM) +
+                          sizeof(WALKER_TYPE) +
+                          sizeof(MI_STORE_DATA_IMM) +
+                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
+                          sizeof(MI_BATCH_BUFFER_START) +
+                          sizeof(WalkerPartition::StaticPartitioningControlSection) +
+                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
+                          sizeof(MI_STORE_DATA_IMM) * 2 +
+                          sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
+
+    size_t estimatedSize = 0;
+    size_t totalBytesProgrammed = 0;
+
+    estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
+    EXPECT_EQ(expectedSize, estimatedSize);
+
+    uint32_t partitionCount = 0;
+    ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
+    totalBytesProgrammed = commandStream.getUsed();
+    EXPECT_EQ(expectedSize, totalBytesProgrammed);
+    EXPECT_EQ(twoTile.count(), partitionCount);
+
+    HardwareParse hwParser;
+    hwParser.parsePipeControl = true;
+    hwParser.parseCommands<FamilyType>(commandStream, 0);
+    hwParser.findHardwareCommands<FamilyType>();
+
+    auto loadRegisterMemList = hwParser.getCommandsList<MI_LOAD_REGISTER_MEM>();
+    EXPECT_EQ(1u, loadRegisterMemList.size());
+
+    auto computeWalkerList = hwParser.getCommandsList<WALKER_TYPE>();
+    EXPECT_EQ(1u, computeWalkerList.size());
+
+    auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
+    EXPECT_EQ(1u, bbStartList.size());
+
+    auto storeDataImmList = hwParser.getCommandsList<MI_STORE_DATA_IMM>();
+    EXPECT_EQ(3u, storeDataImmList.size());
+
+    EXPECT_EQ(0u, hwParser.pipeControlList.size());
+
+    auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
+    EXPECT_EQ(3u, miAtomicList.size());
+
+    auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
+    EXPECT_EQ(3u, miSemaphoreList.size());
+}
diff --git a/shared/test/unit_test/ult_specific_config.cpp b/shared/test/unit_test/ult_specific_config.cpp
index 9a44d12a74..b9f6b5c598 100644
--- a/shared/test/unit_test/ult_specific_config.cpp
+++ b/shared/test/unit_test/ult_specific_config.cpp
@@ -14,8 +14,6 @@ namespace NEO {
 namespace ImplicitScaling {
 bool apiSupport = false;
-bool semaphoreProgrammingRequired = false;
-bool crossTileAtomicSynchronization = false;
 } // namespace ImplicitScaling
 bool
CompressionSelector::preferRenderCompressedBuffer(const AllocationProperties &properties, const HardwareInfo &hwInfo) { return false;