From 37c0501f1c0f9b73e62c2e73230742c2f3466ea8 Mon Sep 17 00:00:00 2001
From: Zbigniew Zdanowicz
Date: Fri, 2 Jul 2021 16:31:57 +0000
Subject: [PATCH] Add atomic move for native cleanup operations

Related-To: NEO-5848
Signed-off-by: Zbigniew Zdanowicz
---
 .../unit_test/command_queue/CMakeLists.txt    |    8 +
 .../walker_partition_tests_xehp_plus.cpp      | 2556 +++++++++++++++++
 .../test/unit_test/test_files/igdrcl.config   |    1 +
 .../command_container/implicit_scaling.cpp    |    8 +
 .../command_container/implicit_scaling.h      |    1 +
 .../implicit_scaling_xehp_plus.inl            |   16 +-
 .../walker_partition_xehp_plus.h              |   97 +-
 .../debug_settings/debug_variables_base.inl   |    1 +
 .../encoders/test_implicit_scaling.cpp        |   14 +
 9 files changed, 2664 insertions(+), 38 deletions(-)
 create mode 100644 opencl/test/unit_test/command_queue/walker_partition_tests_xehp_plus.cpp

diff --git a/opencl/test/unit_test/command_queue/CMakeLists.txt b/opencl/test/unit_test/command_queue/CMakeLists.txt
index 0a25a8a1c3..5d2cbce1fa 100644
--- a/opencl/test/unit_test/command_queue/CMakeLists.txt
+++ b/opencl/test/unit_test/command_queue/CMakeLists.txt
@@ -86,8 +86,16 @@ set(IGDRCL_SRCS_tests_command_queue
     ${CMAKE_CURRENT_SOURCE_DIR}/ooq_task_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/read_write_buffer_cpu_copy.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler_tests.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_plus.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/work_group_size_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/zero_size_enqueue_tests.cpp
 )
+
+if(TESTS_XEHP_PLUS)
+  list(APPEND IGDRCL_SRCS_tests_command_queue
+       ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_plus.cpp
+  )
+endif()
+
 target_sources(igdrcl_tests PRIVATE ${IGDRCL_SRCS_tests_command_queue})
 add_subdirectories()
diff --git a/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_plus.cpp b/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_plus.cpp
new file mode 100644
index 0000000000..1271a858f2
--- /dev/null
+++ b/opencl/test/unit_test/command_queue/walker_partition_tests_xehp_plus.cpp
@@ -0,0 +1,2556 @@
+/*
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/command_container/walker_partition_xehp_plus.h"
+#include "shared/source/execution_environment/execution_environment.h"
+#include "shared/source/os_interface/os_context.h"
+#include "shared/source/os_interface/os_interface.h"
+#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
+#include "shared/test/common/cmd_parse/hw_parse.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/helpers/unit_test_helper.h"
+#include "shared/test/common/helpers/variable_backup.h"
+#include "shared/test/common/mocks/mock_device.h"
+
+#include "opencl/source/platform/platform.h"
+#include "opencl/test/unit_test/mocks/mock_cl_device.h"
+#include "opencl/test/unit_test/mocks/mock_command_queue.h"
+#include "opencl/test/unit_test/mocks/mock_context.h"
+#include "opencl/test/unit_test/mocks/mock_kernel.h"
+#include "opencl/test/unit_test/mocks/mock_platform.h"
+#include "test.h"
+
+using namespace WalkerPartition;
+
+struct WalkerPartitionTests : public ::testing::Test {
+    void SetUp() override {
+        cmdBufferAddress = cmdBuffer;
+    }
+
+    void TearDown() override {
+        auto initialCommandBufferPointer = cmdBuffer;
+        if (checkForProperCmdBufferAddressOffset) {
+            EXPECT_EQ(ptrDiff(cmdBufferAddress, initialCommandBufferPointer), totalBytesProgrammed);
+        }
+    }
+
+    template <typename GfxFamily>
+    auto createWalker(uint64_t postSyncAddress) {
+        WalkerPartition::COMPUTE_WALKER<GfxFamily> walker;
+        walker = GfxFamily::cmdInitGpgpuWalker;
+        walker.setPartitionType(COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X);
+        auto &postSync = walker.getPostSync();
+        postSync.setOperation(POSTSYNC_DATA<GfxFamily>::OPERATION::OPERATION_WRITE_TIMESTAMP);
+        postSync.setDestinationAddress(postSyncAddress);
+        return walker;
+    }
+
+    char cmdBuffer[4096u];
+    uint32_t totalBytesProgrammed = 0u;
+    void *cmdBufferAddress = nullptr;
+    bool checkForProperCmdBufferAddressOffset = true;
+    bool synchronizeBeforeExecution = false;
+    bool nativeCrossTileSync = false;
+};
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerPartitionWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) {
+    auto partitionCount = 16u;
+    checkForProperCmdBufferAddressOffset = false;
+    uint64_t gpuVirtualAddress = 0x8000123000;
+    uint64_t postSyncAddress = 0x8000456000;
+    WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
+    walker = FamilyType::cmdInitGpgpuWalker;
+    walker.setPartitionType(COMPUTE_WALKER<FamilyType>::PARTITION_TYPE::PARTITION_TYPE_X);
+    auto &postSync = walker.getPostSync();
+    postSync.setOperation(POSTSYNC_DATA<FamilyType>::OPERATION::OPERATION_WRITE_TIMESTAMP);
+    postSync.setDestinationAddress(postSyncAddress);
+    uint32_t totalBytesProgrammed;
+
+    auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>) +
+                                   sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) * 2 +
+                                   sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>) +
+                                   sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>) * 2 +
+                                   sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) * 3 +
+                                   sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
+                                   sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>) +
+                                   sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
+
+    auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) +
+                                 sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);
+
+    EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset<FamilyType>(partitionCount, synchronizeBeforeExecution, false, false));
+
+    auto optionalBatchBufferEndOffset = expectedCommandUsedSize + sizeof(BatchBufferControlData);
+
+    auto totalProgrammedSize = optionalBatchBufferEndOffset + sizeof(WalkerPartition::BATCH_BUFFER_END<FamilyType>);
+
+    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(cmdBuffer,
+                                                                              gpuVirtualAddress,
+                                                                              &walker,
+                                                                              totalBytesProgrammed,
+                                                                              partitionCount,
+                                                                              4u,
+                                                                              true,
+                                                                              synchronizeBeforeExecution,
+                                                                              false,
+                                                                              false,
+                                                                              false);
+
+    EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed);
+    auto wparidMaskProgrammingLocation = cmdBufferAddress;
+
+    auto expectedMask = 0xFFF0u;
+    auto expectedRegister = 0x21FCu;
+
+    auto loadRegisterImmediate = genCmdCast<WalkerPartition::LOAD_REGISTER_IMM<FamilyType> *>(wparidMaskProgrammingLocation);
+    ASSERT_NE(nullptr, loadRegisterImmediate);
+    EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset());
+    EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword());
+    auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>);
+
+    auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, miAtomic);
+    auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize;
+    auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
+    EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress);
+    EXPECT_TRUE(miAtomic->getReturnDataControl());
+    EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
+    parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
+
+    auto loadRegisterReg = genCmdCast<WalkerPartition::LOAD_REGISTER_REG<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
+    ASSERT_NE(nullptr, loadRegisterReg);
+    EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination());
EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource()); + EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress()); + EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress()); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG); + + auto miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_TRUE(batchBufferStart->getPredicationEnable()); + //address routes to WALKER section which is before control section + auto address = batchBufferStart->getBatchBufferStartAddress(); + EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER); + EXPECT_EQ(miSetPredicate->getPredicateEnable(), MI_SET_PREDICATE::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE); + + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t); + auto miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), 4u); + + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + //final batch buffer start that routes at the end of the batch buffer + auto batchBufferStartFinal = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_NE(nullptr, batchBufferStartFinal); + EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + optionalBatchBufferEndOffset); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + + batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + EXPECT_EQ(gpuVirtualAddress, 
batchBufferStart->getBatchBufferStartAddress()); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, expectedCommandUsedSize)); + EXPECT_EQ(0u, controlSection->partitionCount); + EXPECT_EQ(0u, controlSection->tileCount); + parsedOffset += sizeof(BatchBufferControlData); + + auto batchBufferEnd = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_NE(nullptr, batchBufferEnd); + EXPECT_EQ(parsedOffset, optionalBatchBufferEndOffset); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) { + const auto tileCount = 4u; + const auto partitionCount = tileCount; + checkForProperCmdBufferAddressOffset = false; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false); + const auto postWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, + false); + EXPECT_EQ(controlSectionOffset + sizeof(StaticPartitioningControlSection), totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(postWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + 
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionAndPreWalkerSyncWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) { + const auto tileCount = 4u; + const auto partitionCount = tileCount; + checkForProperCmdBufferAddressOffset = false; + synchronizeBeforeExecution = true; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false); + const auto postWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter); + const auto preWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, + false); + EXPECT_EQ(controlSectionOffset + sizeof(StaticPartitioningControlSection), totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(preWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(preWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, 
parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(postWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionAndSynchronizationWithPostSyncsWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) { + DebugManagerStateRestore restore{}; + DebugManager.flags.ExperimentalSynchronizeWithSemaphores.set(1); + DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.set(0); + + const auto tileCount = 4u; + const auto partitionCount = tileCount; + checkForProperCmdBufferAddressOffset = false; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, 
synchronizeBeforeExecution, nativeCrossTileSync, false); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, false); + EXPECT_EQ(controlSectionOffset + sizeof(StaticPartitioningControlSection), totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + const auto expectedSemaphoreAddress = walker.getPostSync().getDestinationAddress() + 8llu; + EXPECT_EQ(expectedSemaphoreAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(1u, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + const auto expectedSemaphoreAddress = walker.getPostSync().getDestinationAddress() + 8llu + 16llu; + EXPECT_EQ(expectedSemaphoreAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(1u, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + const auto expectedSemaphoreAddress = walker.getPostSync().getDestinationAddress() + 8llu + 32llu; + EXPECT_EQ(expectedSemaphoreAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(1u, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + const auto expectedSemaphoreAddress = walker.getPostSync().getDestinationAddress() + 8llu + 48llu; + EXPECT_EQ(expectedSemaphoreAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, 
miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(1u, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) { + const auto tileCount = 4u; + const auto partitionCount = tileCount; + nativeCrossTileSync = true; + checkForProperCmdBufferAddressOffset = false; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false); + const auto preWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter); + const auto postWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter); + const auto finalSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, + false); + const auto expectedBytesProgrammed = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, + partitionCount, + synchronizeBeforeExecution, + nativeCrossTileSync, + true, + false); + EXPECT_EQ(expectedBytesProgrammed, totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto storeDataImm = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + EXPECT_EQ(finalSyncAddress, storeDataImm->getAddress()); + EXPECT_FALSE(storeDataImm->getStoreQword()); + 
EXPECT_EQ(WalkerPartition::MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(postWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto storeDataImm = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + EXPECT_EQ(preWalkerSyncAddress, 
storeDataImm->getAddress()); + EXPECT_FALSE(storeDataImm->getStoreQword()); + EXPECT_EQ(WalkerPartition::MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + } + { + auto storeDataImm = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + EXPECT_EQ(postWalkerSyncAddress, storeDataImm->getAddress()); + EXPECT_FALSE(storeDataImm->getStoreQword()); + EXPECT_EQ(WalkerPartition::MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(2 * tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncAndSyncDisabledWithFlagWhenConstructCommandBufferIsCalledThenStillProgramTheSync) { + DebugManagerStateRestore restore{}; + DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.set(0); + + const auto tileCount = 4u; + const auto partitionCount = tileCount; + nativeCrossTileSync = true; + checkForProperCmdBufferAddressOffset = false; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false); + const auto preWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter); + const auto postWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter); + const auto finalSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, + false); + const auto expectedBytesProgrammed = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, + partitionCount, + synchronizeBeforeExecution, + nativeCrossTileSync, + true, + false); + 
EXPECT_EQ(expectedBytesProgrammed, totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto storeDataImm = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + EXPECT_EQ(finalSyncAddress, storeDataImm->getAddress()); + EXPECT_FALSE(storeDataImm->getStoreQword()); + EXPECT_EQ(WalkerPartition::MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(postWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + 
EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto storeDataImm = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + EXPECT_EQ(preWalkerSyncAddress, storeDataImm->getAddress()); + EXPECT_FALSE(storeDataImm->getStoreQword()); + EXPECT_EQ(WalkerPartition::MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + } + { + auto storeDataImm = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + EXPECT_EQ(postWalkerSyncAddress, storeDataImm->getAddress()); + EXPECT_FALSE(storeDataImm->getStoreQword()); + EXPECT_EQ(WalkerPartition::MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(2 * tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncAndAtomicsForNativeWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) { + const auto tileCount = 4u; + const auto partitionCount = tileCount; + bool useAtomicsForNative = true; + nativeCrossTileSync = true; + checkForProperCmdBufferAddressOffset = false; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, useAtomicsForNative); + const auto 
preWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter); + const auto postWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter); + const auto finalSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, + useAtomicsForNative); + const auto expectedBytesProgrammed = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, + partitionCount, + synchronizeBeforeExecution, + nativeCrossTileSync, + true, + useAtomicsForNative); + EXPECT_EQ(expectedBytesProgrammed, totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(postWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = 
genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(preWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += 
sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(2 * tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncAndSyncDisabledWithFlagWhenUsingAtomicForNativeAndConstructCommandBufferIsCalledThenStillProgramTheSync) { + DebugManagerStateRestore restore{}; + DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.set(0); + + const auto tileCount = 4u; + const auto partitionCount = tileCount; + nativeCrossTileSync = true; + bool useAtomicsForNative = true; + checkForProperCmdBufferAddressOffset = false; + uint64_t cmdBufferGpuAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + uint64_t workPartitionAllocationAddress = 0x8000444000; + auto walker = createWalker(postSyncAddress); + + uint32_t totalBytesProgrammed{}; + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, useAtomicsForNative); + const auto preWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter); + const auto postWalkerSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter); + const auto finalSyncAddress = cmdBufferGpuAddress + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter); + WalkerPartition::constructStaticallyPartitionedCommandBuffer(cmdBuffer, + cmdBufferGpuAddress, + &walker, + totalBytesProgrammed, + partitionCount, + tileCount, + synchronizeBeforeExecution, + false, + nativeCrossTileSync, + workPartitionAllocationAddress, + useAtomicsForNative); + const auto expectedBytesProgrammed = WalkerPartition::estimateSpaceRequiredInCommandBuffer(false, + partitionCount, + synchronizeBeforeExecution, + nativeCrossTileSync, + true, + useAtomicsForNative); + EXPECT_EQ(expectedBytesProgrammed, totalBytesProgrammed); + + auto parsedOffset = 0u; + { + auto loadRegisterMem = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterMem); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM); + const auto expectedRegister = 0x221Cu; + EXPECT_TRUE(loadRegisterMem->getMmioRemapEnable()); + EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress()); + EXPECT_EQ(workPartitionAllocationAddress, loadRegisterMem->getMemoryAddress()); + } + { + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + 
ASSERT_NE(nullptr, pipeControl); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(postWalkerSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + const auto afterControlSectionAddress = cmdBufferGpuAddress + controlSectionOffset + sizeof(StaticPartitioningControlSection); + EXPECT_EQ(afterControlSectionAddress, batchBufferStart->getBatchBufferStartAddress()); + } + { + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + parsedOffset += sizeof(StaticPartitioningControlSection); + StaticPartitioningControlSection expectedControlSection = {}; + EXPECT_EQ(0, std::memcmp(&expectedControlSection, controlSection, sizeof(StaticPartitioningControlSection))); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(preWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + 
EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(postWalkerSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + EXPECT_EQ(finalSyncAddress, UnitTestHelper::getAtomicMemoryAddress(*miAtomic)); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_DWORD, miAtomic->getDataSize()); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getCsStall()); + } + { + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(finalSyncAddress, miSemaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation()); + EXPECT_EQ(2 * tileCount, miSemaphoreWait->getSemaphoreDataDword()); + } + EXPECT_EQ(parsedOffset, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDebugModesFOrWalkerPartitionWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) { + DebugManagerStateRestore restorer; + DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.set(0); + DebugManager.flags.ExperimentalSynchronizeWithSemaphores.set(1); + + auto partitionCount = 16u; + checkForProperCmdBufferAddressOffset = false; + uint64_t gpuVirtualAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); + auto &postSync = walker.getPostSync(); + postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP); + postSync.setDestinationAddress(postSyncAddress); + uint32_t totalBytesProgrammed; + + auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) * partitionCount; + + auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START) + + sizeof(WalkerPartition::COMPUTE_WALKER); + + EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset(partitionCount, synchronizeBeforeExecution, false, false)); + + auto optionalBatchBufferEndOffset = expectedCommandUsedSize + sizeof(BatchBufferControlData); + + auto totalProgrammedSize = optionalBatchBufferEndOffset + sizeof(WalkerPartition::BATCH_BUFFER_END); + + WalkerPartition::constructDynamicallyPartitionedCommandBuffer(cmdBuffer, + gpuVirtualAddress, &walker, 
totalBytesProgrammed, partitionCount, + 4u, true, synchronizeBeforeExecution, false, false, false); + + EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed); + auto wparidMaskProgrammingLocation = cmdBufferAddress; + + auto expectedMask = 0xFFF0u; + auto expectedRegister = 0x21FCu; + + auto loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + ASSERT_NE(nullptr, loadRegisterImmediate); + EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset()); + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM); + + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize; + auto miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress); + EXPECT_TRUE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto loadRegisterReg = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterReg); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination()); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource()); + EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress()); + EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress()); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG); + + auto miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_TRUE(batchBufferStart->getPredicationEnable()); + //address routes to WALKER section which is before control section + auto address = batchBufferStart->getBatchBufferStartAddress(); + EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER); + EXPECT_EQ(miSetPredicate->getPredicateEnable(), MI_SET_PREDICATE::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE); + + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) { + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), postSyncAddress + 8llu + partitionId * 16llu); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); + 
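+        //one wait per partition: every partition owns a 16 byte post sync entry, and the dword at
+        //offset 8 of that entry is polled until the walker's timestamp write changes its value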
EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), 1u); + + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + } + + //final batch buffer start that routes at the end of the batch buffer + auto batchBufferStartFinal = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_NE(nullptr, batchBufferStartFinal); + EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + optionalBatchBufferEndOffset); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + + batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress()); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, expectedCommandUsedSize)); + EXPECT_EQ(0u, controlSection->partitionCount); + EXPECT_EQ(0u, controlSection->tileCount); + parsedOffset += sizeof(BatchBufferControlData); + + auto batchBufferEnd = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_NE(nullptr, batchBufferEnd); + EXPECT_EQ(parsedOffset, optionalBatchBufferEndOffset); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramRegisterCommandWhenItIsCalledThenLoadRegisterImmIsSetUnderPointer) { + uint32_t registerOffset = 120u; + uint32_t registerValue = 542u; + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM); + void *loadRegisterImmediateAddress = cmdBufferAddress; + WalkerPartition::programRegisterWithValue(cmdBufferAddress, registerOffset, totalBytesProgrammed, registerValue); + auto loadRegisterImmediate = genCmdCast *>(loadRegisterImmediateAddress); + + ASSERT_NE(nullptr, loadRegisterImmediate); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + EXPECT_EQ(registerOffset, loadRegisterImmediate->getRegisterOffset()); + EXPECT_EQ(registerValue, loadRegisterImmediate->getDataDword()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerPartitionWhenConstructCommandBufferIsCalledWithoutBatchBufferEndThenBatchBufferEndIsNotProgrammed) { + auto partitionCount = 16u; + checkForProperCmdBufferAddressOffset = false; + uint64_t gpuVirtualAddress = 0x8000123000; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); + + WalkerPartition::constructDynamicallyPartitionedCommandBuffer(cmdBuffer, + gpuVirtualAddress, + &walker, + totalBytesProgrammed, + partitionCount, + 4u, false, + synchronizeBeforeExecution, + false, + false, + false); + auto totalProgrammedSize = computeControlSectionOffset(partitionCount, synchronizeBeforeExecution, false, false) + + sizeof(BatchBufferControlData); + EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationWhenItIsCalledThenProperSizeIsReturned) { + auto partitionCount = 16u; + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + 
sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::BatchBufferControlData) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + auto requiresBatchBufferEnd = false; + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, false, false)); + + requiresBatchBufferEnd = true; + EXPECT_EQ(expectedUsedSize + sizeof(WalkerPartition::BATCH_BUFFER_END), + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, false, false)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationWhenPartitionCountIs4ThenSizeIsProperlyEstimated) { + auto partitionCount = 4u; + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::BatchBufferControlData) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + auto requiresBatchBufferEnd = false; + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, false, false)); + + requiresBatchBufferEnd = true; + EXPECT_EQ(expectedUsedSize + sizeof(WalkerPartition::BATCH_BUFFER_END), + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, false, false)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationAndSynchronizeBeforeExecutionWhenItIsCalledThenProperSizeIsReturned) { + auto partitionCount = 16u; + auto requiresBatchBufferEnd = false; + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::BatchBufferControlData) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + auto expectedDelta = sizeof(WalkerPartition::MI_ATOMIC) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + synchronizeBeforeExecution = false; + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, false, false)); + + synchronizeBeforeExecution = true; + EXPECT_EQ(expectedUsedSize + expectedDelta, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, false, false)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningEstimationWhenItIsCalledThenProperSizeIsReturned) { + const auto partitionCount = 16u; + const auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_MEM) + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::MI_ATOMIC) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::BATCH_BUFFER_START) + + sizeof(WalkerPartition::StaticPartitioningControlSection); + + auto requiresBatchBufferEnd = false; + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, 
true, false)); + + requiresBatchBufferEnd = true; + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, true, false)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningEstimationAndSynchronizeBeforeExecutionWhenItIsCalledThenProperSizeIsReturned) { + const auto partitionCount = 16u; + const auto requiresBatchBufferEnd = false; + const auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_MEM) + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::MI_ATOMIC) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::BATCH_BUFFER_START) + + sizeof(WalkerPartition::StaticPartitioningControlSection); + + synchronizeBeforeExecution = false; + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, true, false)); + + synchronizeBeforeExecution = true; + const auto preExecutionSynchronizationSize = sizeof(WalkerPartition::MI_ATOMIC) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + EXPECT_EQ(expectedUsedSize + preExecutionSynchronizationSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, false, true, false)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationNativeSectionsWhenItIsCalledThenProperSizeIsReturned) { + auto partitionCount = 16u; + auto requiresBatchBufferEnd = false; + synchronizeBeforeExecution = false; + nativeCrossTileSync = true; + + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::MI_STORE_DATA_IMM) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::BatchBufferControlData) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) * 2 + + sizeof(WalkerPartition::MI_STORE_DATA_IMM) * 3; + + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false, false)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationNativeSectionsWhenAtomicsUsedForNativeThenProperSizeIsReturned) { + auto partitionCount = 16u; + auto requiresBatchBufferEnd = false; + synchronizeBeforeExecution = false; + nativeCrossTileSync = true; + + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::MI_ATOMIC) + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::BatchBufferControlData) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) * 2 + + sizeof(WalkerPartition::MI_ATOMIC) * 3; + + EXPECT_EQ(expectedUsedSize, + estimateSpaceRequiredInCommandBuffer(requiresBatchBufferEnd, partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false, true)); +} + 
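+//the two estimates above differ only in how the native cross tile cleanup resets the control data:
+//by default three MI_STORE_DATA_IMM commands clear the partitionCount/tileCount/inTileCount fields,
+//while the atomic variant replaces them with three MI_ATOMIC 4B moves; a rough, illustrative sketch
+//of the cleanup section size both tests expect (the real helpers are templated on the GfxFamily structs):
+//
+//  size_t nativeCleanupSectionSize(bool useAtomicsForNative) {
+//      const size_t resets = 3 * (useAtomicsForNative ? sizeof(MI_ATOMIC) : sizeof(MI_STORE_DATA_IMM));
+//      return resets + 2 * sizeof(MI_ATOMIC) + 2 * sizeof(MI_SEMAPHORE_WAIT); //final sync atomic + wait, twice
+//  }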
+HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramWparidPredicationMaskWhenItIsCalledWithWrongInputThenFalseIsReturnedAndNothingIsProgrammed) { + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 3)); + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 5)); + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 17)); + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 32)); + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 15)); + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 11)); + EXPECT_FALSE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 9)); + EXPECT_EQ(0u, totalBytesProgrammed); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramWparidPredicationMaskWhenItIsCalledWithPartitionCountThenProperMaskIsSet) { + auto wparidMaskProgrammingLocation = cmdBufferAddress; + EXPECT_TRUE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 16)); + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + auto expectedMask = 0xFFF0u; + auto expectedRegister = 0x21FCu; + + auto loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + ASSERT_NE(nullptr, loadRegisterImmediate); + EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset()); + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + + EXPECT_TRUE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 8)); + wparidMaskProgrammingLocation = ptrOffset(wparidMaskProgrammingLocation, sizeof(LOAD_REGISTER_IMM)); + loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + expectedMask = 0xFFF8u; + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + + EXPECT_TRUE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 4)); + wparidMaskProgrammingLocation = ptrOffset(wparidMaskProgrammingLocation, sizeof(LOAD_REGISTER_IMM)); + loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + expectedMask = 0xFFFCu; + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + + EXPECT_TRUE(programWparidMask(cmdBufferAddress, totalBytesProgrammed, 2)); + wparidMaskProgrammingLocation = ptrOffset(wparidMaskProgrammingLocation, sizeof(LOAD_REGISTER_IMM)); + loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + expectedMask = 0xFFFEu; + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramPredicationOnWhenItIsProgrammedThenCommandBufferContainsCorrectCommand) { + auto expectedUsedSize = sizeof(WalkerPartition::MI_SET_PREDICATE); + + void *miSetPredicateAddress = cmdBufferAddress; + programWparidPredication(cmdBufferAddress, totalBytesProgrammed, true); + auto miSetPredicate = genCmdCast *>(miSetPredicateAddress); + + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramPredicationOffWhenItIsProgrammedThenCommandBufferContainsCorrectCommand) { + auto expectedUsedSize = sizeof(WalkerPartition::MI_SET_PREDICATE); + + void *miSetPredicateAddress = cmdBufferAddress; + programWparidPredication(cmdBufferAddress, totalBytesProgrammed, false); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); 
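+    //turning predication off must program MI_SET_PREDICATE back to NOOP_NEVER and clear the
+    //predicate enable bit, so subsequent commands are no longer skipped based on the WPARID value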
+ auto miSetPredicate = genCmdCast *>(miSetPredicateAddress); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER); + EXPECT_EQ(miSetPredicate->getPredicateEnable(), MI_SET_PREDICATE::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramWaitForSemaphoreWhenitisProgrammedThenAllFieldsAreSetCorrectly) { + auto expectedUsedSize = sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + uint64_t gpuAddress = 0x6432100llu; + uint32_t compareData = 1u; + + void *semaphoreWaitAddress = cmdBufferAddress; + programWaitForSemaphore(cmdBufferAddress, + totalBytesProgrammed, + gpuAddress, + compareData, + MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); + auto semaphoreWait = genCmdCast *>(semaphoreWaitAddress); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + ASSERT_NE(nullptr, semaphoreWait); + EXPECT_EQ(compareData, semaphoreWait->getSemaphoreDataDword()); + EXPECT_EQ(gpuAddress, semaphoreWait->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreWait->getCompareOperation()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE, semaphoreWait->getWaitMode()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::MEMORY_TYPE::MEMORY_TYPE_PER_PROCESS_GRAPHICS_ADDRESS, semaphoreWait->getMemoryType()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_MEMORY_POLL, semaphoreWait->getRegisterPollMode()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenMiAtomicWhenItIsProgrammedThenAllFieldsAreSetCorrectly) { + auto expectedUsedSize = sizeof(WalkerPartition::MI_ATOMIC); + uint64_t gpuAddress = 0xFFFFFFDFEEDBAC10llu; + + void *miAtomicAddress = cmdBufferAddress; + programMiAtomic(cmdBufferAddress, + totalBytesProgrammed, gpuAddress, true, MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT); + + auto miAtomic = genCmdCast *>(miAtomicAddress); + ASSERT_NE(nullptr, miAtomic); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + EXPECT_EQ(0u, miAtomic->getDataSize()); + EXPECT_TRUE(miAtomic->getCsStall()); + EXPECT_EQ(MI_ATOMIC::MEMORY_TYPE::MEMORY_TYPE_PER_PROCESS_GRAPHICS_ADDRESS, miAtomic->getMemoryType()); + EXPECT_TRUE(miAtomic->getReturnDataControl()); + EXPECT_FALSE(miAtomic->getWorkloadPartitionIdOffsetEnable()); + auto memoryAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + + //bits 48-63 are zeroed + EXPECT_EQ((gpuAddress & 0xFFFFFFFFFFFF), memoryAddress); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenMiLoadRegisterRegWhenItIsProgrammedThenCommandIsProperlySet) { + auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_REG); + void *loadRegisterRegAddress = cmdBufferAddress; + WalkerPartition::programMiLoadRegisterReg(cmdBufferAddress, totalBytesProgrammed, generalPurposeRegister1, wparidCCSOffset); + auto loadRegisterReg = genCmdCast *>(loadRegisterRegAddress); + ASSERT_NE(nullptr, loadRegisterReg); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination()); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource()); + EXPECT_EQ(generalPurposeRegister1, loadRegisterReg->getSourceRegisterAddress()); + EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, 
WalkerPartitionTests, givenProgramPipeControlCommandWhenItIsProgrammedThenItIsProperlySet) { + auto expectedUsedSize = sizeof(WalkerPartition::PIPE_CONTROL); + void *pipeControlCAddress = cmdBufferAddress; + WalkerPartition::programPipeControlCommand(cmdBufferAddress, totalBytesProgrammed, true); + auto pipeControl = genCmdCast *>(pipeControlCAddress); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramPipeControlCommandWhenItIsProgrammedWithDcFlushFalseThenExpectDcFlushFlagFalse) { + auto expectedUsedSize = sizeof(WalkerPartition::PIPE_CONTROL); + void *pipeControlCAddress = cmdBufferAddress; + WalkerPartition::programPipeControlCommand(cmdBufferAddress, totalBytesProgrammed, false); + auto pipeControl = genCmdCast *>(pipeControlCAddress); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_FALSE(pipeControl->getDcFlushEnable()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramPipeControlCommandWhenItIsProgrammedWithDebugDoNotFlushThenItIsProperlySetWithoutDcFlush) { + DebugManagerStateRestore restore; + DebugManager.flags.DoNotFlushCaches.set(true); + auto expectedUsedSize = sizeof(WalkerPartition::PIPE_CONTROL); + void *pipeControlCAddress = cmdBufferAddress; + WalkerPartition::programPipeControlCommand(cmdBufferAddress, totalBytesProgrammed, true); + auto pipeControl = genCmdCast *>(pipeControlCAddress); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_FALSE(pipeControl->getDcFlushEnable()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramBatchBufferStartCommandWhenItIsCalledThenCommandIsProgrammedCorrectly) { + auto expectedUsedSize = sizeof(WalkerPartition::BATCH_BUFFER_START); + uint64_t gpuAddress = 0xFFFFFFDFEEDBAC10llu; + + void *batchBufferStartAddress = cmdBufferAddress; + WalkerPartition::programMiBatchBufferStart(cmdBufferAddress, totalBytesProgrammed, gpuAddress, true, false); + auto batchBufferStart = genCmdCast *>(batchBufferStartAddress); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + + //bits 48-63 are zeroed + EXPECT_EQ((gpuAddress & 0xFFFFFFFFFFFF), batchBufferStart->getBatchBufferStartAddress()); + + EXPECT_TRUE(batchBufferStart->getPredicationEnable()); + EXPECT_FALSE(batchBufferStart->getEnableCommandCache()); + EXPECT_EQ(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, batchBufferStart->getSecondLevelBatchBuffer()); + EXPECT_EQ(BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR::ADDRESS_SPACE_INDICATOR_PPGTT, batchBufferStart->getAddressSpaceIndicator()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhenItIsCalledThenWalkerIsProperlyProgrammed) { + auto expectedUsedSize = sizeof(WalkerPartition::COMPUTE_WALKER); + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(7u); + walker.setThreadGroupIdYDimension(10u); + walker.setThreadGroupIdZDimension(11u); + + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); + void *walkerCommandAddress = 
cmdBufferAddress; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u); + auto walkerCommand = genCmdCast *>(walkerCommandAddress); + + ASSERT_NE(nullptr, walkerCommand); + EXPECT_EQ(expectedUsedSize, totalBytesProgrammed); + EXPECT_TRUE(walkerCommand->getWorkloadPartitionEnable()); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walkerCommand->getPartitionType()); + EXPECT_EQ(4u, walkerCommand->getPartitionSize()); + + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y); + walkerCommandAddress = cmdBufferAddress; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u); + walkerCommand = genCmdCast *>(walkerCommandAddress); + + ASSERT_NE(nullptr, walkerCommand); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walkerCommand->getPartitionType()); + EXPECT_EQ(5u, walkerCommand->getPartitionSize()); + + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z); + walkerCommandAddress = cmdBufferAddress; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 2u); + walkerCommand = genCmdCast *>(walkerCommandAddress); + + ASSERT_NE(nullptr, walkerCommand); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walkerCommand->getPartitionType()); + EXPECT_EQ(6u, walkerCommand->getPartitionSize()); + + //if we program with partition Count == 1 then do not trigger partition stuff + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + walkerCommandAddress = cmdBufferAddress; + programPartitionedWalker(cmdBufferAddress, totalBytesProgrammed, &walker, 1u); + walkerCommand = genCmdCast *>(walkerCommandAddress); + + ASSERT_NE(nullptr, walkerCommand); + EXPECT_EQ(0u, walkerCommand->getPartitionSize()); + EXPECT_FALSE(walkerCommand->getWorkloadPartitionEnable()); + EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walkerCommand->getPartitionType()); +} +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWhenComputePartitionCountIsCalledThenDefaultSizeAndTypeIsReturned) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(16u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithNonUniformStartWhenComputePartitionCountIsCalledThenPartitionsAreDisabled) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdStartingX(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(1u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType()); + + walker.setThreadGroupIdStartingX(0u); + walker.setThreadGroupIdStartingY(1u); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(1u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType()); + + 
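+    //repeat with only a non-zero Z starting offset - any non-uniform thread group start keeps
+    //the walker unpartitioned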
walker.setThreadGroupIdStartingY(0u); + walker.setThreadGroupIdStartingZ(1u); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(1u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithDifferentWorkgroupCountsWhenPartitionCountIsObtainedThenHighestDimensionIsPartitioned) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(64u); + walker.setThreadGroupIdYDimension(64u); + walker.setThreadGroupIdZDimension(64u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); + + walker.setThreadGroupIdYDimension(65u); + walker.setPartitionType(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + walker.setThreadGroupIdZDimension(66u); + walker.setPartitionType(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDisalbedMinimalPartitionSizeWhenCoomputePartitionSizeThenProperValueIsReturned) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(64u); + walker.setThreadGroupIdYDimension(64u); + walker.setThreadGroupIdZDimension(64u); + + DebugManagerStateRestore restorer; + DebugManager.flags.SetMinimalPartitionSize.set(0); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(16u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); + + walker.setThreadGroupIdYDimension(65u); + walker.setPartitionType(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(16u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + walker.setThreadGroupIdZDimension(66u); + walker.setPartitionType(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(16u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, 
WalkerPartitionTests, givenWalkerWithDifferentWorkgroupCountsWhenPartitionCountIsObtainedThenPartitionCountIsClampedToHighestDimension) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(15u); + walker.setThreadGroupIdYDimension(7u); + walker.setThreadGroupIdZDimension(4u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); + walker.setThreadGroupIdXDimension(1u); + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + walker.setThreadGroupIdYDimension(1u); + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithPartitionTypeHintWhenPartitionCountIsObtainedThenSuggestedTypeIsUsedForPartition) { + DebugManagerStateRestore restore{}; + + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(8u); + walker.setThreadGroupIdYDimension(4u); + walker.setThreadGroupIdZDimension(2u); + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(-1); + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(static_cast(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X)); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(static_cast(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y)); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(static_cast(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z)); + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, 
givenInvalidPartitionTypeIsRequestedWhenPartitionCountIsObtainedThenFail) { + DebugManagerStateRestore restore{}; + + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(8u); + walker.setThreadGroupIdYDimension(4u); + walker.setThreadGroupIdZDimension(2u); + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(0); + bool staticPartitioning = false; + EXPECT_ANY_THROW(computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithSmallXDimensionSizeWhenPartitionCountIsObtainedThenPartitionCountIsAdujsted) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(32u); + walker.setThreadGroupIdYDimension(1024u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithBigXDimensionSizeWhenPartitionCountIsObtainedThenPartitionCountIsNotAdjusted) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(16384u); + walker.setThreadGroupIdYDimension(1u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(16u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenCustomMinimalPartitionSizeWhenComputePartitionCountThenProperValueIsReturned) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(16384u); + walker.setThreadGroupIdYDimension(1u); + walker.setThreadGroupIdZDimension(1u); + + DebugManagerStateRestore restorer; + DebugManager.flags.SetMinimalPartitionSize.set(4096); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenWalkerWithPartitionTypeProgrammedWhenPartitionCountIsObtainedAndItEqualsOneThenPartitionMechanismIsDisabled) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(1u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(1u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenXDimensionIsNotLargetAnd2DImagesAreUsedWhenPartitionTypeIsObtainedThenSelectXDimension) { + 
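+    //Y is the largest dimension here, but when 2D images are used the X dimension is preferred
+    //for partitioning, as the second computePartitionCountAndSetPartitionType call below verifies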
WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(8u); + walker.setThreadGroupIdYDimension(64u); + walker.setThreadGroupIdZDimension(16u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, false, &staticPartitioning); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, false, true, &staticPartitioning); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningAndNonPartitionableWalkerWhenPartitionCountIsObtainedThenAllowPartitioning) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(1u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningAndPartitionableWalkerWhenPartitionCountIsObtainedThenAllowPartitioning) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(2u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningAndBigPartitionCountProgrammedInWalkerWhenPartitionCountIsObtainedThenNumberOfPartitionsIsEqualToNumberOfTiles) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(16384u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningAndAndNonUniformStartProgrammedInWalkerWhenPartitionCountIsObtainedThenDoNotAllowStaticPartitioningAndSetPartitionCountToOne) { + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(16384u); + walker.setThreadGroupIdZDimension(1u); + walker.setThreadGroupIdStartingX(0); + walker.setThreadGroupIdStartingY(0); + walker.setThreadGroupIdStartingZ(1); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, true, false, &staticPartitioning); + EXPECT_FALSE(staticPartitioning); + EXPECT_EQ(1u, partitionCount); + 
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningAndPartitionTypeHintIsUsedWhenPartitionCountIsObtainedThenUseRequestedPartitionType) { + DebugManagerStateRestore restore{}; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(16384u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = false; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(static_cast(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z)); + staticPartitioning = false; + partitionCount = computePartitionCountAndSetPartitionType(&walker, 4u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(4u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningWhenZDimensionIsNotDivisibleByTwoButIsAboveThreasholThenItIsSelected) { + DebugManagerStateRestore restore{}; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(1u); + walker.setThreadGroupIdYDimension(16384u); + walker.setThreadGroupIdZDimension(2u); + + bool staticPartitioning = true; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); + + DebugManager.flags.WalkerPartitionPreferHighestDimension.set(0); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningWhenYDimensionIsDivisibleByTwoThenItIsSelected) { + DebugManagerStateRestore restore{}; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(16384u); + walker.setThreadGroupIdYDimension(2u); + walker.setThreadGroupIdZDimension(1u); + + bool staticPartitioning = true; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Y, walker.getPartitionType()); + + DebugManager.flags.WalkerPartitionPreferHighestDimension.set(0); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker.getPartitionType()); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningWhenZDimensionIsDivisibleByTwoThenItIsSelected) { + DebugManagerStateRestore 
restore{}; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setThreadGroupIdXDimension(512u); + walker.setThreadGroupIdYDimension(512u); + walker.setThreadGroupIdZDimension(513u); + + bool staticPartitioning = true; + auto partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); + + DebugManager.flags.WalkerPartitionPreferHighestDimension.set(0); + + partitionCount = computePartitionCountAndSetPartitionType(&walker, 2u, true, false, &staticPartitioning); + EXPECT_TRUE(staticPartitioning); + EXPECT_EQ(2u, partitionCount); + EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType()); +} + +struct EnqueueWithWalkerPartitionTests : public ::testing::Test { + void SetUp() override { + if (!OSInterface::osEnableLocalMemory) { + GTEST_SKIP(); + } + DebugManager.flags.EnableWalkerPartition.set(1u); + executionEnvironment = platform()->peekExecutionEnvironment(); + DebugManager.flags.CreateMultipleSubDevices.set(numberOfTiles); + executionEnvironment->prepareRootDeviceEnvironments(1u); + executionEnvironment->calculateMaxOsContextCount(); + + rootDevice = std::make_unique(MockDevice::create(executionEnvironment, 0)); + + context = std::make_unique(rootDevice.get()); + + engineControlForFusedQueue = rootDevice->getDefaultEngine(); + } + + DebugManagerStateRestore restore; + VariableBackup mockDeviceFlagBackup{&MockDevice::createSingleDevice, false}; + const uint32_t numberOfTiles = 3; + EngineControl engineControlForFusedQueue = {}; + ExecutionEnvironment *executionEnvironment = nullptr; + std::unique_ptr rootDevice; + std::unique_ptr context; +}; + +HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueWithWalkerPartitionTests, givenCsrWithSpecificNumberOfTilesWhenDispatchingThenConstructCmdBufferForAllSupportedTiles) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + MockCommandQueueHw commandQueue(context.get(), rootDevice.get(), nullptr); + commandQueue.gpgpuEngine = &engineControlForFusedQueue; + MockKernelWithInternals kernel(*rootDevice, context.get()); + + size_t offset[3] = {0, 0, 0}; + size_t gws[3] = {32, 32, 32}; + commandQueue.enqueueKernel(kernel, 3, offset, gws, nullptr, 0, nullptr, nullptr); + auto &cmdStream = commandQueue.getCS(0); + + HardwareParse hwParser; + hwParser.parseCommands(cmdStream, 0); + + bool lastSemaphoreFound = false; + for (auto it = hwParser.cmdList.rbegin(); it != hwParser.cmdList.rend(); it++) { + auto semaphoreCmd = genCmdCast(*it); + + if (semaphoreCmd) { + if (UnitTestHelper::isAdditionalMiSemaphoreWait(*semaphoreCmd)) { + continue; + } + EXPECT_EQ(numberOfTiles, semaphoreCmd->getSemaphoreDataDword()); + lastSemaphoreFound = true; + break; + } + } + EXPECT_TRUE(lastSemaphoreFound); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenNativeCrossTileSyncWhenDebugForceDisableCrossTileSyncThenNativeOverridesDebugAndAddsOwnCleanupSection) { + DebugManagerStateRestore restorer; + DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.set(0); + + auto partitionCount = 16u; + checkForProperCmdBufferAddressOffset = false; + nativeCrossTileSync = true; + uint64_t gpuVirtualAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + 
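+    //ExperimentalForceCrossAtomicSynchronization is disabled above, yet the native cross tile path
+    //is still expected to emit its own synchronization and cleanup commands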
walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); + auto &postSync = walker.getPostSync(); + postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP); + postSync.setDestinationAddress(postSyncAddress); + uint32_t totalBytesProgrammed = 0u; + + auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::MI_STORE_DATA_IMM); + + auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START) + + sizeof(WalkerPartition::COMPUTE_WALKER); + + EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, false)); + + auto cleanupSectionOffset = expectedCommandUsedSize + sizeof(BatchBufferControlData); + + auto totalProgrammedSize = cleanupSectionOffset + 3 * sizeof(WalkerPartition::MI_STORE_DATA_IMM) + + 2 * sizeof(WalkerPartition::MI_ATOMIC) + + 2 * sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + uint32_t tileCount = 4u; + WalkerPartition::constructDynamicallyPartitionedCommandBuffer(cmdBuffer, + gpuVirtualAddress, &walker, totalBytesProgrammed, partitionCount, + tileCount, false, synchronizeBeforeExecution, false, nativeCrossTileSync, false); + + EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed); + auto wparidMaskProgrammingLocation = cmdBufferAddress; + + auto expectedMask = 0xFFF0u; + auto expectedRegister = 0x21FCu; + + auto loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + ASSERT_NE(nullptr, loadRegisterImmediate); + EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset()); + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM); + + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize; + auto miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress); + EXPECT_TRUE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto loadRegisterReg = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterReg); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination()); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource()); + EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress()); + EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress()); + parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG); + + auto miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_TRUE(batchBufferStart->getPredicationEnable()); + //address routes to 
WALKER section which is before control section + auto address = batchBufferStart->getBatchBufferStartAddress(); + EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER); + EXPECT_EQ(miSetPredicate->getPredicateEnable(), MI_SET_PREDICATE::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + uint64_t expectedCleanupGpuVa = gpuVirtualAddress + expectedCommandUsedSize + offsetof(BatchBufferControlData, finalSyncTileCount); + constexpr uint32_t expectedData = 0u; + auto finalSyncTileCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, finalSyncTileCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, finalSyncTileCountFieldStore->getAddress()); + EXPECT_EQ(expectedData, finalSyncTileCountFieldStore->getDataDword0()); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t); + auto miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), tileCount); + + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + //final batch buffer start that routes at the end of the batch buffer + auto batchBufferStartFinal = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStartFinal); + EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + cleanupSectionOffset); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + + batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress()); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, expectedCommandUsedSize)); + EXPECT_EQ(0u, 
controlSection->partitionCount); + EXPECT_EQ(0u, controlSection->tileCount); + EXPECT_EQ(0u, controlSection->inTileCount); + EXPECT_EQ(0u, controlSection->finalSyncTileCount); + + parsedOffset += sizeof(BatchBufferControlData); + EXPECT_EQ(parsedOffset, cleanupSectionOffset); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + miAtomicTileAddress = gpuVirtualAddress + cleanupSectionOffset - sizeof(BatchBufferControlData) + + 3 * sizeof(uint32_t); + miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), tileCount); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + expectedCleanupGpuVa = gpuVirtualAddress + cleanupSectionOffset - sizeof(BatchBufferControlData); + auto partitionCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, partitionCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, partitionCountFieldStore->getAddress()); + EXPECT_EQ(expectedData, partitionCountFieldStore->getDataDword0()); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + + expectedCleanupGpuVa += sizeof(BatchBufferControlData::partitionCount); + auto tileCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, tileCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, tileCountFieldStore->getAddress()); + EXPECT_EQ(expectedData, tileCountFieldStore->getDataDword0()); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + + expectedCleanupGpuVa += sizeof(BatchBufferControlData::tileCount); + auto inTileCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, inTileCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, inTileCountFieldStore->getAddress()); + EXPECT_EQ(expectedData, inTileCountFieldStore->getDataDword0()); + parsedOffset += sizeof(WalkerPartition::MI_STORE_DATA_IMM); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), 2 * tileCount); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, 
givenNativeCrossTileSyncAndAtomicsUsedForNativeWhenDebugForceDisableCrossTileSyncThenNativeOverridesDebugAndAddsOwnCleanupSection) { + DebugManagerStateRestore restorer; + DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.set(0); + + auto partitionCount = 16u; + checkForProperCmdBufferAddressOffset = false; + nativeCrossTileSync = true; + bool useAtomicsForNative = true; + uint64_t gpuVirtualAddress = 0x8000123000; + uint64_t postSyncAddress = 0x8000456000; + WalkerPartition::COMPUTE_WALKER walker; + walker = FamilyType::cmdInitGpgpuWalker; + walker.setPartitionType(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X); + auto &postSync = walker.getPostSync(); + postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP); + postSync.setDestinationAddress(postSyncAddress); + uint32_t totalBytesProgrammed = 0u; + + auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM) + + sizeof(WalkerPartition::MI_ATOMIC) * 2 + + sizeof(WalkerPartition::LOAD_REGISTER_REG) + + sizeof(WalkerPartition::MI_SET_PREDICATE) * 2 + + sizeof(WalkerPartition::BATCH_BUFFER_START) * 3 + + sizeof(WalkerPartition::PIPE_CONTROL) + + sizeof(WalkerPartition::COMPUTE_WALKER) + + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + + sizeof(WalkerPartition::MI_ATOMIC); + + auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START) + + sizeof(WalkerPartition::COMPUTE_WALKER); + + EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileSync, useAtomicsForNative)); + + auto cleanupSectionOffset = expectedCommandUsedSize + sizeof(BatchBufferControlData); + + auto totalProgrammedSize = cleanupSectionOffset + 3 * sizeof(WalkerPartition::MI_ATOMIC) + + 2 * sizeof(WalkerPartition::MI_ATOMIC) + + 2 * sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + uint32_t tileCount = 4u; + WalkerPartition::constructDynamicallyPartitionedCommandBuffer(cmdBuffer, + gpuVirtualAddress, &walker, totalBytesProgrammed, partitionCount, + tileCount, false, synchronizeBeforeExecution, false, nativeCrossTileSync, useAtomicsForNative); + + EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed); + auto wparidMaskProgrammingLocation = cmdBufferAddress; + + auto expectedMask = 0xFFF0u; + auto expectedRegister = 0x21FCu; + + auto loadRegisterImmediate = genCmdCast *>(wparidMaskProgrammingLocation); + ASSERT_NE(nullptr, loadRegisterImmediate); + EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset()); + EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword()); + auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM); + + auto miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize; + auto miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress); + EXPECT_TRUE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto loadRegisterReg = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, loadRegisterReg); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination()); + EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource()); + EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress()); + EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress()); + 
parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG); + + auto miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + auto batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_TRUE(batchBufferStart->getPredicationEnable()); + //address routes to WALKER section which is before control section + auto address = batchBufferStart->getBatchBufferStartAddress(); + EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + miSetPredicate = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSetPredicate); + EXPECT_EQ(miSetPredicate->getPredicateEnableWparid(), MI_SET_PREDICATE::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER); + EXPECT_EQ(miSetPredicate->getPredicateEnable(), MI_SET_PREDICATE::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE); + parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE); + + uint64_t expectedCleanupGpuVa = gpuVirtualAddress + expectedCommandUsedSize + offsetof(BatchBufferControlData, finalSyncTileCount); + auto finalSyncTileCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, finalSyncTileCountFieldStore); + miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*finalSyncTileCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, miAtomicProgrammedAddress); + EXPECT_FALSE(finalSyncTileCountFieldStore->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, finalSyncTileCountFieldStore->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto pipeControl = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), pipeControl->getDcFlushEnable()); + parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t); + auto miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + auto miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), tileCount); + + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + //final batch buffer start that routes at the end of the batch buffer + auto batchBufferStartFinal = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStartFinal); + EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), 
gpuVirtualAddress + cleanupSectionOffset); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto computeWalker = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_NE(nullptr, computeWalker); + parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER); + + batchBufferStart = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, batchBufferStart); + EXPECT_FALSE(batchBufferStart->getPredicationEnable()); + EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress()); + parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START); + + auto controlSection = reinterpret_cast(ptrOffset(cmdBuffer, expectedCommandUsedSize)); + EXPECT_EQ(0u, controlSection->partitionCount); + EXPECT_EQ(0u, controlSection->tileCount); + EXPECT_EQ(0u, controlSection->inTileCount); + EXPECT_EQ(0u, controlSection->finalSyncTileCount); + + parsedOffset += sizeof(BatchBufferControlData); + EXPECT_EQ(parsedOffset, cleanupSectionOffset); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + miAtomicTileAddress = gpuVirtualAddress + cleanupSectionOffset - sizeof(BatchBufferControlData) + + 3 * sizeof(uint32_t); + miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), tileCount); + parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT); + + expectedCleanupGpuVa = gpuVirtualAddress + cleanupSectionOffset - sizeof(BatchBufferControlData); + auto partitionCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, partitionCountFieldStore); + miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*partitionCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, miAtomicProgrammedAddress); + EXPECT_FALSE(partitionCountFieldStore->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, partitionCountFieldStore->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + expectedCleanupGpuVa += sizeof(BatchBufferControlData::partitionCount); + auto tileCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, tileCountFieldStore); + miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*tileCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, miAtomicProgrammedAddress); + EXPECT_FALSE(tileCountFieldStore->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, tileCountFieldStore->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + expectedCleanupGpuVa += sizeof(BatchBufferControlData::tileCount); + auto inTileCountFieldStore = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, inTileCountFieldStore); + miAtomicProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*inTileCountFieldStore); + EXPECT_EQ(expectedCleanupGpuVa, miAtomicProgrammedAddress); + 
EXPECT_FALSE(inTileCountFieldStore->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_MOVE, inTileCountFieldStore->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + miAtomic = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + miAtomicTileProgrammedAddress = UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(WalkerPartition::MI_ATOMIC); + + miSemaphoreWait = genCmdCast *>(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphoreWait); + EXPECT_EQ(miSemaphoreWait->getSemaphoreGraphicsAddress(), miAtomicTileAddress); + EXPECT_EQ(miSemaphoreWait->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), 2 * tileCount); +} diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index 7f2a9d472a..c0952be839 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -305,6 +305,7 @@ OverrideNotifyEnableForTagUpdatePostSync = -1 EnableCacheFlushAfterWalkerForAllQueues = -1 Force32BitDriverSupport = -1 OverrideCmdQueueSynchronousMode = -1 +ExperimentalUseAtomicsForNativeSectionCleanup = -1 HBMSizePerTileInGigabytes = 0 OverrideSystolicPipelineSelect = -1 OverrideSystolicInComputeWalker = -1 diff --git a/shared/source/command_container/implicit_scaling.cpp b/shared/source/command_container/implicit_scaling.cpp index eb963a525e..67c2d8bc0d 100644 --- a/shared/source/command_container/implicit_scaling.cpp +++ b/shared/source/command_container/implicit_scaling.cpp @@ -34,4 +34,12 @@ bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() { return synchronizeBeforeExecution; } +bool ImplicitScalingHelper::useAtomicsForNativeCleanup() { + bool useAtomics = false; + int overrideUseAtomics = DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.get(); + if (overrideUseAtomics != -1) { + useAtomics = !!(overrideUseAtomics); + } + return useAtomics; +} } // namespace NEO diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h index 2f48914cd9..eb558a2df6 100644 --- a/shared/source/command_container/implicit_scaling.h +++ b/shared/source/command_container/implicit_scaling.h @@ -20,6 +20,7 @@ extern bool apiSupport; struct ImplicitScalingHelper { static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition); static bool isSynchronizeBeforeExecutionRequired(); + static bool useAtomicsForNativeCleanup(); }; template diff --git a/shared/source/command_container/implicit_scaling_xehp_plus.inl b/shared/source/command_container/implicit_scaling_xehp_plus.inl index cb9ef87f8a..8186f08866 100644 --- a/shared/source/command_container/implicit_scaling_xehp_plus.inl +++ b/shared/source/command_container/implicit_scaling_xehp_plus.inl @@ -30,8 +30,13 @@ size_t ImplicitScalingDispatch::getSize(bool nativeCrossTileAtomicSyn UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount)); auto synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired(); - return static_cast(WalkerPartition::estimateSpaceRequiredInCommandBuffer( - false, 16u, 
synchronizeBeforeExecution, nativeCrossTileAtomicSync, staticPartitioning));
+    const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+    return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(false,
+                                                                                                16u,
+                                                                                                synchronizeBeforeExecution,
+                                                                                                nativeCrossTileAtomicSync,
+                                                                                                staticPartitioning,
+                                                                                                useAtomicsForNativeCleanup));
 }
 
 template <typename GfxFamily>
@@ -50,6 +55,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
     bool staticPartitioning = false;
     partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
     const bool synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
+    const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
     if (staticPartitioning) {
         UNRECOVERABLE_IF(tileCount != partitionCount);
         WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
@@ -61,7 +67,8 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
                                                                                 synchronizeBeforeExecution,
                                                                                 useSecondaryBatchBuffer,
                                                                                 nativeCrossTileAtomicSync,
-                                                                                workPartitionAllocationGpuVa);
+                                                                                workPartitionAllocationGpuVa,
+                                                                                useAtomicsForNativeCleanup);
     } else {
         if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
             partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
@@ -75,7 +82,8 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
                                                                                &walkerCmd, totalProgrammedSize,
                                                                                partitionCount, tileCount, false,
                                                                                synchronizeBeforeExecution, useSecondaryBatchBuffer,
-                                                                               nativeCrossTileAtomicSync);
+                                                                               nativeCrossTileAtomicSync,
+                                                                               useAtomicsForNativeCleanup);
     }
     commandStream.getSpace(totalProgrammedSize);
 }
diff --git a/shared/source/command_container/walker_partition_xehp_plus.h b/shared/source/command_container/walker_partition_xehp_plus.h
index 4c6726e57d..b2603fea56 100644
--- a/shared/source/command_container/walker_partition_xehp_plus.h
+++ b/shared/source/command_container/walker_partition_xehp_plus.h
@@ -352,11 +352,20 @@ void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProg
 template <typename GfxFamily>
 void programNativeCrossTileSyncControl(void *&inputAddress,
                                        uint32_t &totalBytesProgrammed,
-                                       uint64_t finalSyncTileCountField) {
-    programStoreMemImmediateDword<GfxFamily>(inputAddress,
-                                             totalBytesProgrammed,
-                                             finalSyncTileCountField,
-                                             0u);
+                                       uint64_t finalSyncTileCountField,
+                                       bool useAtomicsForNativeCleanup) {
+    if (useAtomicsForNativeCleanup) {
+        programMiAtomic<GfxFamily>(inputAddress,
+                                   totalBytesProgrammed,
+                                   finalSyncTileCountField,
+                                   false,
+                                   MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
+    } else {
+        programStoreMemImmediateDword<GfxFamily>(inputAddress,
+                                                 totalBytesProgrammed,
+                                                 finalSyncTileCountField,
+                                                 0u);
+    }
 }
 
 template <typename GfxFamily>
@@ -365,17 +374,26 @@ void programNativeCrossTileSyncCleanup(void *&inputAddress,
                                        uint64_t finalSyncTileCountAddress,
                                        uint64_t baseAddressForCleanup,
                                        size_t fieldsForCleanupCount,
-                                       uint32_t tileCount) {
+                                       uint32_t tileCount,
+                                       bool useAtomicsForNativeCleanup) {
     // Synchronize tiles, so the fields are not cleared while still in use
     programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
     programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
 
     for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
         const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
-        programStoreMemImmediateDword<GfxFamily>(inputAddress,
-                                                 totalBytesProgrammed,
-                                                 addressForCleanup,
-                                                 0u);
+        if (useAtomicsForNativeCleanup) {
+            programMiAtomic<GfxFamily>(inputAddress,
+                                       totalBytesProgrammed,
+                                       addressForCleanup,
+                                       false,
+                                       MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
+        } else {
+            programStoreMemImmediateDword<GfxFamily>(inputAddress,
+                                                     totalBytesProgrammed,
+                                                     addressForCleanup,
+                                                     0u);
+        }
     }
 
     //this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
@@ -410,19 +428,23 @@ uint64_t computeWalkerSectionSize() {
 }
 
 template <typename GfxFamily>
-uint64_t computeNativeCrossTileSyncControlSectionSize() {
-    return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
+uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
+    if (useAtomicsForNativeCleanup) {
+        return sizeof(MI_ATOMIC<GfxFamily>);
+    } else {
+        return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
+    }
 }
 
 template <typename GfxFamily>
-uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount) {
-    return fieldsForCleanupCount * sizeof(MI_STORE_DATA_IMM<GfxFamily>) +
+uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
+    return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
            2 * sizeof(MI_ATOMIC<GfxFamily>) +
            2 * sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
 }
 
 template <typename GfxFamily>
-uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync) {
+uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
     auto synchronizationCount = (synchronizeBeforeExecution) ? 2u : 1u;
     if (!isCrossTileAtomicRequired<GfxFamily>() && !nativeCrossTileAtomicSync) {
         synchronizationCount--;
@@ -437,14 +459,15 @@ uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBe
            sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * synchronizationCount +
            (isSemaphoreProgrammingRequired<GfxFamily>() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u) +
            computeWalkerSectionSize<GfxFamily>() +
-           (nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>() : 0u);
+           (nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u);
 }
 
 template <typename GfxFamily>
 uint64_t computeWalkerSectionStart(uint32_t partitionCount,
                                    bool synchronizeBeforeExecution,
-                                   bool nativeCrossTileAtomicSync) {
-    return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync) -
+                                   bool nativeCrossTileAtomicSync,
+                                   bool useAtomicsForNativeCleanup) {
+    return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup) -
            computeWalkerSectionSize<GfxFamily>();
 }
 
@@ -519,11 +542,12 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                                   bool emitBatchBufferEnd,
                                                   bool synchronizeBeforeExecution,
                                                   bool secondaryBatchBuffer,
-                                                  bool nativeCrossTileAtomicSync) {
+                                                  bool nativeCrossTileAtomicSync,
+                                                  bool useAtomicsForNativeCleanup) {
     totalBytesProgrammed = 0u;
     void *currentBatchBufferPointer = cpuPointer;
 
-    auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync);
+    auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
     if (synchronizeBeforeExecution) {
         auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
         programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
@@ -551,7 +575,8 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                       gpuAddressOfAllocation +
                                           computeWalkerSectionStart<GfxFamily>(partitionCount,
                                                                                synchronizeBeforeExecution,
-                                                                               nativeCrossTileAtomicSync),
+                                                                               nativeCrossTileAtomicSync,
+                                                                               useAtomicsForNativeCleanup),
                                       true,
                                       secondaryBatchBuffer);
 
@@ -560,7 +585,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
     if (nativeCrossTileAtomicSync) {
         const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
-        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField);
+        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
     }
 
     programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
@@ -608,7 +633,8 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                                    finalSyncTileCountAddress,
                                                    gpuAddressOfAllocation + controlSectionOffset,
                                                    dynamicPartitioningFieldsForCleanupCount,
-                                                   tileCount);
+                                                   tileCount,
+                                                   useAtomicsForNativeCleanup);
     }
 
     if (emitBatchBufferEnd) {
@@ -625,11 +651,11 @@ struct StaticPartitioningControlSection {
 static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;
 
 template <typename GfxFamily>
-uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync) {
+uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
     const auto beforeExecutionSyncAtomicSize = synchronizeBeforeExecution ? (sizeof(MI_SEMAPHORE_WAIT) + sizeof(MI_ATOMIC)) : 0u;
     const auto afterExecutionSyncAtomicSize = (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) ?
(sizeof(MI_SEMAPHORE_WAIT) + sizeof(MI_ATOMIC)) : 0u; const auto afterExecutionSyncPostSyncSize = isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT) * partitionCount : 0u; - const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? sizeof(MI_STORE_DATA_IMM) : 0u; + const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize(useAtomicsForNativeCleanup) : 0u; return beforeExecutionSyncAtomicSize + sizeof(LOAD_REGISTER_MEM) + sizeof(PIPE_CONTROL) + @@ -650,12 +676,13 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, bool synchronizeBeforeExecution, bool secondaryBatchBuffer, bool nativeCrossTileAtomicSync, - uint64_t workPartitionAllocationGpuVa) { + uint64_t workPartitionAllocationGpuVa, + bool useAtomicsForNativeCleanup) { totalBytesProgrammed = 0u; void *currentBatchBufferPointer = cpuPointer; // Get address of the control section - const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync); + const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup); const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection); // Synchronize tiles before walker @@ -671,7 +698,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, // Prepare for cleanup section if (nativeCrossTileAtomicSync) { const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter); - programNativeCrossTileSyncControl(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField); + programNativeCrossTileSyncControl(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup); } programPipeControlCommand(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache @@ -704,7 +731,8 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer, finalSyncTileCountAddress, gpuAddressOfAllocation + controlSectionOffset, staticPartitioningFieldsForCleanupCount, - tileCount); + tileCount, + useAtomicsForNativeCleanup); } } @@ -713,17 +741,18 @@ uint64_t estimateSpaceRequiredInCommandBuffer(bool requiresBatchBufferEnd, uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, - bool staticPartitioning) { + bool staticPartitioning, + bool useAtomicsForNativeCleanup) { uint64_t size = {}; if (staticPartitioning) { - size += computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync); + size += computeStaticPartitioningControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup); size += sizeof(StaticPartitioningControlSection); - size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize(staticPartitioningFieldsForCleanupCount) : 0u; + size += nativeCrossTileAtomicSync ? 
computeNativeCrossTileSyncCleanupSectionSize(staticPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u; } else { - size += computeControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync); + size += computeControlSectionOffset(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup); size += sizeof(BatchBufferControlData); size += requiresBatchBufferEnd ? sizeof(BATCH_BUFFER_END) : 0u; - size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize(dynamicPartitioningFieldsForCleanupCount) : 0u; + size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize(dynamicPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u; } return size; } diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 17c8286492..fad10b9fcb 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -294,6 +294,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOL /*EXPERIMENTAL TOGGLES*/ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte") +DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalUseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op") /*DRIVER TOGGLES*/ DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version") diff --git a/shared/test/unit_test/encoders/test_implicit_scaling.cpp b/shared/test/unit_test/encoders/test_implicit_scaling.cpp index 1c3ea8db77..69d5dcf016 100644 --- a/shared/test/unit_test/encoders/test_implicit_scaling.cpp +++ b/shared/test/unit_test/encoders/test_implicit_scaling.cpp @@ -45,3 +45,17 @@ TEST_F(ImplicitScalingTests, givenMultiTileApiEnabledWhenOsSupportOffAndForcedOn OSInterface::osEnableLocalMemory = false; EXPECT_FALSE(ImplicitScalingHelper::isImplicitScalingEnabled(twoTile, true)); } + +TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) { + EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup()); +} + +TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) { + DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(0); + EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup()); +} + +TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectTrue) { + DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(1); + EXPECT_TRUE(ImplicitScalingHelper::useAtomicsForNativeCleanup()); +}
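
For reference, the new ExperimentalUseAtomicsForNativeSectionCleanup toggle leaves the native cleanup section on MI_STORE_DATA_IMM for -1 (default) and 0, and switches the cleanup writes to MI_ATOMIC with the 4B MOVE opcode for any other value (typically 1), matching ImplicitScalingHelper::useAtomicsForNativeCleanup() above. The snippet below is a minimal standalone sketch of that -1/0/1 mapping; it is not part of the driver sources, and the enum and function names are illustrative only.

    #include <cstdint>
    #include <iostream>

    // Illustrative stand-ins for the two cleanup encodings selected by the flag.
    enum class NativeCleanupOp { StoreDataImm, AtomicMove };

    // Mirrors the helper's selection logic: -1 (default) and 0 select the
    // MI_STORE_DATA_IMM path, any other value selects the MI_ATOMIC (4B MOVE) path.
    NativeCleanupOp selectNativeCleanupOp(int32_t flagValue) {
        bool useAtomics = false;
        if (flagValue != -1) {
            useAtomics = (flagValue != 0);
        }
        return useAtomics ? NativeCleanupOp::AtomicMove : NativeCleanupOp::StoreDataImm;
    }

    int main() {
        for (int32_t flagValue : {-1, 0, 1}) {
            std::cout << "ExperimentalUseAtomicsForNativeSectionCleanup = " << flagValue << " -> "
                      << (selectNativeCleanupOp(flagValue) == NativeCleanupOp::AtomicMove
                              ? "MI_ATOMIC (ATOMIC_4B_MOVE)"
                              : "MI_STORE_DATA_IMM")
                      << "\n";
        }
        return 0;
    }

Keeping the store-data path as the default preserves existing behavior; the atomic-move path is only exercised when the experimental flag is explicitly enabled, which is what the new unit tests verify.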