Refactor and modularize walker partition code

Related-To: NEO-6244


Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Zbigniew Zdanowicz 2021-09-13 17:39:55 +00:00, committed by Compute-Runtime-Automation
parent e82c2e4653
commit b65d8909e4
19 changed files with 3184 additions and 2700 deletions

View File

@ -10,5 +10,7 @@
namespace NEO {
namespace ImplicitScaling {
bool apiSupport = false;
-}
+bool semaphoreProgrammingRequired = false;
+bool crossTileAtomicSynchronization = true;
+} // namespace ImplicitScaling
} // namespace NEO

View File

@ -10,5 +10,7 @@
namespace NEO {
namespace ImplicitScaling {
bool apiSupport = true;
-}
+bool semaphoreProgrammingRequired = false;
+bool crossTileAtomicSynchronization = true;
+} // namespace ImplicitScaling
} // namespace NEO

View File

@ -88,7 +88,6 @@ set(IGDRCL_SRCS_tests_command_queue
${CMAKE_CURRENT_SOURCE_DIR}/ooq_task_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/read_write_buffer_cpu_copy.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler_tests.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/work_group_size_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/zero_size_enqueue_tests.cpp
)
@ -98,7 +97,10 @@ if(TESTS_XEHP_AND_LATER)
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_media_kernel_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_resource_barier_tests_xehp_and_later.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later.cpp
+${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_fixture_xehp_and_later.cpp
+${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_fixture_xehp_and_later.h
+${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later_1.cpp
+${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later_2.cpp
)
endif()

View File

@ -1072,8 +1072,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPart
MockClDevice *device = deviceFactory.rootDevices[0];
MockContext context{device};
-auto synchronizeBeforeExecution = false;
-auto staticPartitioning = false;
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, device, nullptr);
auto &csr = cmdQ->getUltCommandStreamReceiver();
@ -1087,35 +1085,39 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPart
DispatchInfo dispatchInfo{};
dispatchInfo.setNumberOfWorkgroups({32, 1, 1});
-synchronizeBeforeExecution = false;
+WalkerPartition::WalkerPartitionArgs testArgs = {};
+testArgs.initializeWparidRegister = true;
+testArgs.crossTileAtomicSynchronization = true;
+testArgs.usePipeControlStall = true;
+testArgs.partitionCount = 2u;
+testArgs.tileCount = static_cast<uint32_t>(device->getDeviceBitfield().count());
DebugManager.flags.SynchronizeWalkerInWparidMode.set(0);
-staticPartitioning = false;
+testArgs.staticPartitioning = false;
+testArgs.synchronizeBeforeExecution = false;
csr.staticWorkPartitioningEnabled = false;
-auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
auto returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
EXPECT_EQ(returnedSize, partitionSize + baseSize);
-synchronizeBeforeExecution = false;
DebugManager.flags.SynchronizeWalkerInWparidMode.set(0);
-staticPartitioning = true;
+testArgs.staticPartitioning = true;
csr.staticWorkPartitioningEnabled = true;
-partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
EXPECT_EQ(returnedSize, partitionSize + baseSize);
-synchronizeBeforeExecution = true;
DebugManager.flags.SynchronizeWalkerInWparidMode.set(1);
-staticPartitioning = false;
+testArgs.synchronizeBeforeExecution = true;
+testArgs.staticPartitioning = false;
csr.staticWorkPartitioningEnabled = false;
-partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
EXPECT_EQ(returnedSize, partitionSize + baseSize);
-synchronizeBeforeExecution = true;
DebugManager.flags.SynchronizeWalkerInWparidMode.set(1);
-staticPartitioning = true;
+testArgs.synchronizeBeforeExecution = true;
+testArgs.staticPartitioning = true;
csr.staticWorkPartitioningEnabled = true;
-partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
EXPECT_EQ(returnedSize, partitionSize + baseSize);
}
@ -1167,7 +1169,14 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenQueueIsMul
HardwareCommandsHelper<FamilyType>::getSizeRequiredCS() +
EncodeMemoryPrefetch<FamilyType>::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize);
-auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, false, false, false, false);
+WalkerPartition::WalkerPartitionArgs testArgs = {};
+testArgs.initializeWparidRegister = true;
+testArgs.usePipeControlStall = true;
+testArgs.crossTileAtomicSynchronization = true;
+testArgs.partitionCount = 16u;
+testArgs.tileCount = static_cast<uint32_t>(device->getDeviceBitfield().count());
+auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
DispatchInfo dispatchInfo{};
dispatchInfo.setNumberOfWorkgroups({32, 1, 1});

View File

@ -0,0 +1,25 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "opencl/test/unit_test/command_queue/walker_partition_fixture_xehp_and_later.h"
void WalkerPartitionTests::SetUp() {
cmdBufferAddress = cmdBuffer;
testArgs.synchronizeBeforeExecution = false;
testArgs.nativeCrossTileAtomicSync = false;
testArgs.initializeWparidRegister = true;
testArgs.usePipeControlStall = true;
testArgs.crossTileAtomicSynchronization = true;
}
void WalkerPartitionTests::TearDown() {
auto initialCommandBufferPointer = cmdBuffer;
if (checkForProperCmdBufferAddressOffset) {
EXPECT_EQ(ptrDiff(cmdBufferAddress, initialCommandBufferPointer), totalBytesProgrammed);
}
}

View File

@ -0,0 +1,38 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/walker_partition_xehp_and_later.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "test.h"
using namespace WalkerPartition;
struct WalkerPartitionTests : public ::testing::Test {
void SetUp() override;
void TearDown() override;
template <typename GfxFamily>
auto createWalker(uint64_t postSyncAddress) {
WalkerPartition::COMPUTE_WALKER<GfxFamily> walker;
walker = GfxFamily::cmdInitGpgpuWalker;
walker.setPartitionType(COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA<GfxFamily>::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);
return walker;
}
char cmdBuffer[4096u];
WalkerPartition::WalkerPartitionArgs testArgs = {};
void *cmdBufferAddress = nullptr;
uint32_t totalBytesProgrammed = 0u;
bool checkForProperCmdBufferAddressOffset = true;
};
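A hypothetical usage sketch (not part of this commit) of how a test built on this fixture would exercise the new args-based entry points; the counts, the post-sync address, and the zero GPU base address are illustrative only:

HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenTwoPartitionsSketchThenBufferIsProgrammed) {
    testArgs.partitionCount = 2u;
    testArgs.tileCount = 2u;
    auto walker = createWalker<FamilyType>(0x8000u); // hypothetical post-sync address
    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(cmdBufferAddress,
                                                                              0u, // illustrative GPU base address
                                                                              &walker,
                                                                              totalBytesProgrammed,
                                                                              testArgs);
    EXPECT_NE(0u, totalBytesProgrammed); // TearDown then verifies pointer offset vs. bytes programmed
}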

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -14,3 +14,11 @@ using namespace NEO;
TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSupportEnabled) {
EXPECT_TRUE(ImplicitScaling::apiSupport);
}
TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSemaphoreProgrammingRequiredIsFalse) {
EXPECT_FALSE(ImplicitScaling::semaphoreProgrammingRequired);
}
TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenCrossTileAtomicSynchronization) {
EXPECT_TRUE(ImplicitScaling::crossTileAtomicSynchronization);
}
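For contrast, a hypothetical counterpart test (sketch only, not part of this commit) for the translation unit that compiles apiSupport = false, shown first in this diff, would assert the inverse:

TEST(ImplicitScalingApiTests, givenApiWithoutImplicitScalingSupportThenSupportDisabled) {
    EXPECT_FALSE(ImplicitScaling::apiSupport);
}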

View File

@ -266,8 +266,8 @@ ForceWorkgroupSize1x1x1 = -1
ForceThreadGroupDispatchSize = -1
ForceStatelessL1CachingPolicy = -1
ForceMemoryBankIndexOverride = -1
-ExperimentalSynchronizeWithSemaphores = -1
-ExperimentalForceCrossAtomicSynchronization = -1
+SynchronizeWithSemaphores = -1
+UseCrossAtomicSynchronization = -1
EnableStatelessCompression = -1
EnableMultiTileCompression = -1
EnablePrivateScratchSlot1 = -1
@ -313,7 +313,7 @@ OverrideUseKmdWaitFunction = -1
EnableCacheFlushAfterWalkerForAllQueues = -1
Force32BitDriverSupport = -1
OverrideCmdQueueSynchronousMode = -1
-ExperimentalUseAtomicsForNativeSectionCleanup = -1
+UseAtomicsForNativeSectionCleanup = -1
HBMSizePerTileInGigabytes = 0
OverrideSystolicPipelineSelect = -1
OverrideSystolicInComputeWalker = -1
@ -324,6 +324,9 @@ DoNotFreeResources = 0
OverrideGmmResourceUsageField = -1
LogAllocationType = 0
ProgramAdditionalPipeControlBeforeStateComputeModeCommand = 0
ProgramNativeCleanup = -1
WparidRegisterProgramming = -1
UsePipeControlAfterPartitionedWalker = -1
OverrideBufferSuitableForRenderCompression = -1
AllowMixingRegularAndCooperativeKernels = 0
AllowPatchingVfeStateInCommandLists = 0

View File

@ -34,12 +34,55 @@ bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() {
return synchronizeBeforeExecution;
}
bool ImplicitScalingHelper::isSemaphoreProgrammingRequired() {
auto semaphoreProgrammingRequired = ImplicitScaling::semaphoreProgrammingRequired;
if (NEO::DebugManager.flags.SynchronizeWithSemaphores.get() == 1) {
semaphoreProgrammingRequired = true;
}
return semaphoreProgrammingRequired;
}
bool ImplicitScalingHelper::isCrossTileAtomicRequired() {
auto crossTileAtomicSynchronization = ImplicitScaling::crossTileAtomicSynchronization;
if (NEO::DebugManager.flags.UseCrossAtomicSynchronization.get() == 0) {
crossTileAtomicSynchronization = false;
}
return crossTileAtomicSynchronization;
}
bool ImplicitScalingHelper::useAtomicsForNativeCleanup() {
bool useAtomics = false;
-int overrideUseAtomics = DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.get();
+int overrideUseAtomics = DebugManager.flags.UseAtomicsForNativeSectionCleanup.get();
if (overrideUseAtomics != -1) {
useAtomics = !!(overrideUseAtomics);
}
return useAtomics;
}
bool ImplicitScalingHelper::programNativeCleanup(bool defaultNativeCleanup) {
int overrideProgramNativeCleanup = DebugManager.flags.ProgramNativeCleanup.get();
if (overrideProgramNativeCleanup != -1) {
defaultNativeCleanup = !!(overrideProgramNativeCleanup);
}
return defaultNativeCleanup;
}
bool ImplicitScalingHelper::initWparidRegister() {
bool initWparidRegister = true;
int overrideInitWparidRegister = DebugManager.flags.WparidRegisterProgramming.get();
if (overrideInitWparidRegister != -1) {
initWparidRegister = !!(overrideInitWparidRegister);
}
return initWparidRegister;
}
bool ImplicitScalingHelper::usePipeControl() {
bool usePipeControl = true;
int overrideUsePipeControl = DebugManager.flags.UsePipeControlAfterPartitionedWalker.get();
if (overrideUsePipeControl != -1) {
usePipeControl = !!(overrideUsePipeControl);
}
return usePipeControl;
}
} // namespace NEO
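The pattern above is uniform: each helper starts from a compiled-in default and lets a DebugManager flag force either value. A minimal sketch of the expected behavior, assuming the repo's DebugManagerStateRestore test helper to restore flags afterwards; the assertions mirror the defaults in this file:

DebugManagerStateRestore restore;
EXPECT_TRUE(NEO::ImplicitScalingHelper::usePipeControl());  // default: enabled
NEO::DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
EXPECT_FALSE(NEO::ImplicitScalingHelper::usePipeControl()); // flag forces it off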

View File

@ -15,12 +15,22 @@ class LinearStream;
namespace ImplicitScaling {
extern bool apiSupport;
-}
+extern bool semaphoreProgrammingRequired;
+extern bool crossTileAtomicSynchronization;
+constexpr uint32_t partitionAddressOffsetDwords = 2u;
+constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords;
+} // namespace ImplicitScaling
struct ImplicitScalingHelper {
static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition);
static bool isSemaphoreProgrammingRequired();
static bool isCrossTileAtomicRequired();
static bool isSynchronizeBeforeExecutionRequired();
static bool useAtomicsForNativeCleanup();
static bool programNativeCleanup(bool defaultNativeCleanup);
static bool initWparidRegister();
static bool usePipeControl();
};
template <typename GfxFamily>

View File

@ -20,6 +20,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
bool staticPartitioning = false;
const uint32_t tileCount = static_cast<uint32_t>(devices.count());
const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
preferStaticPartitioning,
groupStart,
@ -28,15 +29,21 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
&partitionType,
&staticPartitioning);
UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
+WalkerPartition::WalkerPartitionArgs args = {};
-auto synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
-const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
-return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(false,
-16u,
-synchronizeBeforeExecution,
-nativeCrossTileAtomicSync,
-staticPartitioning,
-useAtomicsForNativeCleanup));
+args.partitionCount = partitionCount;
+args.tileCount = tileCount;
+args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
+args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
+args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
+args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
+args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
+args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
+args.emitBatchBufferEnd = false;
+args.staticPartitioning = staticPartitioning;
+return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
}
template <typename GfxFamily>
@ -54,36 +61,43 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
bool staticPartitioning = false;
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
-const bool synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
-const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+WalkerPartition::WalkerPartitionArgs args = {};
+args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
+args.partitionCount = partitionCount;
+args.tileCount = tileCount;
+args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
+args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
+args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
+args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
+args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
+args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
+args.emitBatchBufferEnd = false;
+args.secondaryBatchBuffer = useSecondaryBatchBuffer;
+args.staticPartitioning = staticPartitioning;
if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
&walkerCmd,
totalProgrammedSize,
-partitionCount,
-tileCount,
-synchronizeBeforeExecution,
-useSecondaryBatchBuffer,
-nativeCrossTileAtomicSync,
-workPartitionAllocationGpuVa,
-useAtomicsForNativeCleanup);
+args);
} else {
if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
if (partitionCount == 1u) {
walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
}
args.partitionCount = partitionCount;
}
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
-&walkerCmd, totalProgrammedSize,
-partitionCount, tileCount,
-false, synchronizeBeforeExecution, useSecondaryBatchBuffer,
-nativeCrossTileAtomicSync,
-useAtomicsForNativeCleanup);
+&walkerCmd,
+totalProgrammedSize,
+args);
}
commandStream.getSpace(totalProgrammedSize);
}

View File

@ -18,6 +18,22 @@
namespace WalkerPartition {
struct WalkerPartitionArgs {
uint64_t workPartitionAllocationGpuVa = 0;
uint32_t partitionCount = 0;
uint32_t tileCount = 0;
bool emitBatchBufferEnd = false;
bool secondaryBatchBuffer = false;
bool synchronizeBeforeExecution = false;
bool crossTileAtomicSynchronization = false;
bool semaphoreProgrammingRequired = false;
bool staticPartitioning = false;
bool nativeCrossTileAtomicSync = false;
bool useAtomicsForNativeCleanup = false;
bool initializeWparidRegister = false;
bool usePipeControlStall = false;
};
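This struct is the core of the refactor: call sites that previously passed up to nine positional bool/integer parameters now set named fields. A minimal sketch of a caller (the counts here are illustrative, not from this commit):

WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2u;
args.tileCount = 2u;
args.staticPartitioning = true;
args.initializeWparidRegister = true;
args.usePipeControlStall = true;
// All remaining fields keep their zero-initialized defaults.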
template <typename GfxFamily>
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
template <typename GfxFamily>
@ -73,22 +89,6 @@ Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
return commandToReturn;
}
-bool inline isSemaphoreProgrammingRequired() {
-auto semaphoreProgrammingRequired = false;
-if (NEO::DebugManager.flags.ExperimentalSynchronizeWithSemaphores.get() == 1) {
-semaphoreProgrammingRequired = true;
-}
-return semaphoreProgrammingRequired;
-}
-bool inline isCrossTileAtomicRequired() {
-auto crossTileAtomicSynchronization = true;
-if (NEO::DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.get() == 0) {
-crossTileAtomicSynchronization = false;
-}
-return crossTileAtomicSynchronization;
-}
template <typename GfxFamily>
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
bool preferStaticPartitioning,
@ -349,25 +349,55 @@ void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProg
*storeDataImmediate = cmd;
}
template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
return sizeof(MI_ATOMIC<GfxFamily>);
} else {
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
}
}
template <typename GfxFamily>
void programNativeCrossTileSyncControl(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t finalSyncTileCountField,
uint64_t address,
bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
programMiAtomic<GfxFamily>(inputAddress,
totalBytesProgrammed,
finalSyncTileCountField,
address,
false,
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
} else {
programStoreMemImmediateDword<GfxFamily>(inputAddress,
totalBytesProgrammed,
finalSyncTileCountField,
address,
0u);
}
}
template <typename GfxFamily>
uint64_t computeTilesSynchronizationWithAtomicsSectionSize() {
return sizeof(MI_ATOMIC<GfxFamily>) +
sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
}
template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
uint32_t &totalBytesProgrammed,
uint64_t atomicAddress,
uint32_t tileCount) {
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}
template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}
template <typename GfxFamily>
void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t &totalBytesProgrammed,
@ -377,28 +407,18 @@ void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t tileCount,
bool useAtomicsForNativeCleanup) {
// Synchronize tiles, so the fields are not cleared while still in use
-programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount);
for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
-if (useAtomicsForNativeCleanup) {
-programMiAtomic<GfxFamily>(inputAddress,
-totalBytesProgrammed,
-addressForCleanup,
-false,
-MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
-} else {
-programStoreMemImmediateDword<GfxFamily>(inputAddress,
+programNativeCrossTileSyncControl<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
-0u);
-}
+useAtomicsForNativeCleanup);
}
//this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
-programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount);
}
template <typename GfxFamily>
@ -412,15 +432,6 @@ void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
}
}
-template <typename GfxFamily>
-void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
-uint32_t &totalBytesProgrammed,
-uint64_t atomicAddress,
-uint32_t tileCount) {
-programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
-}
template <typename GfxFamily>
uint64_t computeWalkerSectionSize() {
return sizeof(BATCH_BUFFER_START<GfxFamily>) +
@ -428,46 +439,30 @@ uint64_t computeWalkerSectionSize() {
}
template <typename GfxFamily>
-uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
-if (useAtomicsForNativeCleanup) {
-return sizeof(MI_ATOMIC<GfxFamily>);
-} else {
-return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
-}
-}
-template <typename GfxFamily>
-uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
-return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
-2 * sizeof(MI_ATOMIC<GfxFamily>) +
-2 * sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
-}
-template <typename GfxFamily>
-uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
-auto synchronizationCount = (synchronizeBeforeExecution) ? 2u : 1u;
-if (!isCrossTileAtomicRequired() && !nativeCrossTileAtomicSync) {
-synchronizationCount--;
-}
-return sizeof(LOAD_REGISTER_IMM<GfxFamily>) +
-sizeof(MI_ATOMIC<GfxFamily>) * (1u + synchronizationCount) +
-sizeof(LOAD_REGISTER_REG<GfxFamily>) +
-sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
-sizeof(BATCH_BUFFER_START<GfxFamily>) * 2 +
-sizeof(PIPE_CONTROL<GfxFamily>) +
-sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * synchronizationCount +
-(isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u) +
-computeWalkerSectionSize<GfxFamily>() +
-(nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u);
-}
+uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
+uint64_t size = 0u;
+size += args.synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() : 0;
+size += sizeof(LOAD_REGISTER_IMM<GfxFamily>); //predication mask
+size += sizeof(MI_ATOMIC<GfxFamily>); //current id for partition
+size += sizeof(LOAD_REGISTER_REG<GfxFamily>); //id into register
+size += sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
+sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
+size += (args.semaphoreProgrammingRequired ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount : 0u);
+size += computeWalkerSectionSize<GfxFamily>();
+size += args.usePipeControlStall ? sizeof(PIPE_CONTROL<GfxFamily>) : 0u;
+if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
+size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
+}
+if (args.nativeCrossTileAtomicSync) {
+size += computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup);
+}
+return size;
+}
template <typename GfxFamily>
-uint64_t computeWalkerSectionStart(uint32_t partitionCount,
-bool synchronizeBeforeExecution,
-bool nativeCrossTileAtomicSync,
-bool useAtomicsForNativeCleanup) {
-return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup) -
+uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
+return computeControlSectionOffset<GfxFamily>(args) -
computeWalkerSectionSize<GfxFamily>();
}
@ -537,26 +532,17 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
uint32_t &totalBytesProgrammed,
-uint32_t partitionCount,
-uint32_t tileCount,
-bool emitBatchBufferEnd,
-bool synchronizeBeforeExecution,
-bool secondaryBatchBuffer,
-bool nativeCrossTileAtomicSync,
-bool useAtomicsForNativeCleanup) {
+WalkerPartitionArgs &args) {
totalBytesProgrammed = 0u;
void *currentBatchBufferPointer = cpuPointer;
-auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
-if (synchronizeBeforeExecution) {
+auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(args);
+if (args.synchronizeBeforeExecution) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
-programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-//if all tiles hit the atomic, it means we may go further
-programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}
-programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, partitionCount);
+programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);
programMiAtomic<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
@ -573,36 +559,32 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
gpuAddressOfAllocation +
-computeWalkerSectionStart<GfxFamily>(partitionCount,
-synchronizeBeforeExecution,
-nativeCrossTileAtomicSync,
-useAtomicsForNativeCleanup),
+computeWalkerSectionStart<GfxFamily>(args),
true,
-secondaryBatchBuffer);
+args.secondaryBatchBuffer);
//disable predication to not noop subsequent commands.
programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);
-if (nativeCrossTileAtomicSync) {
+if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
-programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
+programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
}
-programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
+if (args.usePipeControlStall) {
+programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
+}
-if (isSemaphoreProgrammingRequired()) {
+if (args.semaphoreProgrammingRequired) {
auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
-for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
+for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
}
}
-if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
+if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
-programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-//if all tiles hit the atomic, it means we may go further
-programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}
//this bb start goes to the end of partitioned command buffer
@ -611,12 +593,12 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed,
gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
false,
-secondaryBatchBuffer);
+args.secondaryBatchBuffer);
//Walker section
-programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
+programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
-programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, secondaryBatchBuffer);
+programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);
auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
controlSection->partitionCount = 0u;
@ -626,18 +608,18 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed += sizeof(BatchBufferControlData);
currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));
-if (nativeCrossTileAtomicSync) {
+if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
dynamicPartitioningFieldsForCleanupCount,
-tileCount,
-useAtomicsForNativeCleanup);
+args.tileCount,
+args.useAtomicsForNativeCleanup);
}
-if (emitBatchBufferEnd) {
+if (args.emitBatchBufferEnd) {
auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
*batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
}
@ -651,14 +633,28 @@ struct StaticPartitioningControlSection {
static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;
template <typename GfxFamily>
-uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
-const auto beforeExecutionSyncAtomicSize = synchronizeBeforeExecution ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
-const auto afterExecutionSyncAtomicSize = (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
-const auto afterExecutionSyncPostSyncSize = isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u;
-const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u;
+uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) {
+const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
+? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
+: 0u;
+const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync)
+? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
+: 0u;
+const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
+? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount
+: 0u;
+const auto nativeCrossTileSyncSize = args.nativeCrossTileAtomicSync
+? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup)
+: 0u;
+const auto wparidRegisterSize = args.initializeWparidRegister
+? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
+: 0u;
+const auto pipeControlSize = args.usePipeControlStall
+? sizeof(PIPE_CONTROL<GfxFamily>)
+: 0u;
return beforeExecutionSyncAtomicSize +
-sizeof(LOAD_REGISTER_MEM<GfxFamily>) +
-sizeof(PIPE_CONTROL<GfxFamily>) +
+wparidRegisterSize +
+pipeControlSize +
sizeof(COMPUTE_WALKER<GfxFamily>) +
nativeCrossTileSyncSize +
afterExecutionSyncAtomicSize +
@ -671,49 +667,48 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
uint32_t &totalBytesProgrammed,
-uint32_t partitionCount,
-uint32_t tileCount,
-bool synchronizeBeforeExecution,
-bool secondaryBatchBuffer,
-bool nativeCrossTileAtomicSync,
-uint64_t workPartitionAllocationGpuVa,
-bool useAtomicsForNativeCleanup) {
+WalkerPartitionArgs &args) {
totalBytesProgrammed = 0u;
void *currentBatchBufferPointer = cpuPointer;
// Get address of the control section
-const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
+const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);
// Synchronize tiles before walker
-if (synchronizeBeforeExecution) {
+if (args.synchronizeBeforeExecution) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
-programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
+programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}
// Load partition ID to wparid register and execute walker
-programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, workPartitionAllocationGpuVa, wparidCCSOffset);
-programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
+if (args.initializeWparidRegister) {
+programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
+}
+programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
// Prepare for cleanup section
-if (nativeCrossTileAtomicSync) {
+if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
-programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
+programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
}
-programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
+if (args.usePipeControlStall) {
+programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
+}
// Synchronize tiles after walker
-if (isSemaphoreProgrammingRequired()) {
-programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
+if (args.semaphoreProgrammingRequired) {
+programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
-if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
+if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
-programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
+programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}
// Jump over the control section
-programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, secondaryBatchBuffer);
+programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
// Control section
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
@ -724,35 +719,31 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
// Cleanup section
-if (nativeCrossTileAtomicSync) {
+if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
-tileCount,
-useAtomicsForNativeCleanup);
+args.tileCount,
+args.useAtomicsForNativeCleanup);
}
}
template <typename GfxFamily>
-uint64_t estimateSpaceRequiredInCommandBuffer(bool requiresBatchBufferEnd,
-uint32_t partitionCount,
-bool synchronizeBeforeExecution,
-bool nativeCrossTileAtomicSync,
-bool staticPartitioning,
-bool useAtomicsForNativeCleanup) {
+uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
uint64_t size = {};
-if (staticPartitioning) {
-size += computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
+if (args.staticPartitioning) {
+size += computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
size += sizeof(StaticPartitioningControlSection);
-size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
+size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
} else {
-size += computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
+size += computeControlSectionOffset<GfxFamily>(args);
size += sizeof(BatchBufferControlData);
-size += requiresBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
-size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
+size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
+size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
}
}
return size;
}
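Taken together, the new entry points make every call site a two-phase flow: fill one WalkerPartitionArgs, size the buffer with estimateSpaceRequiredInCommandBuffer, then build into it. A hedged sketch of that flow (allocation handling simplified; buildPartitionedWalkerSketch and gpuBaseAddress are illustrative names, not part of this commit; requires <vector>):

template <typename GfxFamily>
void buildPartitionedWalkerSketch(WalkerPartition::COMPUTE_WALKER<GfxFamily> *walker, uint64_t gpuBaseAddress) {
    WalkerPartition::WalkerPartitionArgs args = {};
    args.partitionCount = 4u;
    args.tileCount = 4u;
    args.initializeWparidRegister = true;
    args.usePipeControlStall = true;
    args.crossTileAtomicSynchronization = true;

    // Phase 1: size the command buffer from the same args used to build it.
    auto requiredSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args);
    std::vector<char> cpuBuffer(static_cast<size_t>(requiredSize));

    // Phase 2: construct the partitioned command buffer into that space.
    uint32_t totalBytesProgrammed = 0u;
    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(cpuBuffer.data(),
                                                                             gpuBaseAddress,
                                                                             walker,
                                                                             totalBytesProgrammed,
                                                                             args);
    UNRECOVERABLE_IF(totalBytesProgrammed > requiredSize); // the estimate is an upper bound
}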

View File

@ -123,15 +123,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFENumberOfWalkers, -1, "Set Number of Walkers i
DECLARE_DEBUG_VARIABLE(int32_t, CFEMaximumNumberOfThreads, -1, "Set Maximum Number of Threads in CFE_STATE on XEHP, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch Control in CFE_STATE on XEHP, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
-DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distrubution")
-DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
DECLARE_DEBUG_VARIABLE(int32_t, ForceWorkgroupSize1x1x1, -1, "-1: default, 0: disable, 1: enable, force workgroup size 1x1x1 in builtins")
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSynchronizeWithSemaphores, -1, "Experimental implementation: 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalForceCrossAtomicSynchronization, -1, "Experimental implementation: 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
DECLARE_DEBUG_VARIABLE(int32_t, DisablePipeControlPrecedingPostSyncCommand, -1, "-1 default - disabled adding PIPE_CONTROL, 0 - disabled adding PIPE_CONTROL, 1 - enabled adding PIPE_CONTROL")
DECLARE_DEBUG_VARIABLE(int32_t, UseCachingPolicyForIndirectObjectHeap, -1, "Use selected caching policy for IOH, -1 - default, 0 - Uncached, 1 - L3 Caching, 2 - L1 Caching")
@ -142,13 +138,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuPartialWrites, -1, "-1: default - 0
DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomicsInComputeMode, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics bit in STATE_COMPUTE_MODE, 1: program value 1 in MultiGpuAtomics bit in STATE_COMPUTE_MODE")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomics, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics controls 1: program value 1 in MultiGpuAtomics controls")
DECLARE_DEBUG_VARIABLE(int32_t, ForceBufferCompressionFormat, -1, "-1: default, >0: Format value")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHwGenerationLocalIds, -1, "-1: default, 0: disable, 1: enable : Enables generation of local ids on HW")
DECLARE_DEBUG_VARIABLE(int32_t, WalkerPartitionPreferHighestDimension, -1, "-1: default, 0: prefer biggest dimension, 1: prefer Z over Y over X if they divide partition count evenly")
DECLARE_DEBUG_VARIABLE(int32_t, SetMinimalPartitionSize, -1, "-1 default value set to 512 workgroups, 0 - disabled, >0 - minimal partition size in workgroups (should be power of 2)")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterTargetMemory, -1, "-1:default 0: overwrites to System 1: overwrites to Local")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterMocs, -1, "-1: default, >=0 SetGivenMocsInBlitterTransfers")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, OverridePostSyncMocs, -1, "-1: default, >=0 Override post sync mocs with value")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateVmBindExt, -1, "Use immediate bind extension to a new residency model on Linux (requires kernel support), -1: default (enabled with direct submission), 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, ForceExecutionTile, -1, "-1: default, 0+: given tile is chosen as submission, must be used with EnableWalkerPartition = 0.")
@ -237,11 +231,21 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Ove
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands")
-DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmissionController, -1, "Enable direct submission terminating after given timeout, -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerTimeout, -1, "Set direct submission controller timeout, -1: default 5 ms, >=0: timeout in ms")
/* IMPLICIT SCALING */
DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distribution")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWithSemaphores, -1, "-1: default (disabled), 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
DECLARE_DEBUG_VARIABLE(int32_t, UseCrossAtomicSynchronization, -1, "-1: default (enabled), 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, UseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramNativeCleanup, -1, "-1: default (API dependent), 0: Do not program native cleanup, 1: program native cleanup")
DECLARE_DEBUG_VARIABLE(int32_t, WparidRegisterProgramming, -1, "-1: default (enabled), 0: do not program wparid register, 1: programming wparid register")
DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlAfterPartitionedWalker, -1, "-1: default (enabled), 0: do not add PipeControl, 1: add PipeControl")
/*FEATURE FLAGS*/
DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(bool, EnableNV12, true, "Enables NV12 extension")
DECLARE_DEBUG_VARIABLE(bool, EnablePackedYuv, true, "Enables cl_packed_yuv extension")
DECLARE_DEBUG_VARIABLE(bool, EnableDeferredDeleter, true, "Enables async deleter")
@ -305,8 +309,9 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicPipelineSelect, -1, "set SYSTOLI
DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOLIC MODE ENABLE in COMPUTE_WALKER cmd, -1:default, 0:disable, 1:enable")
/*EXPERIMENTAL TOGGLES*/
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalUseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
/*DRIVER TOGGLES*/
DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")

View File

@ -1026,7 +1026,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
uint32_t expectedPartitionSize = (dims[0] + partitionCount - 1u) / partitionCount;
EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize());
-auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(partitionCount, false, true, false);
+WalkerPartition::WalkerPartitionArgs args = {};
+args.initializeWparidRegister = true;
+args.usePipeControlStall = true;
+args.partitionCount = partitionCount;
+args.nativeCrossTileAtomicSync = true;
+auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(args);
uint64_t expectedCleanupGpuVa = cmdContainer->getCommandStream()->getGraphicsAllocation()->getGpuAddress() +
cleanupSectionOffset;
constexpr uint32_t expectedData = 0ull;

View File

@ -51,11 +51,85 @@ TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForNativeCle
}
TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) {
-DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(0);
+DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}
TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectTrue) {
-DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(1);
+DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsIsFalseWhenCheckingProgramNativeCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(false));
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramNativeCleanupThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(true));
}
TEST_F(ImplicitScalingTests, givenForceNotProgramNativeCleanupWhenDefaultNativeCleanupIsTrueThenExpectFalse) {
DebugManager.flags.ProgramNativeCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(true));
}
TEST_F(ImplicitScalingTests, givenForceProgramNativeCleanupWhenDefaultNativeCleanupIsFalseThenExpectTrue) {
DebugManager.flags.ProgramNativeCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(false));
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToProgramWparidRegisterThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}
TEST_F(ImplicitScalingTests, givenForceNotProgramWparidRegisterWhenCheckingRegisterProgramThenExpectFalse) {
DebugManager.flags.WparidRegisterProgramming.set(0);
EXPECT_FALSE(ImplicitScalingHelper::initWparidRegister());
}
TEST_F(ImplicitScalingTests, givenForceProgramWparidRegisterWhenCheckingRegisterProgramThenExpectTrue) {
DebugManager.flags.WparidRegisterProgramming.set(1);
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToUsePipeControlThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}
TEST_F(ImplicitScalingTests, givenForceNotUsePipeControlWhenCheckingPipeControlUseThenExpectFalse) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
EXPECT_FALSE(ImplicitScalingHelper::usePipeControl());
}
TEST_F(ImplicitScalingTests, givenForceUsePipeControlWhenCheckingPipeControlUseThenExpectTrue) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(1);
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingSemaphoreUseThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}
TEST_F(ImplicitScalingTests, givenForceSemaphoreNotUseWhenCheckingSemaphoreUseThenExpectFalse) {
DebugManager.flags.SynchronizeWithSemaphores.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}
TEST_F(ImplicitScalingTests, givenForceSemaphoreUseWhenCheckingSemaphoreUseThenExpectTrue) {
DebugManager.flags.SynchronizeWithSemaphores.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}
TEST_F(ImplicitScalingTests, givenForceDisableWhenCheckingCrossTileAtomicSyncThenExpectFalse) {
DebugManager.flags.UseCrossAtomicSynchronization.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}
TEST_F(ImplicitScalingTests, givenForceEnableWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
DebugManager.flags.UseCrossAtomicSynchronization.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

View File

@ -232,3 +232,111 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
ASSERT_NE(itorLrm, loadRegisterMemList.end());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledWparidRegisterThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
DebugManager.flags.WparidRegisterProgramming.set(0);
uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);
WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
GenCmdList loadRegisterMemList = hwParser.getCommandsList<MI_LOAD_REGISTER_MEM>();
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
EXPECT_EQ(itorLrm, loadRegisterMemList.end());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);
WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
uint64_t workPartitionAllocationAddress = 0x0;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);
WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(32);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}