Rename functions and variables in Implicit Scaling

Related-To: NEO-6244

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Authored by Zbigniew Zdanowicz on 2021-09-16 12:11:22 +00:00
committed by Compute-Runtime-Automation
parent eace896ec8
commit eda3531729
12 changed files with 160 additions and 157 deletions
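
The hunks below apply one consistent rename: the former "native cross tile sync/cleanup" wording becomes "self cleanup" across helpers, struct fields, and debug variables. As a quick reference, a sketch of the renamed API with the old names in comments; every identifier is taken from the diff itself, and the struct bodies are abbreviated to the renamed members only.

    // ImplicitScalingHelper entry points renamed by this commit.
    struct ImplicitScalingHelper {
        static bool isAtomicsUsedForSelfCleanup();                  // was useAtomicsForNativeCleanup()
        static bool isSelfCleanupRequired(bool defaultSelfCleanup); // was programNativeCleanup(bool)
        static bool isWparidRegisterInitializationRequired();       // was initWparidRegister()
        static bool isPipeControlStallRequired();                   // was usePipeControl()
    };

    // WalkerPartitionArgs fields renamed by this commit.
    struct WalkerPartitionArgs {
        bool emitSelfCleanup = false;          // was nativeCrossTileAtomicSync
        bool useAtomicsForSelfCleanup = false; // was useAtomicsForNativeCleanup
        bool emitPipeControlStall = false;     // was usePipeControlStall
    };

    // Walker partition helpers and debug variables renamed by this commit:
    //   computeNativeCrossTileSyncControlSectionSize -> computeSelfCleanupSectionSize
    //   programNativeCrossTileSyncControl            -> programSelfCleanupSection
    //   computeNativeCrossTileSyncCleanupSectionSize -> computeSelfCleanupEndSectionSize
    //   programNativeCrossTileSyncCleanup            -> programSelfCleanupEndSection
    //   UseAtomicsForNativeSectionCleanup            -> UseAtomicsForSelfCleanupSection
    //   ProgramNativeCleanup                         -> ProgramWalkerPartitionSelfCleanup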

@@ -1088,7 +1088,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPart
WalkerPartition::WalkerPartitionArgs testArgs = {};
testArgs.initializeWparidRegister = true;
testArgs.crossTileAtomicSynchronization = true;
testArgs.usePipeControlStall = true;
testArgs.emitPipeControlStall = true;
testArgs.partitionCount = 2u;
testArgs.tileCount = static_cast<uint32_t>(device->getDeviceBitfield().count());
@@ -1171,7 +1171,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenQueueIsMul
WalkerPartition::WalkerPartitionArgs testArgs = {};
testArgs.initializeWparidRegister = true;
testArgs.usePipeControlStall = true;
testArgs.emitPipeControlStall = true;
testArgs.crossTileAtomicSynchronization = true;
testArgs.partitionCount = 16u;
testArgs.tileCount = static_cast<uint32_t>(device->getDeviceBitfield().count());
@@ -1386,7 +1386,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenDispatchPr
EXPECT_EQ(0u, cmdStream.getUsed());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenOpenClWhenEnqueuePartitionWalkerThenExpectNoNativeCrossTileSyncCleanup) {
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenOpenClWhenEnqueuePartitionWalkerThenExpectNoSelfCleanupSection) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
if (!OSInterface::osEnableLocalMemory) {

@@ -11,9 +11,9 @@ void WalkerPartitionTests::SetUp() {
cmdBufferAddress = cmdBuffer;
testArgs.synchronizeBeforeExecution = false;
testArgs.nativeCrossTileAtomicSync = false;
testArgs.emitSelfCleanup = false;
testArgs.initializeWparidRegister = true;
testArgs.usePipeControlStall = true;
testArgs.emitPipeControlStall = true;
testArgs.crossTileAtomicSynchronization = true;
}

@@ -416,10 +416,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionAnd
EXPECT_EQ(parsedOffset, totalBytesProgrammed);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithSelfCleanupWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) {
testArgs.tileCount = 4u;
testArgs.partitionCount = testArgs.tileCount;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.emitSelfCleanup = true;
testArgs.staticPartitioning = true;
checkForProperCmdBufferAddressOffset = false;
@@ -564,11 +564,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWit
EXPECT_EQ(parsedOffset, totalBytesProgrammed);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncAndSyncDisabledWithFlagWhenConstructCommandBufferIsCalledThenStillProgramTheSync) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithSelfCleanupAndCrossTileSyncDisabledWithFlagWhenConstructCommandBufferIsCalledThenStillProgramTheSync) {
testArgs.crossTileAtomicSynchronization = false;
testArgs.tileCount = 4u;
testArgs.partitionCount = testArgs.tileCount;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.emitSelfCleanup = true;
testArgs.staticPartitioning = true;
checkForProperCmdBufferAddressOffset = false;
uint64_t cmdBufferGpuAddress = 0x8000123000;
@@ -712,11 +712,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWit
EXPECT_EQ(parsedOffset, totalBytesProgrammed);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncAndAtomicsForNativeWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithSelfCleanupAndAtomicsForSelfCleanupWhenConstructCommandBufferIsCalledThenBatchBufferIsBeingProgrammed) {
testArgs.tileCount = 4u;
testArgs.partitionCount = testArgs.tileCount;
testArgs.useAtomicsForNativeCleanup = true;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.useAtomicsForSelfCleanup = true;
testArgs.emitSelfCleanup = true;
testArgs.staticPartitioning = true;
checkForProperCmdBufferAddressOffset = false;
uint64_t cmdBufferGpuAddress = 0x8000123000;
@@ -866,12 +866,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWit
EXPECT_EQ(parsedOffset, totalBytesProgrammed);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithNativeCrossTileSyncAndSyncDisabledWithFlagWhenUsingAtomicForNativeAndConstructCommandBufferIsCalledThenStillProgramTheSync) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWithSelfCleanupAndCrossTileSyncDisabledWithFlagWhenUsingAtomicForSelfCleanupAndConstructCommandBufferIsCalledThenStillProgramTheSync) {
testArgs.crossTileAtomicSynchronization = false;
testArgs.tileCount = 4u;
testArgs.partitionCount = testArgs.tileCount;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.useAtomicsForNativeCleanup = true;
testArgs.emitSelfCleanup = true;
testArgs.useAtomicsForSelfCleanup = true;
testArgs.staticPartitioning = true;
checkForProperCmdBufferAddressOffset = false;
uint64_t cmdBufferGpuAddress = 0x8000123000;
@@ -1159,9 +1159,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe
testArgs.tileCount = 4u;
testArgs.partitionCount = testArgs.tileCount;
testArgs.initializeWparidRegister = false;
testArgs.nativeCrossTileAtomicSync = false;
testArgs.emitSelfCleanup = false;
testArgs.crossTileAtomicSynchronization = false;
testArgs.useAtomicsForNativeCleanup = false;
testArgs.useAtomicsForSelfCleanup = false;
testArgs.staticPartitioning = true;
checkForProperCmdBufferAddressOffset = false;
@@ -1218,10 +1218,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhenPipeControlProgrammingDisabledThenExpectNoPipeControlCommand) {
testArgs.tileCount = 4u;
testArgs.partitionCount = testArgs.tileCount;
testArgs.nativeCrossTileAtomicSync = false;
testArgs.usePipeControlStall = false;
testArgs.emitSelfCleanup = false;
testArgs.emitPipeControlStall = false;
testArgs.crossTileAtomicSynchronization = false;
testArgs.useAtomicsForNativeCleanup = false;
testArgs.useAtomicsForSelfCleanup = false;
testArgs.staticPartitioning = true;
checkForProperCmdBufferAddressOffset = false;

@@ -148,11 +148,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningEstima
estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs));
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationNativeSectionsWhenItIsCalledThenProperSizeIsReturned) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationSelfCleanupSectionsWhenItIsCalledThenProperSizeIsReturned) {
testArgs.partitionCount = 16u;
testArgs.emitBatchBufferEnd = false;
testArgs.synchronizeBeforeExecution = false;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.emitSelfCleanup = true;
auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) * 2 +
@@ -172,12 +172,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationNativeSections
estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs));
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationNativeSectionsWhenAtomicsUsedForNativeThenProperSizeIsReturned) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenEstimationSelfCleanupSectionsWhenAtomicsUsedForSelfCleanupThenProperSizeIsReturned) {
testArgs.partitionCount = 16u;
testArgs.emitBatchBufferEnd = false;
testArgs.synchronizeBeforeExecution = false;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.useAtomicsForNativeCleanup = true;
testArgs.emitSelfCleanup = true;
testArgs.useAtomicsForSelfCleanup = true;
auto expectedUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) * 2 +
@@ -828,11 +828,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitioningWhenZD
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_Z, walker.getPartitionType());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenNativeCrossTileSyncWhenDebugForceDisableCrossTileSyncThenNativeOverridesDebugAndAddsOwnCleanupSection) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenSelfCleanupSectionWhenDebugForceDisableCrossTileSyncThenSelfCleanupOverridesDebugAndAddsOwnCleanupSection) {
testArgs.crossTileAtomicSynchronization = false;
testArgs.partitionCount = 16u;
checkForProperCmdBufferAddressOffset = false;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.emitSelfCleanup = true;
uint64_t gpuVirtualAddress = 0x8000123000;
uint64_t postSyncAddress = 0x8000456000;
WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
@@ -1028,12 +1028,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenNativeCrossTileSyncWhenD
EXPECT_EQ(miSemaphoreWait->getSemaphoreDataDword(), 2 * testArgs.tileCount);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenNativeCrossTileSyncAndAtomicsUsedForNativeWhenDebugForceDisableCrossTileSyncThenNativeOverridesDebugAndAddsOwnCleanupSection) {
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenSelfCleanupAndAtomicsUsedForCleanupWhenDebugForceDisableCrossTileSyncThenSelfCleanupOverridesDebugAndAddsOwnCleanupSection) {
testArgs.crossTileAtomicSynchronization = false;
testArgs.partitionCount = 16u;
checkForProperCmdBufferAddressOffset = false;
testArgs.nativeCrossTileAtomicSync = true;
testArgs.useAtomicsForNativeCleanup = true;
testArgs.emitSelfCleanup = true;
testArgs.useAtomicsForSelfCleanup = true;
uint64_t gpuVirtualAddress = 0x8000123000;
uint64_t postSyncAddress = 0x8000456000;
WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
@@ -1240,9 +1240,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenDynamicPartitioningWhenP
testArgs.crossTileAtomicSynchronization = false;
testArgs.partitionCount = 16u;
testArgs.tileCount = 4u;
testArgs.nativeCrossTileAtomicSync = false;
testArgs.useAtomicsForNativeCleanup = false;
testArgs.usePipeControlStall = false;
testArgs.emitSelfCleanup = false;
testArgs.useAtomicsForSelfCleanup = false;
testArgs.emitPipeControlStall = false;
checkForProperCmdBufferAddressOffset = false;
uint64_t gpuVirtualAddress = 0x8000123000;

@@ -313,7 +313,7 @@ OverrideUseKmdWaitFunction = -1
EnableCacheFlushAfterWalkerForAllQueues = -1
Force32BitDriverSupport = -1
OverrideCmdQueueSynchronousMode = -1
UseAtomicsForNativeSectionCleanup = -1
UseAtomicsForSelfCleanupSection = -1
HBMSizePerTileInGigabytes = 0
OverrideSystolicPipelineSelect = -1
OverrideSystolicInComputeWalker = -1
@@ -324,7 +324,7 @@ DoNotFreeResources = 0
OverrideGmmResourceUsageField = -1
LogAllocationType = 0
ProgramAdditionalPipeControlBeforeStateComputeModeCommand = 0
ProgramNativeCleanup = -1
ProgramWalkerPartitionSelfCleanup = -1
WparidRegisterProgramming = -1
UsePipeControlAfterPartitionedWalker = -1
OverrideBufferSuitableForRenderCompression = -1

@@ -28,46 +28,49 @@ bool ImplicitScalingHelper::isImplicitScalingEnabled(const DeviceBitfield &devic
bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() {
auto synchronizeBeforeExecution = false;
if (DebugManager.flags.SynchronizeWalkerInWparidMode.get() != -1) {
synchronizeBeforeExecution = static_cast<bool>(DebugManager.flags.SynchronizeWalkerInWparidMode.get());
int overrideSynchronizeBeforeExecution = DebugManager.flags.SynchronizeWalkerInWparidMode.get();
if (overrideSynchronizeBeforeExecution != -1) {
synchronizeBeforeExecution = !!overrideSynchronizeBeforeExecution;
}
return synchronizeBeforeExecution;
}
bool ImplicitScalingHelper::isSemaphoreProgrammingRequired() {
auto semaphoreProgrammingRequired = ImplicitScaling::semaphoreProgrammingRequired;
if (NEO::DebugManager.flags.SynchronizeWithSemaphores.get() == 1) {
semaphoreProgrammingRequired = true;
int overrideSemaphoreProgrammingRequired = NEO::DebugManager.flags.SynchronizeWithSemaphores.get();
if (overrideSemaphoreProgrammingRequired != -1) {
semaphoreProgrammingRequired = !!overrideSemaphoreProgrammingRequired;
}
return semaphoreProgrammingRequired;
}
bool ImplicitScalingHelper::isCrossTileAtomicRequired() {
auto crossTileAtomicSynchronization = ImplicitScaling::crossTileAtomicSynchronization;
if (NEO::DebugManager.flags.UseCrossAtomicSynchronization.get() == 0) {
crossTileAtomicSynchronization = false;
int overrideCrossTileAtomicSynchronization = NEO::DebugManager.flags.UseCrossAtomicSynchronization.get();
if (overrideCrossTileAtomicSynchronization != -1) {
crossTileAtomicSynchronization = !!overrideCrossTileAtomicSynchronization;
}
return crossTileAtomicSynchronization;
}
bool ImplicitScalingHelper::useAtomicsForNativeCleanup() {
bool ImplicitScalingHelper::isAtomicsUsedForSelfCleanup() {
bool useAtomics = false;
int overrideUseAtomics = DebugManager.flags.UseAtomicsForNativeSectionCleanup.get();
int overrideUseAtomics = DebugManager.flags.UseAtomicsForSelfCleanupSection.get();
if (overrideUseAtomics != -1) {
useAtomics = !!(overrideUseAtomics);
}
return useAtomics;
}
bool ImplicitScalingHelper::programNativeCleanup(bool defaultNativeCleanup) {
int overrideProgramNativeCleanup = DebugManager.flags.ProgramNativeCleanup.get();
if (overrideProgramNativeCleanup != -1) {
defaultNativeCleanup = !!(overrideProgramNativeCleanup);
bool ImplicitScalingHelper::isSelfCleanupRequired(bool defaultSelfCleanup) {
int overrideProgramSelfCleanup = DebugManager.flags.ProgramWalkerPartitionSelfCleanup.get();
if (overrideProgramSelfCleanup != -1) {
defaultSelfCleanup = !!(overrideProgramSelfCleanup);
}
return defaultNativeCleanup;
return defaultSelfCleanup;
}
bool ImplicitScalingHelper::initWparidRegister() {
bool ImplicitScalingHelper::isWparidRegisterInitializationRequired() {
bool initWparidRegister = true;
int overrideInitWparidRegister = DebugManager.flags.WparidRegisterProgramming.get();
if (overrideInitWparidRegister != -1) {
@@ -76,13 +79,13 @@ bool ImplicitScalingHelper::initWparidRegister() {
return initWparidRegister;
}
bool ImplicitScalingHelper::usePipeControl() {
bool usePipeControl = true;
bool ImplicitScalingHelper::isPipeControlStallRequired() {
bool emitPipeControl = true;
int overrideUsePipeControl = DebugManager.flags.UsePipeControlAfterPartitionedWalker.get();
if (overrideUsePipeControl != -1) {
usePipeControl = !!(overrideUsePipeControl);
emitPipeControl = !!(overrideUsePipeControl);
}
return usePipeControl;
return emitPipeControl;
}
} // namespace NEO
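
All of the helpers above share the same tri-state convention: a debug flag value of -1 keeps the built-in or caller-provided default, while 0 or 1 forces the result. Below is a minimal, self-contained sketch of that pattern; the plain int parameter stands in for the DebugManager flag accessor and is a simplification for illustration only.

    #include <cstdio>

    // -1 keeps the default; any other value is coerced to bool, mirroring the
    // !!override conversions in the helpers above.
    static bool applyTriStateOverride(bool defaultValue, int overrideValue) {
        if (overrideValue != -1) {
            return overrideValue != 0;
        }
        return defaultValue;
    }

    int main() {
        std::printf("%d\n", applyTriStateOverride(true, -1)); // 1 -> default kept
        std::printf("%d\n", applyTriStateOverride(true, 0));  // 0 -> forced off
        std::printf("%d\n", applyTriStateOverride(false, 1)); // 1 -> forced on
        return 0;
    }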

@@ -27,17 +27,17 @@ struct ImplicitScalingHelper {
static bool isSemaphoreProgrammingRequired();
static bool isCrossTileAtomicRequired();
static bool isSynchronizeBeforeExecutionRequired();
static bool useAtomicsForNativeCleanup();
static bool programNativeCleanup(bool defaultNativeCleanup);
static bool initWparidRegister();
static bool usePipeControl();
static bool isAtomicsUsedForSelfCleanup();
static bool isSelfCleanupRequired(bool defaultSelfCleanup);
static bool isWparidRegisterInitializationRequired();
static bool isPipeControlStallRequired();
};
template <typename GfxFamily>
struct ImplicitScalingDispatch {
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
static size_t getSize(bool nativeCrossTileAtomicSync,
static size_t getSize(bool emitSelfCleanup,
bool preferStaticPartitioning,
const DeviceBitfield &devices,
const Vec3<size_t> &groupStart,
@@ -47,7 +47,7 @@ struct ImplicitScalingDispatch {
const DeviceBitfield &devices,
uint32_t &partitionCount,
bool useSecondaryBatchBuffer,
bool nativeCrossTileAtomicSync,
bool emitSelfCleanup,
bool usesImages,
uint64_t workPartitionAllocationGpuVa);
};

@@ -12,7 +12,7 @@
namespace NEO {
template <typename GfxFamily>
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSync,
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool emitSelfCleanup,
bool preferStaticPartitioning,
const DeviceBitfield &devices,
const Vec3<size_t> &groupStart,
@@ -34,12 +34,12 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(emitSelfCleanup);
args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired();
args.emitBatchBufferEnd = false;
args.staticPartitioning = staticPartitioning;
@@ -52,7 +52,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
const DeviceBitfield &devices,
uint32_t &partitionCount,
bool useSecondaryBatchBuffer,
bool nativeCrossTileAtomicSync,
bool emitSelfCleanup,
bool usesImages,
uint64_t workPartitionAllocationGpuVa) {
uint32_t totalProgrammedSize = 0u;
@@ -67,12 +67,12 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(emitSelfCleanup);
args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired();
args.emitBatchBufferEnd = false;
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
args.staticPartitioning = staticPartitioning;

@@ -28,10 +28,10 @@ struct WalkerPartitionArgs {
bool crossTileAtomicSynchronization = false;
bool semaphoreProgrammingRequired = false;
bool staticPartitioning = false;
bool nativeCrossTileAtomicSync = false;
bool useAtomicsForNativeCleanup = false;
bool emitSelfCleanup = false;
bool useAtomicsForSelfCleanup = false;
bool initializeWparidRegister = false;
bool usePipeControlStall = false;
bool emitPipeControlStall = false;
};
template <typename GfxFamily>
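
For orientation, a short sketch of how the renamed fields are typically populated for a 4-tile static partitioning case, mirroring the test setups earlier in this diff; the particular values are illustrative only.

    WalkerPartition::WalkerPartitionArgs args = {};
    args.staticPartitioning = true;
    args.tileCount = 4u;
    args.partitionCount = args.tileCount;
    args.initializeWparidRegister = true;
    args.crossTileAtomicSynchronization = true;
    args.emitPipeControlStall = true;      // was usePipeControlStall
    args.emitSelfCleanup = true;           // was nativeCrossTileAtomicSync
    args.useAtomicsForSelfCleanup = false; // was useAtomicsForNativeCleanup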
@@ -350,8 +350,8 @@ void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProg
}
template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
uint64_t computeSelfCleanupSectionSize(bool useAtomicsForSelfCleanup) {
if (useAtomicsForSelfCleanup) {
return sizeof(MI_ATOMIC<GfxFamily>);
} else {
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
@@ -359,11 +359,11 @@ uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCl
}
template <typename GfxFamily>
void programNativeCrossTileSyncControl(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t address,
bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
void programSelfCleanupSection(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t address,
bool useAtomicsForSelfCleanup) {
if (useAtomicsForSelfCleanup) {
programMiAtomic<GfxFamily>(inputAddress,
totalBytesProgrammed,
address,
@@ -393,28 +393,28 @@ void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
}
template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
uint64_t computeSelfCleanupEndSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForSelfCleanup) {
return fieldsForCleanupCount * computeSelfCleanupSectionSize<GfxFamily>(useAtomicsForSelfCleanup) +
2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}
template <typename GfxFamily>
void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t finalSyncTileCountAddress,
uint64_t baseAddressForCleanup,
size_t fieldsForCleanupCount,
uint32_t tileCount,
bool useAtomicsForNativeCleanup) {
void programSelfCleanupEndSection(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t finalSyncTileCountAddress,
uint64_t baseAddressForCleanup,
size_t fieldsForCleanupCount,
uint32_t tileCount,
bool useAtomicsForSelfCleanup) {
// Synchronize tiles, so the fields are not cleared while still in use
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount);
for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
programNativeCrossTileSyncControl<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
useAtomicsForNativeCleanup);
programSelfCleanupSection<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
useAtomicsForSelfCleanup);
}
// This synchronization point ensures that all tiles have finished zeroing and will fairly access the control section atomic variables
@@ -450,12 +450,12 @@ uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
size += (args.semaphoreProgrammingRequired ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount : 0u);
size += computeWalkerSectionSize<GfxFamily>();
size += args.usePipeControlStall ? sizeof(PIPE_CONTROL<GfxFamily>) : 0u;
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
size += args.emitPipeControlStall ? sizeof(PIPE_CONTROL<GfxFamily>) : 0u;
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}
if (args.nativeCrossTileAtomicSync) {
size += computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup);
if (args.emitSelfCleanup) {
size += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
}
return size;
}
@@ -566,12 +566,12 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
// Disable predication so that subsequent commands are not noop-ed.
programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);
if (args.nativeCrossTileAtomicSync) {
if (args.emitSelfCleanup) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
}
if (args.usePipeControlStall) {
if (args.emitPipeControlStall) {
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
}
@@ -582,7 +582,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
}
}
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}
@@ -608,15 +608,15 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed += sizeof(BatchBufferControlData);
currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));
if (args.nativeCrossTileAtomicSync) {
if (args.emitSelfCleanup) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
dynamicPartitioningFieldsForCleanupCount,
args.tileCount,
args.useAtomicsForNativeCleanup);
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
dynamicPartitioningFieldsForCleanupCount,
args.tileCount,
args.useAtomicsForSelfCleanup);
}
if (args.emitBatchBufferEnd) {
@@ -637,26 +637,26 @@ uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args
const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
: 0u;
const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync)
const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.emitSelfCleanup)
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
: 0u;
const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount
: 0u;
const auto nativeCrossTileSyncSize = args.nativeCrossTileAtomicSync
? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup)
: 0u;
const auto selfCleanupSectionSize = args.emitSelfCleanup
? computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup)
: 0u;
const auto wparidRegisterSize = args.initializeWparidRegister
? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
: 0u;
const auto pipeControlSize = args.usePipeControlStall
const auto pipeControlSize = args.emitPipeControlStall
? sizeof(PIPE_CONTROL<GfxFamily>)
: 0u;
return beforeExecutionSyncAtomicSize +
wparidRegisterSize +
pipeControlSize +
sizeof(COMPUTE_WALKER<GfxFamily>) +
nativeCrossTileSyncSize +
selfCleanupSectionSize +
afterExecutionSyncAtomicSize +
afterExecutionSyncPostSyncSize +
sizeof(BATCH_BUFFER_START<GfxFamily>);
@@ -688,12 +688,12 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
// Prepare for cleanup section
if (args.nativeCrossTileAtomicSync) {
if (args.emitSelfCleanup) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
}
if (args.usePipeControlStall) {
if (args.emitPipeControlStall) {
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
}
@@ -702,7 +702,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}
@@ -719,15 +719,15 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
// Cleanup section
if (args.nativeCrossTileAtomicSync) {
if (args.emitSelfCleanup) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
args.tileCount,
args.useAtomicsForNativeCleanup);
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
args.tileCount,
args.useAtomicsForSelfCleanup);
}
}
@@ -738,12 +738,12 @@ uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
if (args.staticPartitioning) {
size += computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
size += sizeof(StaticPartitioningControlSection);
size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args.useAtomicsForSelfCleanup) : 0u;
} else {
size += computeControlSectionOffset<GfxFamily>(args);
size += sizeof(BatchBufferControlData);
size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args.useAtomicsForSelfCleanup) : 0u;
}
return size;
}

@@ -239,8 +239,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disa
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize, 1: synchronize all tiles prior to doing work distribution")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWithSemaphores, -1, "-1: default (disabled), 1: emit semaphore waits after Walker completion in WPARID mode, 0: do not emit semaphores after Walker")
DECLARE_DEBUG_VARIABLE(int32_t, UseCrossAtomicSynchronization, -1, "-1: default (enabled), 1: cross tile atomic synchronization present, 0: cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, UseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramNativeCleanup, -1, "-1: default (API dependent), 0: Do not program native cleanup, 1: program native cleanup")
DECLARE_DEBUG_VARIABLE(int32_t, UseAtomicsForSelfCleanupSection, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramWalkerPartitionSelfCleanup, -1, "-1: default (API dependent), 0: Do not program self cleanup, 1: program self cleanup")
DECLARE_DEBUG_VARIABLE(int32_t, WparidRegisterProgramming, -1, "-1: default (enabled), 0: do not program wparid register, 1: program wparid register")
DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlAfterPartitionedWalker, -1, "-1: default (enabled), 0: do not add PipeControl, 1: add PipeControl")
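
A brief sketch of how the two renamed flags override the helper defaults; it mirrors the unit tests further below, which use the same DebugManager.flags.<name>.set() accessors.

    // Force the self cleanup section on even when the API default is off, and
    // request the atomic variant (MI_ATOMIC) instead of MI_STORE_DATA_IMM for
    // the cleanup writes.
    DebugManager.flags.ProgramWalkerPartitionSelfCleanup.set(1);
    DebugManager.flags.UseAtomicsForSelfCleanupSection.set(1);

    EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(false));
    EXPECT_TRUE(ImplicitScalingHelper::isAtomicsUsedForSelfCleanup());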

@@ -975,7 +975,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImplicitScalingWhenEncodingDispatchKernelThenExpectNativeCrossTileCleanupSection) {
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImplicitScalingWhenEncodingDispatchKernelThenExpectSelfCleanupSection) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
@@ -1028,9 +1028,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
WalkerPartition::WalkerPartitionArgs args = {};
args.initializeWparidRegister = true;
args.usePipeControlStall = true;
args.emitPipeControlStall = true;
args.partitionCount = partitionCount;
args.nativeCrossTileAtomicSync = true;
args.emitSelfCleanup = true;
auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(args);
uint64_t expectedCleanupGpuVa = cmdContainer->getCommandStream()->getGraphicsAllocation()->getGpuAddress() +

@@ -46,64 +46,64 @@ TEST_F(ImplicitScalingTests, givenMultiTileApiEnabledWhenOsSupportOffAndForcedOn
EXPECT_FALSE(ImplicitScalingHelper::isImplicitScalingEnabled(twoTile, true));
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForSelfCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::isAtomicsUsedForSelfCleanup());
}
TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) {
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForSelfCleanupThenExpectFalse) {
DebugManager.flags.UseAtomicsForSelfCleanupSection.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isAtomicsUsedForSelfCleanup());
}
TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectTrue) {
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForSelfCleanupThenExpectTrue) {
DebugManager.flags.UseAtomicsForSelfCleanupSection.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isAtomicsUsedForSelfCleanup());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsIsFalseWhenCheckingProgramNativeCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(false));
TEST_F(ImplicitScalingTests, givenDefaultSettingsIsFalseWhenCheckingProgramSelfCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(false));
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramNativeCleanupThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(true));
TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramSelfCleanupThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(true));
}
TEST_F(ImplicitScalingTests, givenForceNotProgramNativeCleanupWhenDefaultNativeCleanupIsTrueThenExpectFalse) {
DebugManager.flags.ProgramNativeCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(true));
TEST_F(ImplicitScalingTests, givenForceNotProgramSelfCleanupWhenDefaultSelfCleanupIsTrueThenExpectFalse) {
DebugManager.flags.ProgramWalkerPartitionSelfCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isSelfCleanupRequired(true));
}
TEST_F(ImplicitScalingTests, givenForceProgramNativeCleanupWhenDefaultNativeCleanupIsFalseThenExpectTrue) {
DebugManager.flags.ProgramNativeCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(false));
TEST_F(ImplicitScalingTests, givenForceProgramSelfCleanupWhenDefaultSelfCleanupIsFalseThenExpectTrue) {
DebugManager.flags.ProgramWalkerPartitionSelfCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isSelfCleanupRequired(false));
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToProgramWparidRegisterThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
EXPECT_TRUE(ImplicitScalingHelper::isWparidRegisterInitializationRequired());
}
TEST_F(ImplicitScalingTests, givenForceNotProgramWparidRegisterWhenCheckingRegisterProgramThenExpectFalse) {
DebugManager.flags.WparidRegisterProgramming.set(0);
EXPECT_FALSE(ImplicitScalingHelper::initWparidRegister());
EXPECT_FALSE(ImplicitScalingHelper::isWparidRegisterInitializationRequired());
}
TEST_F(ImplicitScalingTests, givenForceProgramWparidRegisterWhenCheckingRegisterProgramThenExpectTrue) {
DebugManager.flags.WparidRegisterProgramming.set(1);
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
EXPECT_TRUE(ImplicitScalingHelper::isWparidRegisterInitializationRequired());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToUsePipeControlThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
EXPECT_TRUE(ImplicitScalingHelper::isPipeControlStallRequired());
}
TEST_F(ImplicitScalingTests, givenForceNotUsePipeControlWhenCheckingPipeControlUseThenExpectFalse) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
EXPECT_FALSE(ImplicitScalingHelper::usePipeControl());
EXPECT_FALSE(ImplicitScalingHelper::isPipeControlStallRequired());
}
TEST_F(ImplicitScalingTests, givenForceUsePipeControlWhenCheckingPipeControlUseThenExpectTrue) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(1);
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
EXPECT_TRUE(ImplicitScalingHelper::isPipeControlStallRequired());
}
TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingSemaphoreUseThenExpectFalse) {