Mirror of https://github.com/intel/compute-runtime.git
Refactor and modularize walker partition code
Related-To: NEO-6244
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Committed by: Compute-Runtime-Automation
Parent: e82c2e4653
Commit: b65d8909e4
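The change replaces the long positional parameter lists of the WalkerPartition helpers with a single WalkerPartitionArgs structure. A minimal sketch of the resulting call pattern, assembled from the hunks below; the struct fields and helper names are taken from the diff, while the surrounding locals (partitionCount, tileCount, staticPartitioning, nativeCrossTileAtomicSync) are assumed to be in scope as in ImplicitScalingDispatch<GfxFamily>::getSize:

// Illustrative only: populate the new argument struct once...
WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.staticPartitioning = staticPartitioning;
// ...then the same struct drives both the size estimate and the command-buffer construction.
auto size = WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args);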
@@ -34,12 +34,55 @@ bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() {
return synchronizeBeforeExecution;
}

bool ImplicitScalingHelper::isSemaphoreProgrammingRequired() {
auto semaphoreProgrammingRequired = ImplicitScaling::semaphoreProgrammingRequired;
if (NEO::DebugManager.flags.SynchronizeWithSemaphores.get() == 1) {
semaphoreProgrammingRequired = true;
}
return semaphoreProgrammingRequired;
}

bool ImplicitScalingHelper::isCrossTileAtomicRequired() {
auto crossTileAtomicSynchronization = ImplicitScaling::crossTileAtomicSynchronization;
if (NEO::DebugManager.flags.UseCrossAtomicSynchronization.get() == 0) {
crossTileAtomicSynchronization = false;
}
return crossTileAtomicSynchronization;
}

bool ImplicitScalingHelper::useAtomicsForNativeCleanup() {
bool useAtomics = false;
int overrideUseAtomics = DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.get();
int overrideUseAtomics = DebugManager.flags.UseAtomicsForNativeSectionCleanup.get();
if (overrideUseAtomics != -1) {
useAtomics = !!(overrideUseAtomics);
}
return useAtomics;
}

bool ImplicitScalingHelper::programNativeCleanup(bool defaultNativeCleanup) {
int overrideProgramNativeCleanup = DebugManager.flags.ProgramNativeCleanup.get();
if (overrideProgramNativeCleanup != -1) {
defaultNativeCleanup = !!(overrideProgramNativeCleanup);
}
return defaultNativeCleanup;
}

bool ImplicitScalingHelper::initWparidRegister() {
bool initWparidRegister = true;
int overrideInitWparidRegister = DebugManager.flags.WparidRegisterProgramming.get();
if (overrideInitWparidRegister != -1) {
initWparidRegister = !!(overrideInitWparidRegister);
}
return initWparidRegister;
}

bool ImplicitScalingHelper::usePipeControl() {
bool usePipeControl = true;
int overrideUsePipeControl = DebugManager.flags.UsePipeControlAfterPartitionedWalker.get();
if (overrideUsePipeControl != -1) {
usePipeControl = !!(overrideUsePipeControl);
}
return usePipeControl;
}

} // namespace NEO
@@ -15,12 +15,22 @@ class LinearStream;

namespace ImplicitScaling {
extern bool apiSupport;
}
extern bool semaphoreProgrammingRequired;
extern bool crossTileAtomicSynchronization;

constexpr uint32_t partitionAddressOffsetDwords = 2u;
constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords;
} // namespace ImplicitScaling

struct ImplicitScalingHelper {
static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition);
static bool isSemaphoreProgrammingRequired();
static bool isCrossTileAtomicRequired();
static bool isSynchronizeBeforeExecutionRequired();
static bool useAtomicsForNativeCleanup();
static bool programNativeCleanup(bool defaultNativeCleanup);
static bool initWparidRegister();
static bool usePipeControl();
};

template <typename GfxFamily>
@@ -20,6 +20,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
bool staticPartitioning = false;
const uint32_t tileCount = static_cast<uint32_t>(devices.count());

const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
preferStaticPartitioning,
groupStart,
@@ -28,15 +29,21 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
&partitionType,
&staticPartitioning);
UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
WalkerPartition::WalkerPartitionArgs args = {};

auto synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(false,
16u,
synchronizeBeforeExecution,
nativeCrossTileAtomicSync,
staticPartitioning,
useAtomicsForNativeCleanup));
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.emitBatchBufferEnd = false;
args.staticPartitioning = staticPartitioning;

return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
}

template <typename GfxFamily>
@@ -54,36 +61,43 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS

bool staticPartitioning = false;
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
const bool synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();

WalkerPartition::WalkerPartitionArgs args = {};
args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
args.partitionCount = partitionCount;
args.tileCount = tileCount;
args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.emitBatchBufferEnd = false;
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
args.staticPartitioning = staticPartitioning;

if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
&walkerCmd,
totalProgrammedSize,
partitionCount,
tileCount,
synchronizeBeforeExecution,
useSecondaryBatchBuffer,
nativeCrossTileAtomicSync,
workPartitionAllocationGpuVa,
useAtomicsForNativeCleanup);
args);
} else {
if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
if (partitionCount == 1u) {
walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
}
args.partitionCount = partitionCount;
}

WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
&walkerCmd, totalProgrammedSize,
partitionCount, tileCount,
false, synchronizeBeforeExecution, useSecondaryBatchBuffer,
nativeCrossTileAtomicSync,
useAtomicsForNativeCleanup);
&walkerCmd,
totalProgrammedSize,
args);
}
commandStream.getSpace(totalProgrammedSize);
}
@@ -18,6 +18,22 @@

namespace WalkerPartition {

struct WalkerPartitionArgs {
uint64_t workPartitionAllocationGpuVa = 0;
uint32_t partitionCount = 0;
uint32_t tileCount = 0;
bool emitBatchBufferEnd = false;
bool secondaryBatchBuffer = false;
bool synchronizeBeforeExecution = false;
bool crossTileAtomicSynchronization = false;
bool semaphoreProgrammingRequired = false;
bool staticPartitioning = false;
bool nativeCrossTileAtomicSync = false;
bool useAtomicsForNativeCleanup = false;
bool initializeWparidRegister = false;
bool usePipeControlStall = false;
};

template <typename GfxFamily>
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
template <typename GfxFamily>
@@ -73,22 +89,6 @@ Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
return commandToReturn;
}

bool inline isSemaphoreProgrammingRequired() {
auto semaphoreProgrammingRequired = false;
if (NEO::DebugManager.flags.ExperimentalSynchronizeWithSemaphores.get() == 1) {
semaphoreProgrammingRequired = true;
}
return semaphoreProgrammingRequired;
}

bool inline isCrossTileAtomicRequired() {
auto crossTileAtomicSynchronization = true;
if (NEO::DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.get() == 0) {
crossTileAtomicSynchronization = false;
}
return crossTileAtomicSynchronization;
}

template <typename GfxFamily>
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
bool preferStaticPartitioning,
@@ -349,25 +349,55 @@ void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProg
*storeDataImmediate = cmd;
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
return sizeof(MI_ATOMIC<GfxFamily>);
} else {
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
}
}

template <typename GfxFamily>
void programNativeCrossTileSyncControl(void *&inputAddress,
uint32_t &totalBytesProgrammed,
uint64_t finalSyncTileCountField,
uint64_t address,
bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
programMiAtomic<GfxFamily>(inputAddress,
totalBytesProgrammed,
finalSyncTileCountField,
address,
false,
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
} else {
programStoreMemImmediateDword<GfxFamily>(inputAddress,
totalBytesProgrammed,
finalSyncTileCountField,
address,
0u);
}
}

template <typename GfxFamily>
uint64_t computeTilesSynchronizationWithAtomicsSectionSize() {
return sizeof(MI_ATOMIC<GfxFamily>) +
sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
}

template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
uint32_t &totalBytesProgrammed,
uint64_t atomicAddress,
uint32_t tileCount) {
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}

template <typename GfxFamily>
void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t &totalBytesProgrammed,
@@ -377,28 +407,18 @@ void programNativeCrossTileSyncCleanup(void *&inputAddress,
uint32_t tileCount,
bool useAtomicsForNativeCleanup) {
// Synchronize tiles, so the fields are not cleared while still in use
programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount);

for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
if (useAtomicsForNativeCleanup) {
programMiAtomic<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
false,
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
} else {
programStoreMemImmediateDword<GfxFamily>(inputAddress,
programNativeCrossTileSyncControl<GfxFamily>(inputAddress,
totalBytesProgrammed,
addressForCleanup,
0u);
}
useAtomicsForNativeCleanup);
}

//this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount);
}

template <typename GfxFamily>
@@ -412,15 +432,6 @@ void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
}
}

template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
uint32_t &totalBytesProgrammed,
uint64_t atomicAddress,
uint32_t tileCount) {
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

template <typename GfxFamily>
uint64_t computeWalkerSectionSize() {
return sizeof(BATCH_BUFFER_START<GfxFamily>) +
@@ -428,46 +439,30 @@ uint64_t computeWalkerSectionSize() {
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
if (useAtomicsForNativeCleanup) {
return sizeof(MI_ATOMIC<GfxFamily>);
} else {
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
uint64_t size = 0u;

size += args.synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() : 0;
size += sizeof(LOAD_REGISTER_IMM<GfxFamily>); //predication mask
size += sizeof(MI_ATOMIC<GfxFamily>); //current id for partition
size += sizeof(LOAD_REGISTER_REG<GfxFamily>); //id into register
size += sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
size += (args.semaphoreProgrammingRequired ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount : 0u);
size += computeWalkerSectionSize<GfxFamily>();
size += args.usePipeControlStall ? sizeof(PIPE_CONTROL<GfxFamily>) : 0u;
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
}
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
2 * sizeof(MI_ATOMIC<GfxFamily>) +
2 * sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
}

template <typename GfxFamily>
uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
auto synchronizationCount = (synchronizeBeforeExecution) ? 2u : 1u;
if (!isCrossTileAtomicRequired() && !nativeCrossTileAtomicSync) {
synchronizationCount--;
if (args.nativeCrossTileAtomicSync) {
size += computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup);
}

return sizeof(LOAD_REGISTER_IMM<GfxFamily>) +
sizeof(MI_ATOMIC<GfxFamily>) * (1u + synchronizationCount) +
sizeof(LOAD_REGISTER_REG<GfxFamily>) +
sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
sizeof(BATCH_BUFFER_START<GfxFamily>) * 2 +
sizeof(PIPE_CONTROL<GfxFamily>) +
sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * synchronizationCount +
(isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u) +
computeWalkerSectionSize<GfxFamily>() +
(nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u);
return size;
}

template <typename GfxFamily>
uint64_t computeWalkerSectionStart(uint32_t partitionCount,
bool synchronizeBeforeExecution,
bool nativeCrossTileAtomicSync,
bool useAtomicsForNativeCleanup) {
return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup) -
uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
return computeControlSectionOffset<GfxFamily>(args) -
computeWalkerSectionSize<GfxFamily>();
}
@@ -537,26 +532,17 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
uint32_t &totalBytesProgrammed,
uint32_t partitionCount,
uint32_t tileCount,
bool emitBatchBufferEnd,
bool synchronizeBeforeExecution,
bool secondaryBatchBuffer,
bool nativeCrossTileAtomicSync,
bool useAtomicsForNativeCleanup) {
WalkerPartitionArgs &args) {
totalBytesProgrammed = 0u;
void *currentBatchBufferPointer = cpuPointer;

auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
if (synchronizeBeforeExecution) {
auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(args);
if (args.synchronizeBeforeExecution) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

//if all tiles hit the atomic, it means we may go further
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}

programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, partitionCount);
programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);

programMiAtomic<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
@@ -573,36 +559,32 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
gpuAddressOfAllocation +
computeWalkerSectionStart<GfxFamily>(partitionCount,
synchronizeBeforeExecution,
nativeCrossTileAtomicSync,
useAtomicsForNativeCleanup),
computeWalkerSectionStart<GfxFamily>(args),
true,
secondaryBatchBuffer);
args.secondaryBatchBuffer);

//disable predication to not noop subsequent commands.
programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);

if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
}

programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
if (args.usePipeControlStall) {
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
}

if (isSemaphoreProgrammingRequired()) {
if (args.semaphoreProgrammingRequired) {
auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
}
}

if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

//if all tiles hit the atomic, it means we may go further
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
}

//this bb start goes to the end of partitioned command buffer
@@ -611,12 +593,12 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed,
gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
false,
secondaryBatchBuffer);
args.secondaryBatchBuffer);

//Walker section
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);

programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, secondaryBatchBuffer);
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);

auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
controlSection->partitionCount = 0u;
@@ -626,18 +608,18 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
totalBytesProgrammed += sizeof(BatchBufferControlData);
currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));

if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
dynamicPartitioningFieldsForCleanupCount,
tileCount,
useAtomicsForNativeCleanup);
args.tileCount,
args.useAtomicsForNativeCleanup);
}

if (emitBatchBufferEnd) {
if (args.emitBatchBufferEnd) {
auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
*batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
}
@@ -651,14 +633,28 @@ struct StaticPartitioningControlSection {
static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;

template <typename GfxFamily>
uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
const auto beforeExecutionSyncAtomicSize = synchronizeBeforeExecution ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
const auto afterExecutionSyncAtomicSize = (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
const auto afterExecutionSyncPostSyncSize = isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u;
const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u;
uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) {
const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
: 0u;
const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync)
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
: 0u;
const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount
: 0u;
const auto nativeCrossTileSyncSize = args.nativeCrossTileAtomicSync
? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup)
: 0u;
const auto wparidRegisterSize = args.initializeWparidRegister
? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
: 0u;
const auto pipeControlSize = args.usePipeControlStall
? sizeof(PIPE_CONTROL<GfxFamily>)
: 0u;
return beforeExecutionSyncAtomicSize +
sizeof(LOAD_REGISTER_MEM<GfxFamily>) +
sizeof(PIPE_CONTROL<GfxFamily>) +
wparidRegisterSize +
pipeControlSize +
sizeof(COMPUTE_WALKER<GfxFamily>) +
nativeCrossTileSyncSize +
afterExecutionSyncAtomicSize +
@@ -671,49 +667,48 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
uint32_t &totalBytesProgrammed,
uint32_t partitionCount,
uint32_t tileCount,
bool synchronizeBeforeExecution,
bool secondaryBatchBuffer,
bool nativeCrossTileAtomicSync,
uint64_t workPartitionAllocationGpuVa,
bool useAtomicsForNativeCleanup) {
WalkerPartitionArgs &args) {
totalBytesProgrammed = 0u;
void *currentBatchBufferPointer = cpuPointer;

// Get address of the control section
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);

// Synchronize tiles before walker
if (synchronizeBeforeExecution) {
if (args.synchronizeBeforeExecution) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}

// Load partition ID to wparid register and execute walker
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, workPartitionAllocationGpuVa, wparidCCSOffset);
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
if (args.initializeWparidRegister) {
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
}
programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);

// Prepare for cleanup section
if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
}

programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
if (args.usePipeControlStall) {
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
}

// Synchronize tiles after walker
if (isSemaphoreProgrammingRequired()) {
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
if (args.semaphoreProgrammingRequired) {
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {

if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}

// Jump over the control section
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, secondaryBatchBuffer);
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);

// Control section
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
@@ -724,35 +719,31 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);

// Cleanup section
if (nativeCrossTileAtomicSync) {
if (args.nativeCrossTileAtomicSync) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
tileCount,
useAtomicsForNativeCleanup);
args.tileCount,
args.useAtomicsForNativeCleanup);
}
}

template <typename GfxFamily>
uint64_t estimateSpaceRequiredInCommandBuffer(bool requiresBatchBufferEnd,
uint32_t partitionCount,
bool synchronizeBeforeExecution,
bool nativeCrossTileAtomicSync,
bool staticPartitioning,
bool useAtomicsForNativeCleanup) {
uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {

uint64_t size = {};
if (staticPartitioning) {
size += computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
if (args.staticPartitioning) {
size += computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
size += sizeof(StaticPartitioningControlSection);
size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
} else {
size += computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
size += computeControlSectionOffset<GfxFamily>(args);
size += sizeof(BatchBufferControlData);
size += requiresBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
}
return size;
}
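For orientation, a before/after sketch of one call site (the statically partitioned path in ImplicitScalingDispatch<GfxFamily>::dispatchCommands); the "before" argument order follows the removed signature above and the "after" call matches the new one, while the local variable names are only illustrative:

// Before: every option travelled as its own positional parameter.
// WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(cpuPointer, gpuAddress, &walkerCmd, totalProgrammedSize,
//     partitionCount, tileCount, synchronizeBeforeExecution, useSecondaryBatchBuffer,
//     nativeCrossTileAtomicSync, workPartitionAllocationGpuVa, useAtomicsForNativeCleanup);

// After: the options travel in WalkerPartitionArgs, so adding a toggle no longer touches every signature.
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(cpuPointer, gpuAddress, &walkerCmd, totalProgrammedSize, args);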
@@ -123,15 +123,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFENumberOfWalkers, -1, "Set Number of Walkers i
DECLARE_DEBUG_VARIABLE(int32_t, CFEMaximumNumberOfThreads, -1, "Set Maximum Number of Threads in CFE_STATE on XEHP, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch Control in CFE_STATE on XEHP, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distribution")
DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
DECLARE_DEBUG_VARIABLE(int32_t, ForceWorkgroupSize1x1x1, -1, "-1: default, 0: disable, 1: enable, force workgroup size 1x1x1 in builtins")
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSynchronizeWithSemaphores, -1, "Experimental implementation: 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalForceCrossAtomicSynchronization, -1, "Experimental implementation: 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
DECLARE_DEBUG_VARIABLE(int32_t, DisablePipeControlPrecedingPostSyncCommand, -1, "-1 default - disabled adding PIPE_CONTROL, 0 - disabled adding PIPE_CONTROL, 1 - enabled adding PIPE_CONTROL")
DECLARE_DEBUG_VARIABLE(int32_t, UseCachingPolicyForIndirectObjectHeap, -1, "Use selected caching policy for IOH, -1 - default, 0 - Uncached, 1 - L3 Caching, 2 - L1 Caching")
@@ -142,13 +138,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuPartialWrites, -1, "-1: default - 0
DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomicsInComputeMode, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics bit in STATE_COMPUTE_MODE, 1: program value 1 in MultiGpuAtomics bit in STATE_COMPUTE_MODE")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomics, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics controls 1: program value 1 in MultiGpuAtomics controls")
DECLARE_DEBUG_VARIABLE(int32_t, ForceBufferCompressionFormat, -1, "-1: default, >0: Format value")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHwGenerationLocalIds, -1, "-1: default, 0: disable, 1: enable : Enables generation of local ids on HW")
DECLARE_DEBUG_VARIABLE(int32_t, WalkerPartitionPreferHighestDimension, -1, "-1: default, 0: prefer biggest dimension, 1: prefer Z over Y over X if they divide partition count evenly")
DECLARE_DEBUG_VARIABLE(int32_t, SetMinimalPartitionSize, -1, "-1 default value set to 512 workgroups, 0 - disabled, >0 - minimal partition size in workgroups (should be power of 2)")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterTargetMemory, -1, "-1:default 0: overwrites to System 1: overwrites to Local")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterMocs, -1, "-1: default, >=0 SetGivenMocsInBlitterTransfers")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, OverridePostSyncMocs, -1, "-1: default, >=0 Override post sync mocs with value")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateVmBindExt, -1, "Use immediate bind extension to a new residency model on Linux (requires kernel support), -1: default (enabled with direct submission), 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, ForceExecutionTile, -1, "-1: default, 0+: given tile is chosen as submission, must be used with EnableWalkerPartition = 0.")
@@ -237,11 +231,21 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Ove
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands")
DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmissionController, -1, "Enable direct submission terminating after given timeout, -1: default, 0: disabled, 1: enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerTimeout, -1, "Set direct submission controller timeout, -1: default 5 ms, >=0: timeout in ms")

/* IMPLICIT SCALING */
DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distribution")
DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWithSemaphores, -1, "-1: default (disabled), 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
DECLARE_DEBUG_VARIABLE(int32_t, UseCrossAtomicSynchronization, -1, "-1: default (enabled), 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
DECLARE_DEBUG_VARIABLE(int32_t, UseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramNativeCleanup, -1, "-1: default (API dependent), 0: Do not program native cleanup, 1: program native cleanup")
DECLARE_DEBUG_VARIABLE(int32_t, WparidRegisterProgramming, -1, "-1: default (enabled), 0: do not program wparid register, 1: programming wparid register")
DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlAfterPartitionedWalker, -1, "-1: default (enabled), 0: do not add PipeControl, 1: add PipeControl")

/*FEATURE FLAGS*/
DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
DECLARE_DEBUG_VARIABLE(bool, EnableNV12, true, "Enables NV12 extension")
DECLARE_DEBUG_VARIABLE(bool, EnablePackedYuv, true, "Enables cl_packed_yuv extension")
DECLARE_DEBUG_VARIABLE(bool, EnableDeferredDeleter, true, "Enables async deleter")
@@ -305,8 +309,9 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicPipelineSelect, -1, "set SYSTOLI
DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOLIC MODE ENABLE in COMPUTE_WALKER cmd, -1:default, 0:disable, 1:enable")

/*EXPERIMENTAL TOGGLES*/
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte")
DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalUseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")

/*DRIVER TOGGLES*/
DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")
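All of the relocated implicit-scaling flags above follow the same three-state convention that the new ImplicitScalingHelper methods implement in the first hunk of this commit: -1 keeps the built-in default, 0 forces the behavior off, 1 forces it on. A minimal restatement of that override pattern (illustrative helper, not part of the commit):

// -1 keeps the compiled-in default; any other value overrides it (0 = off, non-zero = on).
bool resolveToggle(int32_t debugValue, bool defaultValue) {
    if (debugValue != -1) {
        return debugValue != 0;
    }
    return defaultValue;
}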
@@ -1026,7 +1026,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
uint32_t expectedPartitionSize = (dims[0] + partitionCount - 1u) / partitionCount;
EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize());

auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(partitionCount, false, true, false);
WalkerPartition::WalkerPartitionArgs args = {};
args.initializeWparidRegister = true;
args.usePipeControlStall = true;
args.partitionCount = partitionCount;
args.nativeCrossTileAtomicSync = true;

auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(args);
uint64_t expectedCleanupGpuVa = cmdContainer->getCommandStream()->getGraphicsAllocation()->getGpuAddress() +
cleanupSectionOffset;
constexpr uint32_t expectedData = 0ull;
@@ -51,11 +51,85 @@ TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForNativeCle
}

TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) {
DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(0);
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}

TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectTrue) {
DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(1);
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsIsFalseWhenCheckingProgramNativeCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(false));
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramNativeCleanupThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(true));
}

TEST_F(ImplicitScalingTests, givenForceNotProgramNativeCleanupWhenDefaultNativeCleanupIsTrueThenExpectFalse) {
DebugManager.flags.ProgramNativeCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(true));
}

TEST_F(ImplicitScalingTests, givenForceProgramNativeCleanupWhenDefaultNativeCleanupIsFalseThenExpectTrue) {
DebugManager.flags.ProgramNativeCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(false));
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToProgramWparidRegisterThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenForceNotProgramWparidRegisterWhenCheckingRegisterProgramThenExpectFalse) {
DebugManager.flags.WparidRegisterProgramming.set(0);
EXPECT_FALSE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenForceProgramWparidRegisterWhenCheckingRegisterProgramThenExpectTrue) {
DebugManager.flags.WparidRegisterProgramming.set(1);
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToUsePipeControlThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenForceNotUsePipeControlWhenCheckingPipeControlUseThenExpectFalse) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
EXPECT_FALSE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenForceUsePipeControlWhenCheckingPipeControlUseThenExpectTrue) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(1);
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingSemaphoreUseThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenForceSemaphoreNotUseWhenCheckingSemaphoreUseThenExpectFalse) {
DebugManager.flags.SynchronizeWithSemaphores.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenForceSemaphoreUseWhenCheckingSemaphoreUseThenExpectTrue) {
DebugManager.flags.SynchronizeWithSemaphores.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

TEST_F(ImplicitScalingTests, givenForceDisableWhenCheckingCrossTileAtomicSyncThenExpectFalse) {
DebugManager.flags.UseCrossAtomicSynchronization.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

TEST_F(ImplicitScalingTests, givenForceEnableWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
DebugManager.flags.UseCrossAtomicSynchronization.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}
@@ -232,3 +232,111 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
ASSERT_NE(itorLrm, loadRegisterMemList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledWparidRegisterThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;

DebugManager.flags.WparidRegisterProgramming.set(0);

uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList loadRegisterMemList = hwParser.getCommandsList<MI_LOAD_REGISTER_MEM>();
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
EXPECT_EQ(itorLrm, loadRegisterMemList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);

uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);

uint64_t workPartitionAllocationAddress = 0x0;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(32);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}