feature: add flag to block dispatch of implicit scaling commands

- this feature is part of the compute walker command view work
- the compute walker is programmed for implicit scaling but not dispatched to the command buffer
- together with the new flag comes a refactor that reduces the number of dispatch arguments (illustrated in the sketch below)

Related-To: NEO-11972

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Zbigniew Zdanowicz
2024-07-29 13:37:30 +00:00
committed by Compute-Runtime-Automation
parent 2b71ffa7ce
commit b33fe6ccf1
16 changed files with 371 additions and 221 deletions
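
For context, here is a minimal usage sketch of the new ImplicitScalingDispatchCommandArgs struct and the blockDispatchToCommandBuffer flag. The struct field order and the reduced dispatchCommands(commandStream, walkerCmd, devices, args) signature are taken from the diffs below; the wrapper function, its name, and the surrounding locals are illustrative assumptions, not part of this commit.

// Illustrative sketch only; assumes the relevant NEO headers are included.
template <typename GfxFamily, typename WalkerType>
void buildWalkerCommandView(NEO::LinearStream &commandStream,
                            WalkerType &walkerCmd,
                            const NEO::DeviceBitfield &devices,
                            const NEO::HardwareInfo &hwInfo) {
    NEO::ImplicitScalingDispatchCommandArgs args{
        0,                               // workPartitionAllocationGpuVa - no static work partition allocation
        &hwInfo,                         // hwInfo
        nullptr,                         // outWalkerPtr - only populated when commands are actually emitted
        NEO::RequiredPartitionDim::none, // requiredPartitionDim
        0,                               // partitionCount - computed inside dispatchCommands
        false,                           // useSecondaryBatchBuffer
        false,                           // apiSelfCleanup
        false,                           // dcFlush
        false,                           // forceExecutionOnSingleTile
        true};                           // blockDispatchToCommandBuffer - program walkerCmd only

    // With blockDispatchToCommandBuffer set, walkerCmd gets its partition type programmed and the
    // partition count is computed, but nothing is written to commandStream.
    NEO::ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream, walkerCmd, devices, args);
    // args.partitionCount now holds the computed value; commandStream.getUsed() is unchanged.
}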


@@ -157,24 +157,31 @@ inline void HardwareInterface<GfxFamily>::programWalker(
if (partitionWalker) {
const uint64_t workPartitionAllocationGpuVa = defaultCsr->getWorkPartitionAllocationGpuAddress();
uint32_t partitionCount = 0u;
RequiredPartitionDim requiredPartitionDim = kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none;
ImplicitScalingDispatchCommandArgs implicitScalingArgs{
workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa
&hwInfo, // hwInfo
nullptr, // outWalkerPtr
requiredPartitionDim, // requiredPartitionDim
partitionCount, // partitionCount
false, // useSecondaryBatchBuffer
false, // apiSelfCleanup
queueCsr.getDcFlushSupport(), // dcFlush
kernel.isSingleSubdevicePreferred(), // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer
ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream,
walkerCmd,
nullptr,
devices,
kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none,
partitionCount,
false,
false,
queueCsr.getDcFlushSupport(),
kernel.isSingleSubdevicePreferred(),
workPartitionAllocationGpuVa,
hwInfo);
implicitScalingArgs);
if (queueCsr.isStaticWorkPartitioningEnabled()) {
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), implicitScalingArgs.partitionCount));
}
if (timestampPacketNode) {
timestampPacketNode->setPacketsUsed(partitionCount);
timestampPacketNode->setPacketsUsed(implicitScalingArgs.partitionCount);
}
} else {
auto computeWalkerOnStream = commandStream.getSpaceForCmd<WalkerType>();


@@ -412,18 +412,23 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
if (args.partitionCount > 1 && !args.isInternal) {
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
ImplicitScalingDispatchCommandArgs implicitScalingArgs{
workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa
&hwInfo, // hwInfo
&args.outWalkerPtr, // outWalkerPtr
args.requiredPartitionDim, // requiredPartitionDim
args.partitionCount, // partitionCount
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
args.dcFlushEnable, // dcFlush
gfxCoreHelper.singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
args.makeCommandView}; // blockDispatchToCommandBuffer
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
walkerCmd,
&args.outWalkerPtr,
args.device->getDeviceBitfield(),
args.requiredPartitionDim,
args.partitionCount,
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()),
!args.isKernelDispatchedFromImmediateCmdList,
args.dcFlushEnable,
gfxCoreHelper.singleTileExecImplicitScalingRequired(args.isCooperative),
workPartitionAllocationGpuVa,
hwInfo);
implicitScalingArgs);
args.partitionCount = implicitScalingArgs.partitionCount;
} else {
args.partitionCount = 1;
if (!args.makeCommandView) {


@@ -37,6 +37,21 @@ struct ImplicitScalingHelper {
static bool pipeControlBeforeCleanupAtomicSyncRequired();
};
struct ImplicitScalingDispatchCommandArgs {
uint64_t workPartitionAllocationGpuVa = 0;
const HardwareInfo *hwInfo = nullptr;
void **outWalkerPtr = nullptr;
RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none;
uint32_t partitionCount = 0;
bool useSecondaryBatchBuffer = false;
bool apiSelfCleanup = false;
bool dcFlush = false;
bool forceExecutionOnSingleTile = false;
bool blockDispatchToCommandBuffer = false;
};
template <typename GfxFamily>
struct ImplicitScalingDispatch {
using DefaultWalkerType = typename GfxFamily::DefaultWalkerType;
@@ -51,16 +66,8 @@ struct ImplicitScalingDispatch {
template <typename WalkerType>
static void dispatchCommands(LinearStream &commandStream,
WalkerType &walkerCmd,
void **outWalkerPtr,
const DeviceBitfield &devices,
RequiredPartitionDim requiredPartitionDim,
uint32_t &partitionCount,
bool useSecondaryBatchBuffer,
bool apiSelfCleanup,
bool dcFlush,
bool forceExecutionOnSingleTile,
uint64_t workPartitionAllocationGpuVa,
const HardwareInfo &hwInfo);
ImplicitScalingDispatchCommandArgs &dispatchCommandArgs);
static bool &getPipeControlStallRequired();


@@ -18,9 +18,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup, bool pre
template <typename GfxFamily>
template <typename WalkerType>
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream, WalkerType &walkerCmd, void **outWalkerPtr, const DeviceBitfield &devices, NEO::RequiredPartitionDim requiredPartitionDim,
uint32_t &partitionCount, bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool dcFlush, bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa,
const HardwareInfo &hwInfo) {
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream, WalkerType &walkerCmd, const DeviceBitfield &devices, ImplicitScalingDispatchCommandArgs &dispatchCommandArgs) {
}
template <typename GfxFamily>
@@ -74,10 +72,7 @@ template <>
bool ImplicitScalingDispatch<Family>::pipeControlStallRequired = true;
template struct ImplicitScalingDispatch<Family>;
template void ImplicitScalingDispatch<Family>::dispatchCommands<Family::DefaultWalkerType>(LinearStream &commandStream, Family::DefaultWalkerType &walkerCmd, void **outWalkerPtr,
const DeviceBitfield &devices, RequiredPartitionDim requiredPartitionDim, uint32_t &partitionCount,
bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool dcFlush, bool forceExecutionOnSingleTile,
uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
template void ImplicitScalingDispatch<Family>::dispatchCommands<Family::DefaultWalkerType>(LinearStream &commandStream, Family::DefaultWalkerType &walkerCmd, const DeviceBitfield &devices, ImplicitScalingDispatchCommandArgs &dispatchCommandArgs);
template size_t ImplicitScalingDispatch<Family>::getSize<Family::DefaultWalkerType>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
} // namespace NEO


@@ -17,23 +17,18 @@
namespace NEO {
template <typename GfxFamily>
WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPartitionAllocationGpuVa,
WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingDispatchCommandArgs &dispatchCommandArgs,
uint32_t tileCount,
uint32_t partitionCount,
bool emitSelfCleanup,
bool preferStaticPartitioning,
bool staticPartitioning,
bool useSecondaryBatchBuffer,
bool dcFlush,
bool forceExecutionOnSingleTile) {
bool staticPartitioning) {
WalkerPartition::WalkerPartitionArgs args = {};
args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
args.partitionCount = partitionCount;
args.workPartitionAllocationGpuVa = dispatchCommandArgs.workPartitionAllocationGpuVa;
args.partitionCount = dispatchCommandArgs.partitionCount;
args.tileCount = tileCount;
args.staticPartitioning = staticPartitioning;
args.preferredStaticPartitioning = preferStaticPartitioning;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
args.forceExecutionOnSingleTile = dispatchCommandArgs.forceExecutionOnSingleTile;
args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired();
@@ -44,14 +39,16 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired(args.emitPipeControlStall);
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, emitSelfCleanup);
args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, dispatchCommandArgs.apiSelfCleanup);
args.emitBatchBufferEnd = false;
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
args.secondaryBatchBuffer = dispatchCommandArgs.useSecondaryBatchBuffer;
args.dcFlushEnable = dcFlush;
args.dcFlushEnable = dispatchCommandArgs.dcFlush;
args.pipeControlBeforeCleanupCrossTileSync = ImplicitScalingHelper::pipeControlBeforeCleanupAtomicSyncRequired();
args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer;
return args;
}
@@ -74,15 +71,14 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
&partitionType,
&staticPartitioning);
UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(0u,
ImplicitScalingDispatchCommandArgs dispatchCommandArgs = {};
dispatchCommandArgs.partitionCount = partitionCount;
dispatchCommandArgs.apiSelfCleanup = apiSelfCleanup;
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(dispatchCommandArgs,
tileCount,
partitionCount,
apiSelfCleanup,
preferStaticPartitioning,
staticPartitioning,
false,
false,
false);
staticPartitioning);
return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily, WalkerType>(args));
}
@@ -91,62 +87,58 @@ template <typename GfxFamily>
template <typename WalkerType>
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream,
WalkerType &walkerCmd,
void **outWalkerPtr,
const DeviceBitfield &devices,
NEO::RequiredPartitionDim requiredPartitionDim,
uint32_t &partitionCount,
bool useSecondaryBatchBuffer,
bool apiSelfCleanup,
bool dcFlush,
bool forceExecutionOnSingleTile,
uint64_t workPartitionAllocationGpuVa,
const HardwareInfo &hwInfo) {
ImplicitScalingDispatchCommandArgs &dispatchCommandArgs) {
uint32_t totalProgrammedSize = 0u;
const uint32_t tileCount = static_cast<uint32_t>(devices.count());
const bool preferStaticPartitioning = workPartitionAllocationGpuVa != 0u;
const bool preferStaticPartitioning = dispatchCommandArgs.workPartitionAllocationGpuVa != 0u;
bool staticPartitioning = false;
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily, WalkerType>(&walkerCmd, requiredPartitionDim, tileCount, preferStaticPartitioning, &staticPartitioning);
dispatchCommandArgs.partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily, WalkerType>(&walkerCmd, dispatchCommandArgs.requiredPartitionDim, tileCount, preferStaticPartitioning, &staticPartitioning);
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(workPartitionAllocationGpuVa,
tileCount,
partitionCount,
apiSelfCleanup,
preferStaticPartitioning,
staticPartitioning,
useSecondaryBatchBuffer,
dcFlush,
forceExecutionOnSingleTile);
WalkerPartition::WalkerPartitionArgs walkerPartitionArgs = prepareWalkerPartitionArgs<GfxFamily>(dispatchCommandArgs,
tileCount,
preferStaticPartitioning,
staticPartitioning);
size_t dispatchCommandsSize = 0;
void *commandBuffer = nullptr;
uint64_t cmdBufferGpuAddress = 0;
auto dispatchCommandsSize = getSize<WalkerType>(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
if (!dispatchCommandArgs.blockDispatchToCommandBuffer) {
dispatchCommandsSize = getSize<WalkerType>(dispatchCommandArgs.apiSelfCleanup,
preferStaticPartitioning,
devices,
{walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()},
{walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
commandBuffer = commandStream.getSpace(dispatchCommandsSize);
cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
}
if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
UNRECOVERABLE_IF(tileCount != dispatchCommandArgs.partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
outWalkerPtr,
dispatchCommandArgs.outWalkerPtr,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args,
hwInfo);
walkerPartitionArgs,
*dispatchCommandArgs.hwInfo);
} else {
if (debugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
partitionCount = debugManager.flags.ExperimentalSetWalkerPartitionCount.get();
if (partitionCount == 1u) {
dispatchCommandArgs.partitionCount = debugManager.flags.ExperimentalSetWalkerPartitionCount.get();
if (dispatchCommandArgs.partitionCount == 1u) {
walkerCmd.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
}
args.partitionCount = partitionCount;
walkerPartitionArgs.partitionCount = dispatchCommandArgs.partitionCount;
}
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
outWalkerPtr,
dispatchCommandArgs.outWalkerPtr,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args,
hwInfo);
walkerPartitionArgs,
*dispatchCommandArgs.hwInfo);
}
UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
}


@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,7 @@ struct WalkerPartitionArgs {
bool pipeControlBeforeCleanupCrossTileSync = false;
bool dcFlushEnable = false;
bool forceExecutionOnSingleTile = false;
bool blockDispatchToCommandBuffer = false;
};
inline constexpr uint32_t wparidCCSOffset = 0x221C;


@@ -496,8 +496,12 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
WalkerType *inputWalker,
uint32_t partitionCount,
uint32_t tileCount,
bool forceExecutionOnSingleTile) {
auto computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
bool forceExecutionOnSingleTile,
bool blockDispatchToCommandBuffer) {
WalkerType *computeWalker = nullptr;
if (!blockDispatchToCommandBuffer) {
computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
}
if (partitionCount > 1) {
auto partitionType = inputWalker->getPartitionType();
@@ -527,7 +531,9 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, tileCount, workgroupCount);
}
*computeWalker = *inputWalker;
if (!blockDispatchToCommandBuffer) {
*computeWalker = *inputWalker;
}
return computeWalker;
}
@@ -639,7 +645,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
args.secondaryBatchBuffer);
// Walker section
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile);
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
if (outWalkerPtr) {
*outWalkerPtr = walkerPtr;
}
@@ -720,68 +726,74 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
void *currentBatchBufferPointer = cpuPointer;
// Get address of the control section
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily, WalkerType>(args);
const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);
const auto controlSectionOffset = args.blockDispatchToCommandBuffer ? 0u : computeStaticPartitioningControlSectionOffset<GfxFamily, WalkerType>(args);
const auto afterControlSectionOffset = args.blockDispatchToCommandBuffer ? 0u : controlSectionOffset + sizeof(StaticPartitioningControlSection);
// Synchronize tiles before walker
if (args.synchronizeBeforeExecution) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
if (!args.blockDispatchToCommandBuffer) {
// Synchronize tiles before walker
if (args.synchronizeBeforeExecution) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}
// Load partition ID to wparid register and execute walker
if (args.initializeWparidRegister) {
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
}
}
// Load partition ID to wparid register and execute walker
if (args.initializeWparidRegister) {
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
}
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile);
if (outWalkerPtr) {
*outWalkerPtr = walkerPtr;
}
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
// Prepare for cleanup section
if (args.emitSelfCleanup) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
}
if (!args.blockDispatchToCommandBuffer) {
if (outWalkerPtr) {
*outWalkerPtr = walkerPtr;
}
if (args.emitPipeControlStall) {
NEO::PipeControlArgs pipeControlArgs;
pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
}
// Prepare for cleanup section
if (args.emitSelfCleanup) {
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
}
// Synchronize tiles after walker
if (args.semaphoreProgrammingRequired) {
programTilesSynchronizationWithPostSyncs<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
if (args.emitPipeControlStall) {
NEO::PipeControlArgs pipeControlArgs;
pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
}
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}
// Synchronize tiles after walker
if (args.semaphoreProgrammingRequired) {
programTilesSynchronizationWithPostSyncs<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
// Jump over the control section only when needed
if (isStartAndControlSectionRequired<GfxFamily>(args)) {
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
}
// Control section
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
StaticPartitioningControlSection *controlSection = putCommand<StaticPartitioningControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
controlSection->synchronizeBeforeWalkerCounter = 0u;
controlSection->synchronizeAfterWalkerCounter = 0u;
controlSection->finalSyncTileCounter = 0u;
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
}
// Jump over the control section only when needed
if (isStartAndControlSectionRequired<GfxFamily>(args)) {
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
// Cleanup section
if (args.emitSelfCleanup) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
args);
// Control section
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
StaticPartitioningControlSection *controlSection = putCommand<StaticPartitioningControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
controlSection->synchronizeBeforeWalkerCounter = 0u;
controlSection->synchronizeAfterWalkerCounter = 0u;
controlSection->finalSyncTileCounter = 0u;
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
}
// Cleanup section
if (args.emitSelfCleanup) {
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
totalBytesProgrammed,
finalSyncTileCountAddress,
gpuAddressOfAllocation + controlSectionOffset,
staticPartitioningFieldsForCleanupCount,
args);
}
}
}


@@ -19,10 +19,8 @@ template <>
bool ImplicitScalingDispatch<Family>::pipeControlStallRequired = false;
template struct ImplicitScalingDispatch<Family>;
template void ImplicitScalingDispatch<Family>::dispatchCommands<DefaultWalkerType>(LinearStream &commandStream, DefaultWalkerType &walkerCmd, void **outWalkerPtr,
const DeviceBitfield &devices, RequiredPartitionDim requiredPartitionDim, uint32_t &partitionCount,
bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool dcFlush, bool forceExecutionOnSingleTile,
uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
template void ImplicitScalingDispatch<Family>::dispatchCommands<DefaultWalkerType>(LinearStream &commandStream, DefaultWalkerType &walkerCmd, const DeviceBitfield &devices,
ImplicitScalingDispatchCommandArgs &dispatchCommandArgs);
template size_t ImplicitScalingDispatch<Family>::getSize<DefaultWalkerType>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
} // namespace NEO


@@ -32,10 +32,8 @@ bool ImplicitScalingDispatch<Family>::platformSupportsImplicitScaling(const Root
}
template struct ImplicitScalingDispatch<Family>;
template void ImplicitScalingDispatch<Family>::dispatchCommands<DefaultWalkerType>(LinearStream &commandStream, DefaultWalkerType &walkerCmd, void **outWalkerPtr,
const DeviceBitfield &devices, RequiredPartitionDim requiredPartitionDim, uint32_t &partitionCount,
bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool dcFlush, bool forceExecutionOnSingleTile,
uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
template void ImplicitScalingDispatch<Family>::dispatchCommands<DefaultWalkerType>(LinearStream &commandStream, DefaultWalkerType &walkerCmd, const DeviceBitfield &devices,
ImplicitScalingDispatchCommandArgs &dispatchCommandArgs);
template size_t ImplicitScalingDispatch<Family>::getSize<DefaultWalkerType>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
} // namespace NEO


@@ -19,10 +19,8 @@ template <>
bool ImplicitScalingDispatch<Family>::pipeControlStallRequired = true;
template struct ImplicitScalingDispatch<Family>;
template void ImplicitScalingDispatch<Family>::dispatchCommands<DefaultWalkerType>(LinearStream &commandStream, DefaultWalkerType &walkerCmd, void **outWalkerPtr,
const DeviceBitfield &devices, RequiredPartitionDim requiredPartitionDim, uint32_t &partitionCount,
bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool dcFlush, bool forceExecutionOnSingleTile,
uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
template void ImplicitScalingDispatch<Family>::dispatchCommands<DefaultWalkerType>(LinearStream &commandStream, DefaultWalkerType &walkerCmd, const DeviceBitfield &devices,
ImplicitScalingDispatchCommandArgs &dispatchCommandArgs);
template size_t ImplicitScalingDispatch<Family>::getSize<DefaultWalkerType>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
} // namespace NEO


@@ -720,7 +720,20 @@ HWTEST2_F(CommandEncoderTests, whenAskingForImplicitScalingValuesThenAlwaysRetur
EXPECT_EQ(0u, ImplicitScalingDispatch<FamilyType>::template getSize<WalkerType>(false, false, deviceBitField, vec3, vec3));
void *ptr = nullptr;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(linearStream, walkerCmd, &ptr, deviceBitField, RequiredPartitionDim::x, partitionCount, false, false, false, false, 0, *defaultHwInfo);
ImplicitScalingDispatchCommandArgs args{
0, // workPartitionAllocationGpuVa
defaultHwInfo.get(), // hwInfo
&ptr, // outWalkerPtr
RequiredPartitionDim::x, // requiredPartitionDim
partitionCount, // partitionCount
false, // useSecondaryBatchBuffer
false, // apiSelfCleanup
false, // dcFlush
false, // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer
ImplicitScalingDispatch<FamilyType>::dispatchCommands(linearStream, walkerCmd, deviceBitField, args);
EXPECT_EQ(0u, linearStream.getUsed());
EXPECT_TRUE(ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired());


@@ -1702,3 +1702,47 @@ HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenRequestingComman
EXPECT_ANY_THROW(EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(*cmdContainer.get(), dispatchArgs));
}
struct MultiTileCommandEncodeStatesFixture : public CommandEncodeStatesFixture {
void setUp() {
debugManager.flags.CreateMultipleSubDevices.set(2);
CommandEncodeStatesFixture::setUp();
}
DebugManagerStateRestore restorer;
};
using MultiTileCommandEncodeStatesTest = Test<MultiTileCommandEncodeStatesFixture>;
HWTEST2_F(MultiTileCommandEncodeStatesTest, givenEncodeDispatchKernelInImplicitScalingWhenRequestingCommandViewThenDoNotConsumeCmdBufferAndHeapSpace, IsAtLeastXeHpCore) {
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
uint32_t dims[] = {1, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(pDevice->getDefaultEngine().commandStreamReceiver);
ultCsr->staticWorkPartitioningEnabled = true;
ultCsr->createWorkPartitionAllocation(*pDevice);
auto payloadHeap = cmdContainer->getIndirectHeap(HeapType::indirectObject);
auto payloadHeapUsed = payloadHeap->getUsed();
auto cmdBuffer = cmdContainer->getCommandStream();
auto cmdBufferUsed = cmdBuffer->getUsed();
uint8_t payloadView[256] = {};
dispatchInterface->getCrossThreadDataSizeResult = 64;
auto walkerPtr = std::make_unique<DefaultWalkerType>();
DefaultWalkerType *cpuWalkerPointer = walkerPtr.get();
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.makeCommandView = true;
dispatchArgs.partitionCount = 2;
dispatchArgs.cpuPayloadBuffer = payloadView;
dispatchArgs.cpuWalkerBuffer = cpuWalkerPointer;
EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(*cmdContainer.get(), dispatchArgs);
EXPECT_EQ(payloadHeapUsed, payloadHeap->getUsed());
EXPECT_EQ(cmdBufferUsed, cmdBuffer->getUsed());
}


@@ -36,11 +36,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCm
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(0, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(2u, partitionCount);
EXPECT_EQ(2u, dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -79,11 +80,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndNoPartiti
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, false, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(0, partitionCount);
dispatchArgs.useSecondaryBatchBuffer = false;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(1u, partitionCount);
EXPECT_EQ(1u, dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -123,11 +126,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndPartition
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, 0u, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(0, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(1u, partitionCount);
EXPECT_EQ(1u, dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -170,11 +174,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(2u, partitionCount);
EXPECT_EQ(2u, dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -222,11 +227,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenPa
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(2u, partitionCount);
EXPECT_EQ(2u, dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -276,11 +282,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -327,11 +334,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -364,11 +372,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -401,11 +410,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<DefaultWalkerType>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -449,11 +459,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, true, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -517,11 +529,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, true, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -577,11 +591,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, true, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -637,11 +653,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, true, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -704,11 +722,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, true, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -767,11 +787,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -832,11 +853,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -900,11 +922,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, true, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -967,11 +991,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, NEO::RequiredPartitionDim::none, partitionCount, true, false, dcFlushFlag,
forceExecutionOnSingleTileFlag, workPartitionAllocationAddress, *defaultHwInfo);
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
HardwareParse hwParser;
hwParser.parsePipeControl = true;
@@ -1564,3 +1589,40 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
givenStaticPartitioningWhenBlockDispatchFlagIsTrueThenDoNotDispatchAnyCommands) {
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
using PostSyncType = typename DefaultWalkerType::PostSyncType;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);
DefaultWalkerType walker = FamilyType::template getInitGpuWalker<DefaultWalkerType>();
walker.setThreadGroupIdXDimension(32);
auto &postSync = walker.getPostSync();
postSync.setOperation(PostSyncType::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);
DefaultWalkerType walkerDispatched = walker;
uint64_t workPartitionAllocationAddress = 0x1000;
size_t expectedTotalBytesProgrammed = 0;
void *outWalkerPtr = nullptr;
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.blockDispatchToCommandBuffer = true;
dispatchArgs.outWalkerPtr = &outWalkerPtr;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
size_t totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedTotalBytesProgrammed, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), dispatchArgs.partitionCount);
EXPECT_EQ(nullptr, outWalkerPtr);
dispatchArgs.blockDispatchToCommandBuffer = false;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walkerDispatched, twoTile, dispatchArgs);
ASSERT_NE(nullptr, outWalkerPtr);
EXPECT_EQ(0, memcmp(&walkerDispatched, outWalkerPtr, sizeof(DefaultWalkerType)));
}


@@ -426,7 +426,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
void *walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false);
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -437,7 +437,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y);
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false);
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -446,7 +446,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Z);
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false);
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -456,7 +456,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
// if we program with partition Count == 1 then do not trigger partition stuff
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, 2, false);
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, 2, false, false);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -1763,7 +1763,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
bool forceExecutionOnSingleTile = false;
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
void *walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile);
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -1773,7 +1773,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
forceExecutionOnSingleTile = true;
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile);
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);


@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -39,3 +39,19 @@ void ImplicitScalingFixture::tearDown() {
alignedFree(alignedMemory);
CommandEncodeStatesFixture::tearDown();
}
ImplicitScalingDispatchCommandArgs ImplicitScalingFixture::createDispatchCommandArgs(uint64_t workPartitionAllocationAddress, uint32_t partitionCount) {
ImplicitScalingDispatchCommandArgs args{
workPartitionAllocationAddress, // workPartitionAllocationGpuVa
defaultHwInfo.get(), // hwInfo
nullptr, // outWalkerPtr
NEO::RequiredPartitionDim::none, // requiredPartitionDim
partitionCount, // partitionCount
true, // useSecondaryBatchBuffer
false, // apiSelfCleanup
dcFlushFlag, // dcFlush
forceExecutionOnSingleTileFlag, // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer
return args;
}
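
As a usage note, this short snippet restates how the tests above combine the new fixture helper with the flag; it mirrors the block-dispatch test earlier in this commit and adds no new behavior.

// The helper builds default dispatch args; the test then opts into command-view mode
// and verifies that nothing reaches the command stream.
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.blockDispatchToCommandBuffer = true; // build the walker view only
dispatchArgs.outWalkerPtr = &outWalkerPtr;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
EXPECT_EQ(0u, commandStream.getUsed()); // no commands emitted to the command buffer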


@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@ struct ImplicitScalingFixture : public CommandEncodeStatesFixture {
void setUp();
void tearDown();
ImplicitScalingDispatchCommandArgs createDispatchCommandArgs(uint64_t workPartitionAllocationAddress, uint32_t partitionCount);
static constexpr uint64_t gpuVa = (1ull << 48);
static constexpr size_t bufferSize = 1024u;
DebugManagerStateRestore restorer;