performance: avoid reading from gfx memory when modifying walker command

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-09-11 15:25:33 +00:00
committed by Compute-Runtime-Automation
parent 487b02a2ac
commit 8bb92ff445
8 changed files with 68 additions and 31 deletions

View File

@@ -162,25 +162,25 @@ inline void HardwareInterface<GfxFamily>::programWalker(
uint32_t partitionCount = 0u;
RequiredPartitionDim requiredPartitionDim = kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none;
void *outWalker = nullptr;
ImplicitScalingDispatchCommandArgs implicitScalingArgs{
workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa
&hwInfo, // hwInfo
&outWalker, // outWalkerPtr
nullptr, // outWalkerPtr
requiredPartitionDim, // requiredPartitionDim
partitionCount, // partitionCount
workgroupSize, // workgroupSize
maxWgCountPerTile, // maxWgCountPerTile
false, // useSecondaryBatchBuffer
false, // apiSelfCleanup
queueCsr.getDcFlushSupport(), // dcFlush
kernel.isSingleSubdevicePreferred(), // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer
false, // blockDispatchToCommandBuffer
requiredWalkOrder != 0}; // isRequiredWorkGroupOrder
ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream,
walkerCmd,
devices,
implicitScalingArgs);
EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*static_cast<WalkerType *>(outWalker), hwInfo, implicitScalingArgs.partitionCount, workgroupSize, maxWgCountPerTile, requiredWalkOrder != 0);
if (queueCsr.isStaticWorkPartitioningEnabled()) {
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), implicitScalingArgs.partitionCount));

View File

@@ -413,7 +413,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2];
bool isRequiredWorkGroupOrder = args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none;
if (args.partitionCount > 1 && !args.isInternal) {
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
@@ -423,24 +423,23 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
&args.outWalkerPtr, // outWalkerPtr
args.requiredPartitionDim, // requiredPartitionDim
args.partitionCount, // partitionCount
workgroupSize, // workgroupSize
args.maxWgCountPerTile, // maxWgCountPerTile
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
args.dcFlushEnable, // dcFlush
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
args.makeCommandView}; // blockDispatchToCommandBuffer
args.makeCommandView, // blockDispatchToCommandBuffer
isRequiredWorkGroupOrder}; // isRequiredWorkGroupOrder
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
walkerCmd,
args.device->getDeviceBitfield(),
implicitScalingArgs);
args.partitionCount = implicitScalingArgs.partitionCount;
void *walkerToModify = args.outWalkerPtr ? args.outWalkerPtr : &walkerCmd;
EncodeDispatchKernel<Family>::setWalkerRegionSettings(*static_cast<WalkerType *>(walkerToModify), hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none);
} else {
args.partitionCount = 1;
EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none);
EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, isRequiredWorkGroupOrder);
if (!args.makeCommandView) {
auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>();

View File

@@ -44,12 +44,15 @@ struct ImplicitScalingDispatchCommandArgs {
RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none;
uint32_t partitionCount = 0;
uint32_t workgroupSize = 0;
uint32_t maxWgCountPerTile = 0;
bool useSecondaryBatchBuffer = false;
bool apiSelfCleanup = false;
bool dcFlush = false;
bool forceExecutionOnSingleTile = false;
bool blockDispatchToCommandBuffer = false;
bool isRequiredWorkGroupOrder = false;
};
template <typename GfxFamily>

View File

@@ -49,6 +49,10 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingD
args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer;
args.workgroupSize = dispatchCommandArgs.workgroupSize;
args.maxWgCountPerTile = dispatchCommandArgs.maxWgCountPerTile;
args.isRequiredWorkGroupOrder = dispatchCommandArgs.isRequiredWorkGroupOrder;
return args;
}

View File

@@ -17,6 +17,8 @@ struct WalkerPartitionArgs {
uint64_t postSyncImmediateValue = 0;
uint32_t partitionCount = 0;
uint32_t tileCount = 0;
uint32_t workgroupSize = 0;
uint32_t maxWgCountPerTile = 0;
bool emitBatchBufferEnd = false;
bool secondaryBatchBuffer = false;
bool synchronizeBeforeExecution = false;
@@ -33,6 +35,7 @@ struct WalkerPartitionArgs {
bool dcFlushEnable = false;
bool forceExecutionOnSingleTile = false;
bool blockDispatchToCommandBuffer = false;
bool isRequiredWorkGroupOrder = false;
};
inline constexpr uint32_t wparidCCSOffset = 0x221C;

View File

@@ -494,16 +494,14 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
template <typename GfxFamily, typename WalkerType>
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
WalkerType *inputWalker,
uint32_t partitionCount,
uint32_t tileCount,
bool forceExecutionOnSingleTile,
bool blockDispatchToCommandBuffer) {
WalkerPartitionArgs &args,
const NEO::HardwareInfo &hwInfo) {
WalkerType *computeWalker = nullptr;
if (!blockDispatchToCommandBuffer) {
if (!args.blockDispatchToCommandBuffer) {
computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
}
if (partitionCount > 1) {
if (args.partitionCount > 1) {
auto partitionType = inputWalker->getPartitionType();
assert(inputWalker->getThreadGroupIdStartingX() == 0u);
@@ -522,16 +520,23 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
workgroupCount = inputWalker->getThreadGroupIdZDimension();
}
if (forceExecutionOnSingleTile) {
if (args.forceExecutionOnSingleTile) {
inputWalker->setPartitionSize(workgroupCount);
} else {
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount));
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, args.partitionCount));
}
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, tileCount, workgroupCount);
NEO::EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*inputWalker,
hwInfo,
args.partitionCount,
args.workgroupSize,
args.maxWgCountPerTile,
args.isRequiredWorkGroupOrder);
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, args.tileCount, workgroupCount);
}
if (!blockDispatchToCommandBuffer) {
if (computeWalker != nullptr) {
*computeWalker = *inputWalker;
}
@@ -645,7 +650,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
args.secondaryBatchBuffer);
// Walker section
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo);
if (outWalkerPtr) {
*outWalkerPtr = walkerPtr;
}
@@ -742,7 +747,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
}
}
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo);
if (!args.blockDispatchToCommandBuffer) {
if (outWalkerPtr) {

View File

@@ -426,7 +426,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
void *walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -437,7 +440,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y);
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
args = {};
args.partitionCount = 2;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -446,7 +452,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Z);
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
args = {};
args.partitionCount = 2;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -456,7 +465,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
// if we program with partition Count == 1 then do not trigger partition stuff
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, 2, false, false);
args = {};
args.partitionCount = 1;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -1763,7 +1775,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
bool forceExecutionOnSingleTile = false;
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
void *walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false);
WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);
@@ -1773,7 +1789,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
forceExecutionOnSingleTile = true;
walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false);
args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand);

View File

@@ -47,11 +47,14 @@ ImplicitScalingDispatchCommandArgs ImplicitScalingFixture::createDispatchCommand
nullptr, // outWalkerPtr
NEO::RequiredPartitionDim::none, // requiredPartitionDim
partitionCount, // partitionCount
1, // workgroupSize
1, // maxWgCountPerTile
true, // useSecondaryBatchBuffer
false, // apiSelfCleanup
dcFlushFlag, // dcFlush
forceExecutionOnSingleTileFlag, // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer
false, // blockDispatchToCommandBuffer
false}; // isRequiredWorkGroupOrder
return args;
}