performance: avoid reading from gfx memory when modifying walker command

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-09-11 15:25:33 +00:00
committed by Compute-Runtime-Automation
parent 487b02a2ac
commit 8bb92ff445
8 changed files with 68 additions and 31 deletions

View File

@@ -162,25 +162,25 @@ inline void HardwareInterface<GfxFamily>::programWalker(
uint32_t partitionCount = 0u; uint32_t partitionCount = 0u;
RequiredPartitionDim requiredPartitionDim = kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none; RequiredPartitionDim requiredPartitionDim = kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none;
void *outWalker = nullptr;
ImplicitScalingDispatchCommandArgs implicitScalingArgs{ ImplicitScalingDispatchCommandArgs implicitScalingArgs{
workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa
&hwInfo, // hwInfo &hwInfo, // hwInfo
&outWalker, // outWalkerPtr nullptr, // outWalkerPtr
requiredPartitionDim, // requiredPartitionDim requiredPartitionDim, // requiredPartitionDim
partitionCount, // partitionCount partitionCount, // partitionCount
workgroupSize, // workgroupSize
maxWgCountPerTile, // maxWgCountPerTile
false, // useSecondaryBatchBuffer false, // useSecondaryBatchBuffer
false, // apiSelfCleanup false, // apiSelfCleanup
queueCsr.getDcFlushSupport(), // dcFlush queueCsr.getDcFlushSupport(), // dcFlush
kernel.isSingleSubdevicePreferred(), // forceExecutionOnSingleTile kernel.isSingleSubdevicePreferred(), // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer false, // blockDispatchToCommandBuffer
requiredWalkOrder != 0}; // isRequiredWorkGroupOrder
ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream, ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream,
walkerCmd, walkerCmd,
devices, devices,
implicitScalingArgs); implicitScalingArgs);
EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*static_cast<WalkerType *>(outWalker), hwInfo, implicitScalingArgs.partitionCount, workgroupSize, maxWgCountPerTile, requiredWalkOrder != 0);
if (queueCsr.isStaticWorkPartitioningEnabled()) { if (queueCsr.isStaticWorkPartitioningEnabled()) {
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), implicitScalingArgs.partitionCount)); queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), implicitScalingArgs.partitionCount));

View File

@@ -413,7 +413,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device); PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2]; uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2];
bool isRequiredWorkGroupOrder = args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none;
if (args.partitionCount > 1 && !args.isInternal) { if (args.partitionCount > 1 && !args.isInternal) {
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
@@ -423,24 +423,23 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
&args.outWalkerPtr, // outWalkerPtr &args.outWalkerPtr, // outWalkerPtr
args.requiredPartitionDim, // requiredPartitionDim args.requiredPartitionDim, // requiredPartitionDim
args.partitionCount, // partitionCount args.partitionCount, // partitionCount
workgroupSize, // workgroupSize
args.maxWgCountPerTile, // maxWgCountPerTile
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup !args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
args.dcFlushEnable, // dcFlush args.dcFlushEnable, // dcFlush
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
args.makeCommandView}; // blockDispatchToCommandBuffer args.makeCommandView, // blockDispatchToCommandBuffer
isRequiredWorkGroupOrder}; // isRequiredWorkGroupOrder
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream, ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
walkerCmd, walkerCmd,
args.device->getDeviceBitfield(), args.device->getDeviceBitfield(),
implicitScalingArgs); implicitScalingArgs);
args.partitionCount = implicitScalingArgs.partitionCount; args.partitionCount = implicitScalingArgs.partitionCount;
void *walkerToModify = args.outWalkerPtr ? args.outWalkerPtr : &walkerCmd;
EncodeDispatchKernel<Family>::setWalkerRegionSettings(*static_cast<WalkerType *>(walkerToModify), hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none);
} else { } else {
args.partitionCount = 1; args.partitionCount = 1;
EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none); EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, isRequiredWorkGroupOrder);
if (!args.makeCommandView) { if (!args.makeCommandView) {
auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>(); auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>();

View File

@@ -44,12 +44,15 @@ struct ImplicitScalingDispatchCommandArgs {
RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none; RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none;
uint32_t partitionCount = 0; uint32_t partitionCount = 0;
uint32_t workgroupSize = 0;
uint32_t maxWgCountPerTile = 0;
bool useSecondaryBatchBuffer = false; bool useSecondaryBatchBuffer = false;
bool apiSelfCleanup = false; bool apiSelfCleanup = false;
bool dcFlush = false; bool dcFlush = false;
bool forceExecutionOnSingleTile = false; bool forceExecutionOnSingleTile = false;
bool blockDispatchToCommandBuffer = false; bool blockDispatchToCommandBuffer = false;
bool isRequiredWorkGroupOrder = false;
}; };
template <typename GfxFamily> template <typename GfxFamily>

View File

@@ -49,6 +49,10 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingD
args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer; args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer;
args.workgroupSize = dispatchCommandArgs.workgroupSize;
args.maxWgCountPerTile = dispatchCommandArgs.maxWgCountPerTile;
args.isRequiredWorkGroupOrder = dispatchCommandArgs.isRequiredWorkGroupOrder;
return args; return args;
} }

View File

@@ -17,6 +17,8 @@ struct WalkerPartitionArgs {
uint64_t postSyncImmediateValue = 0; uint64_t postSyncImmediateValue = 0;
uint32_t partitionCount = 0; uint32_t partitionCount = 0;
uint32_t tileCount = 0; uint32_t tileCount = 0;
uint32_t workgroupSize = 0;
uint32_t maxWgCountPerTile = 0;
bool emitBatchBufferEnd = false; bool emitBatchBufferEnd = false;
bool secondaryBatchBuffer = false; bool secondaryBatchBuffer = false;
bool synchronizeBeforeExecution = false; bool synchronizeBeforeExecution = false;
@@ -33,6 +35,7 @@ struct WalkerPartitionArgs {
bool dcFlushEnable = false; bool dcFlushEnable = false;
bool forceExecutionOnSingleTile = false; bool forceExecutionOnSingleTile = false;
bool blockDispatchToCommandBuffer = false; bool blockDispatchToCommandBuffer = false;
bool isRequiredWorkGroupOrder = false;
}; };
inline constexpr uint32_t wparidCCSOffset = 0x221C; inline constexpr uint32_t wparidCCSOffset = 0x221C;

View File

@@ -494,16 +494,14 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
template <typename GfxFamily, typename WalkerType> template <typename GfxFamily, typename WalkerType>
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed, void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
WalkerType *inputWalker, WalkerType *inputWalker,
uint32_t partitionCount, WalkerPartitionArgs &args,
uint32_t tileCount, const NEO::HardwareInfo &hwInfo) {
bool forceExecutionOnSingleTile,
bool blockDispatchToCommandBuffer) {
WalkerType *computeWalker = nullptr; WalkerType *computeWalker = nullptr;
if (!blockDispatchToCommandBuffer) { if (!args.blockDispatchToCommandBuffer) {
computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed); computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
} }
if (partitionCount > 1) { if (args.partitionCount > 1) {
auto partitionType = inputWalker->getPartitionType(); auto partitionType = inputWalker->getPartitionType();
assert(inputWalker->getThreadGroupIdStartingX() == 0u); assert(inputWalker->getThreadGroupIdStartingX() == 0u);
@@ -522,16 +520,23 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
workgroupCount = inputWalker->getThreadGroupIdZDimension(); workgroupCount = inputWalker->getThreadGroupIdZDimension();
} }
if (forceExecutionOnSingleTile) { if (args.forceExecutionOnSingleTile) {
inputWalker->setPartitionSize(workgroupCount); inputWalker->setPartitionSize(workgroupCount);
} else { } else {
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount)); inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, args.partitionCount));
} }
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, tileCount, workgroupCount); NEO::EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*inputWalker,
hwInfo,
args.partitionCount,
args.workgroupSize,
args.maxWgCountPerTile,
args.isRequiredWorkGroupOrder);
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, args.tileCount, workgroupCount);
} }
if (!blockDispatchToCommandBuffer) { if (computeWalker != nullptr) {
*computeWalker = *inputWalker; *computeWalker = *inputWalker;
} }
@@ -645,7 +650,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
args.secondaryBatchBuffer); args.secondaryBatchBuffer);
// Walker section // Walker section
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer); auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo);
if (outWalkerPtr) { if (outWalkerPtr) {
*outWalkerPtr = walkerPtr; *outWalkerPtr = walkerPtr;
} }
@@ -742,7 +747,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
} }
} }
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer); auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo);
if (!args.blockDispatchToCommandBuffer) { if (!args.blockDispatchToCommandBuffer) {
if (outWalkerPtr) { if (outWalkerPtr) {

View File

@@ -426,7 +426,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X); walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
void *walkerCommandAddress = cmdBufferAddress; void *walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false); WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress); auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand); ASSERT_NE(nullptr, walkerCommand);
@@ -437,7 +440,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y); walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y);
walkerCommandAddress = cmdBufferAddress; walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false); args = {};
args.partitionCount = 2;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress); walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand); ASSERT_NE(nullptr, walkerCommand);
@@ -446,7 +452,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Z); walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Z);
walkerCommandAddress = cmdBufferAddress; walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false); args = {};
args.partitionCount = 2;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress); walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand); ASSERT_NE(nullptr, walkerCommand);
@@ -456,7 +465,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
// if we program with partition Count == 1 then do not trigger partition stuff // if we program with partition Count == 1 then do not trigger partition stuff
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED); walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
walkerCommandAddress = cmdBufferAddress; walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, 2, false, false); args = {};
args.partitionCount = 1;
args.tileCount = 2;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress); walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand); ASSERT_NE(nullptr, walkerCommand);
@@ -1763,7 +1775,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
bool forceExecutionOnSingleTile = false; bool forceExecutionOnSingleTile = false;
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X); walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
void *walkerCommandAddress = cmdBufferAddress; void *walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false); WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress); auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand); ASSERT_NE(nullptr, walkerCommand);
@@ -1773,7 +1789,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
forceExecutionOnSingleTile = true; forceExecutionOnSingleTile = true;
walkerCommandAddress = cmdBufferAddress; walkerCommandAddress = cmdBufferAddress;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false); args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress); walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
ASSERT_NE(nullptr, walkerCommand); ASSERT_NE(nullptr, walkerCommand);

View File

@@ -47,11 +47,14 @@ ImplicitScalingDispatchCommandArgs ImplicitScalingFixture::createDispatchCommand
nullptr, // outWalkerPtr nullptr, // outWalkerPtr
NEO::RequiredPartitionDim::none, // requiredPartitionDim NEO::RequiredPartitionDim::none, // requiredPartitionDim
partitionCount, // partitionCount partitionCount, // partitionCount
1, // workgroupSize
1, // maxWgCountPerTile
true, // useSecondaryBatchBuffer true, // useSecondaryBatchBuffer
false, // apiSelfCleanup false, // apiSelfCleanup
dcFlushFlag, // dcFlush dcFlushFlag, // dcFlush
forceExecutionOnSingleTileFlag, // forceExecutionOnSingleTile forceExecutionOnSingleTileFlag, // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer false, // blockDispatchToCommandBuffer
false}; // isRequiredWorkGroupOrder
return args; return args;
} }