mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-21 09:14:47 +08:00
performance: avoid reading from gfx memory when modifying walker command
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
487b02a2ac
commit
8bb92ff445
@@ -162,25 +162,25 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
uint32_t partitionCount = 0u;
|
||||
RequiredPartitionDim requiredPartitionDim = kernel.usesImages() ? RequiredPartitionDim::x : RequiredPartitionDim::none;
|
||||
|
||||
void *outWalker = nullptr;
|
||||
|
||||
ImplicitScalingDispatchCommandArgs implicitScalingArgs{
|
||||
workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa
|
||||
&hwInfo, // hwInfo
|
||||
&outWalker, // outWalkerPtr
|
||||
nullptr, // outWalkerPtr
|
||||
requiredPartitionDim, // requiredPartitionDim
|
||||
partitionCount, // partitionCount
|
||||
workgroupSize, // workgroupSize
|
||||
maxWgCountPerTile, // maxWgCountPerTile
|
||||
false, // useSecondaryBatchBuffer
|
||||
false, // apiSelfCleanup
|
||||
queueCsr.getDcFlushSupport(), // dcFlush
|
||||
kernel.isSingleSubdevicePreferred(), // forceExecutionOnSingleTile
|
||||
false}; // blockDispatchToCommandBuffer
|
||||
false, // blockDispatchToCommandBuffer
|
||||
requiredWalkOrder != 0}; // isRequiredWorkGroupOrder
|
||||
|
||||
ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream,
|
||||
walkerCmd,
|
||||
devices,
|
||||
implicitScalingArgs);
|
||||
EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*static_cast<WalkerType *>(outWalker), hwInfo, implicitScalingArgs.partitionCount, workgroupSize, maxWgCountPerTile, requiredWalkOrder != 0);
|
||||
|
||||
if (queueCsr.isStaticWorkPartitioningEnabled()) {
|
||||
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), implicitScalingArgs.partitionCount));
|
||||
|
||||
@@ -413,7 +413,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
|
||||
|
||||
uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2];
|
||||
|
||||
bool isRequiredWorkGroupOrder = args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none;
|
||||
if (args.partitionCount > 1 && !args.isInternal) {
|
||||
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
|
||||
|
||||
@@ -423,24 +423,23 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
&args.outWalkerPtr, // outWalkerPtr
|
||||
args.requiredPartitionDim, // requiredPartitionDim
|
||||
args.partitionCount, // partitionCount
|
||||
workgroupSize, // workgroupSize
|
||||
args.maxWgCountPerTile, // maxWgCountPerTile
|
||||
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
|
||||
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
|
||||
args.dcFlushEnable, // dcFlush
|
||||
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
|
||||
args.makeCommandView}; // blockDispatchToCommandBuffer
|
||||
args.makeCommandView, // blockDispatchToCommandBuffer
|
||||
isRequiredWorkGroupOrder}; // isRequiredWorkGroupOrder
|
||||
|
||||
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
|
||||
walkerCmd,
|
||||
args.device->getDeviceBitfield(),
|
||||
implicitScalingArgs);
|
||||
args.partitionCount = implicitScalingArgs.partitionCount;
|
||||
|
||||
void *walkerToModify = args.outWalkerPtr ? args.outWalkerPtr : &walkerCmd;
|
||||
|
||||
EncodeDispatchKernel<Family>::setWalkerRegionSettings(*static_cast<WalkerType *>(walkerToModify), hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none);
|
||||
} else {
|
||||
args.partitionCount = 1;
|
||||
EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none);
|
||||
EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, hwInfo, args.partitionCount, workgroupSize, args.maxWgCountPerTile, isRequiredWorkGroupOrder);
|
||||
|
||||
if (!args.makeCommandView) {
|
||||
auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>();
|
||||
|
||||
@@ -44,12 +44,15 @@ struct ImplicitScalingDispatchCommandArgs {
|
||||
|
||||
RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none;
|
||||
uint32_t partitionCount = 0;
|
||||
uint32_t workgroupSize = 0;
|
||||
uint32_t maxWgCountPerTile = 0;
|
||||
|
||||
bool useSecondaryBatchBuffer = false;
|
||||
bool apiSelfCleanup = false;
|
||||
bool dcFlush = false;
|
||||
bool forceExecutionOnSingleTile = false;
|
||||
bool blockDispatchToCommandBuffer = false;
|
||||
bool isRequiredWorkGroupOrder = false;
|
||||
};
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
||||
@@ -49,6 +49,10 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingD
|
||||
|
||||
args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer;
|
||||
|
||||
args.workgroupSize = dispatchCommandArgs.workgroupSize;
|
||||
args.maxWgCountPerTile = dispatchCommandArgs.maxWgCountPerTile;
|
||||
args.isRequiredWorkGroupOrder = dispatchCommandArgs.isRequiredWorkGroupOrder;
|
||||
|
||||
return args;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@ struct WalkerPartitionArgs {
|
||||
uint64_t postSyncImmediateValue = 0;
|
||||
uint32_t partitionCount = 0;
|
||||
uint32_t tileCount = 0;
|
||||
uint32_t workgroupSize = 0;
|
||||
uint32_t maxWgCountPerTile = 0;
|
||||
bool emitBatchBufferEnd = false;
|
||||
bool secondaryBatchBuffer = false;
|
||||
bool synchronizeBeforeExecution = false;
|
||||
@@ -33,6 +35,7 @@ struct WalkerPartitionArgs {
|
||||
bool dcFlushEnable = false;
|
||||
bool forceExecutionOnSingleTile = false;
|
||||
bool blockDispatchToCommandBuffer = false;
|
||||
bool isRequiredWorkGroupOrder = false;
|
||||
};
|
||||
|
||||
inline constexpr uint32_t wparidCCSOffset = 0x221C;
|
||||
|
||||
@@ -494,16 +494,14 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
|
||||
WalkerType *inputWalker,
|
||||
uint32_t partitionCount,
|
||||
uint32_t tileCount,
|
||||
bool forceExecutionOnSingleTile,
|
||||
bool blockDispatchToCommandBuffer) {
|
||||
WalkerPartitionArgs &args,
|
||||
const NEO::HardwareInfo &hwInfo) {
|
||||
WalkerType *computeWalker = nullptr;
|
||||
if (!blockDispatchToCommandBuffer) {
|
||||
if (!args.blockDispatchToCommandBuffer) {
|
||||
computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
|
||||
}
|
||||
|
||||
if (partitionCount > 1) {
|
||||
if (args.partitionCount > 1) {
|
||||
auto partitionType = inputWalker->getPartitionType();
|
||||
|
||||
assert(inputWalker->getThreadGroupIdStartingX() == 0u);
|
||||
@@ -522,16 +520,23 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
|
||||
workgroupCount = inputWalker->getThreadGroupIdZDimension();
|
||||
}
|
||||
|
||||
if (forceExecutionOnSingleTile) {
|
||||
if (args.forceExecutionOnSingleTile) {
|
||||
inputWalker->setPartitionSize(workgroupCount);
|
||||
} else {
|
||||
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount));
|
||||
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, args.partitionCount));
|
||||
}
|
||||
|
||||
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, tileCount, workgroupCount);
|
||||
NEO::EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*inputWalker,
|
||||
hwInfo,
|
||||
args.partitionCount,
|
||||
args.workgroupSize,
|
||||
args.maxWgCountPerTile,
|
||||
args.isRequiredWorkGroupOrder);
|
||||
|
||||
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, args.tileCount, workgroupCount);
|
||||
}
|
||||
|
||||
if (!blockDispatchToCommandBuffer) {
|
||||
if (computeWalker != nullptr) {
|
||||
*computeWalker = *inputWalker;
|
||||
}
|
||||
|
||||
@@ -645,7 +650,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
|
||||
args.secondaryBatchBuffer);
|
||||
|
||||
// Walker section
|
||||
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
|
||||
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo);
|
||||
if (outWalkerPtr) {
|
||||
*outWalkerPtr = walkerPtr;
|
||||
}
|
||||
@@ -742,7 +747,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
|
||||
}
|
||||
}
|
||||
|
||||
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile, args.blockDispatchToCommandBuffer);
|
||||
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args, hwInfo);
|
||||
|
||||
if (!args.blockDispatchToCommandBuffer) {
|
||||
if (outWalkerPtr) {
|
||||
|
||||
@@ -426,7 +426,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
|
||||
|
||||
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
|
||||
void *walkerCommandAddress = cmdBufferAddress;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
|
||||
WalkerPartition::WalkerPartitionArgs args = {};
|
||||
args.partitionCount = 2;
|
||||
args.tileCount = 2;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
|
||||
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
|
||||
|
||||
ASSERT_NE(nullptr, walkerCommand);
|
||||
@@ -437,7 +440,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
|
||||
|
||||
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y);
|
||||
walkerCommandAddress = cmdBufferAddress;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
|
||||
args = {};
|
||||
args.partitionCount = 2;
|
||||
args.tileCount = 2;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
|
||||
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
|
||||
|
||||
ASSERT_NE(nullptr, walkerCommand);
|
||||
@@ -446,7 +452,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
|
||||
|
||||
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_Z);
|
||||
walkerCommandAddress = cmdBufferAddress;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, false, false);
|
||||
args = {};
|
||||
args.partitionCount = 2;
|
||||
args.tileCount = 2;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
|
||||
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
|
||||
|
||||
ASSERT_NE(nullptr, walkerCommand);
|
||||
@@ -456,7 +465,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
|
||||
// if we program with partition Count == 1 then do not trigger partition stuff
|
||||
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
|
||||
walkerCommandAddress = cmdBufferAddress;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 1u, 2, false, false);
|
||||
args = {};
|
||||
args.partitionCount = 1;
|
||||
args.tileCount = 2;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
|
||||
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
|
||||
|
||||
ASSERT_NE(nullptr, walkerCommand);
|
||||
@@ -1763,7 +1775,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
|
||||
bool forceExecutionOnSingleTile = false;
|
||||
walker.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_X);
|
||||
void *walkerCommandAddress = cmdBufferAddress;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false);
|
||||
WalkerPartition::WalkerPartitionArgs args = {};
|
||||
args.partitionCount = 2;
|
||||
args.tileCount = 2;
|
||||
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
|
||||
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
|
||||
|
||||
ASSERT_NE(nullptr, walkerCommand);
|
||||
@@ -1773,7 +1789,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
|
||||
|
||||
forceExecutionOnSingleTile = true;
|
||||
walkerCommandAddress = cmdBufferAddress;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, 2u, 2, forceExecutionOnSingleTile, false);
|
||||
args = {};
|
||||
args.partitionCount = 2;
|
||||
args.tileCount = 2;
|
||||
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
|
||||
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, testHardwareInfo);
|
||||
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
|
||||
|
||||
ASSERT_NE(nullptr, walkerCommand);
|
||||
|
||||
@@ -47,11 +47,14 @@ ImplicitScalingDispatchCommandArgs ImplicitScalingFixture::createDispatchCommand
|
||||
nullptr, // outWalkerPtr
|
||||
NEO::RequiredPartitionDim::none, // requiredPartitionDim
|
||||
partitionCount, // partitionCount
|
||||
1, // workgroupSize
|
||||
1, // maxWgCountPerTile
|
||||
true, // useSecondaryBatchBuffer
|
||||
false, // apiSelfCleanup
|
||||
dcFlushFlag, // dcFlush
|
||||
forceExecutionOnSingleTileFlag, // forceExecutionOnSingleTile
|
||||
false}; // blockDispatchToCommandBuffer
|
||||
false, // blockDispatchToCommandBuffer
|
||||
false}; // isRequiredWorkGroupOrder
|
||||
|
||||
return args;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user