refactor: add thread group count parameter to implicit scaling functions

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-01-17 15:03:50 +00:00
committed by Compute-Runtime-Automation
parent b7d7424aab
commit 897c890d03
10 changed files with 39 additions and 13 deletions

View File

@ -427,6 +427,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.requiredPartitionDim, // requiredPartitionDim
args.partitionCount, // partitionCount
workgroupSize, // workgroupSize
threadGroupCount, // threadGroupCount
args.maxWgCountPerTile, // maxWgCountPerTile
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -46,6 +46,7 @@ struct ImplicitScalingDispatchCommandArgs {
RequiredPartitionDim requiredPartitionDim = RequiredPartitionDim::none;
uint32_t partitionCount = 0;
uint32_t workgroupSize = 0;
uint32_t threadGroupCount = 0;
uint32_t maxWgCountPerTile = 0;
bool useSecondaryBatchBuffer = false;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -50,6 +50,7 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingD
args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer;
args.workgroupSize = dispatchCommandArgs.workgroupSize;
args.threadGroupCount = dispatchCommandArgs.threadGroupCount;
args.maxWgCountPerTile = dispatchCommandArgs.maxWgCountPerTile;
args.isRequiredDispatchWorkGroupOrder = dispatchCommandArgs.isRequiredDispatchWorkGroupOrder;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -18,6 +18,7 @@ struct WalkerPartitionArgs {
uint32_t partitionCount = 0;
uint32_t tileCount = 0;
uint32_t workgroupSize = 0;
uint32_t threadGroupCount = 0;
uint32_t maxWgCountPerTile = 0;
bool emitBatchBufferEnd = false;
bool secondaryBatchBuffer = false;

View File

@ -526,15 +526,11 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, args.partitionCount));
}
uint32_t threadGroupCount = inputWalker->getThreadGroupIdXDimension() *
inputWalker->getThreadGroupIdYDimension() *
inputWalker->getThreadGroupIdZDimension();
NEO::EncodeDispatchKernel<GfxFamily>::setWalkerRegionSettings(*inputWalker,
device,
args.partitionCount,
args.workgroupSize,
threadGroupCount,
args.threadGroupCount,
args.maxWgCountPerTile,
args.isRequiredDispatchWorkGroupOrder);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -846,11 +846,15 @@ HWTEST2_F(CommandEncoderTests, whenAskingForImplicitScalingValuesThenAlwaysRetur
&ptr, // outWalkerPtr
RequiredPartitionDim::x, // requiredPartitionDim
partitionCount, // partitionCount
1, // workgroupSize
1, // threadGroupCount
1, // maxWgCountPerTile
false, // useSecondaryBatchBuffer
false, // apiSelfCleanup
false, // dcFlush
false, // forceExecutionOnSingleTile
false}; // blockDispatchToCommandBuffer
false, // blockDispatchToCommandBuffer
false}; // isRequiredDispatchWorkGroupOrder
ImplicitScalingDispatch<FamilyType>::dispatchCommands(linearStream, walkerCmd, deviceBitField, args);
EXPECT_EQ(0u, linearStream.getUsed());

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -37,6 +37,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCm
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(0, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -175,6 +176,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -228,6 +230,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenPa
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -408,6 +411,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -458,6 +462,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -528,6 +533,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -589,6 +595,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.apiSelfCleanup = true;
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -649,6 +656,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
@ -717,6 +725,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
@ -781,6 +790,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -846,6 +856,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -914,6 +925,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
dispatchArgs.apiSelfCleanup = true;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
@ -982,6 +994,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, dispatchArgs);
totalBytesProgrammed = commandStream.getUsed();
@ -1601,6 +1614,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
uint32_t partitionCount = 0;
auto dispatchArgs = createDispatchCommandArgs(workPartitionAllocationAddress, partitionCount);
dispatchArgs.threadGroupCount = 32;
dispatchArgs.blockDispatchToCommandBuffer = true;
dispatchArgs.outWalkerPtr = &outWalkerPtr;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -434,6 +434,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.threadGroupCount = 7 * 10 * 11;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, *device);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
@ -448,6 +449,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.threadGroupCount = 7 * 10 * 11;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, *device);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
@ -460,6 +462,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.threadGroupCount = 7 * 10 * 11;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, *device);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
@ -473,6 +476,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenProgramComputeWalkerWhen
args = {};
args.partitionCount = 1;
args.tileCount = 2;
args.threadGroupCount = 7 * 10 * 11;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, *device);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
@ -1793,6 +1797,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
WalkerPartition::WalkerPartitionArgs args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.threadGroupCount = 32;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, *device);
auto walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);
@ -1807,6 +1812,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenForceExecutionOnSingleTi
args = {};
args.partitionCount = 2;
args.tileCount = 2;
args.threadGroupCount = 32;
args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;
programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, args, *device);
walkerCommand = genCmdCast<WalkerType *>(walkerCommandAddress);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -48,6 +48,7 @@ ImplicitScalingDispatchCommandArgs ImplicitScalingFixture::createDispatchCommand
NEO::RequiredPartitionDim::none, // requiredPartitionDim
partitionCount, // partitionCount
1, // workgroupSize
1, // threadGroupCount
1, // maxWgCountPerTile
true, // useSecondaryBatchBuffer
false, // apiSelfCleanup