fix: add function to calculate number of threads per tg

Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
This commit is contained in:
Cencelewska, Katarzyna
2023-06-12 11:41:13 +00:00
committed by Compute-Runtime-Automation
parent 987394b27c
commit 7cb3278eb3
16 changed files with 71 additions and 21 deletions

View File

@@ -317,10 +317,9 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
}
}
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
if (!threadExecutionMask) {
@@ -328,6 +327,12 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
}
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, kernelRequiresGenerationOfLocalIdsByRuntime);
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =

View File

@@ -29,10 +29,10 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
bool localIdsGenerationByRuntime,
bool inlineDataProgrammingRequired,
uint32_t requiredWorkgroupOrder) {
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
auto localWorkSize = static_cast<uint32_t>(localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]);
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
walkerCmd->setThreadWidthCounterMaximum(threadsPerWorkGroup);
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));

View File

@@ -240,9 +240,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
device.getRootDeviceEnvironment());
}
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, localIdsGenerationByRuntime);
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

View File

@@ -729,7 +729,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
// only X is present
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf);
sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData;
size_t perThreadTotalDataSize = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * sizePerThreadData;
uint32_t expectedIndirectDataLength = alignUp(static_cast<uint32_t>(perThreadTotalDataSize), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());
@@ -832,13 +832,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
// only X is present
uint32_t localIdSizePerThread = getPerThreadSizeLocalIDs(simd, sizeGrf);
localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf);
auto sizePerThreadData = getThreadsPerWG(simd, lws[0]) * localIdSizePerThread;
auto sizePerThreadData = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * localIdSizePerThread;
auto crossThreadDataSize = kernel->mockKernel->getCrossThreadDataSize();
crossThreadDataSize -= std::min(static_cast<uint32_t>(sizeof(INLINE_DATA)), crossThreadDataSize);
// second GRF in indirect
uint32_t expectedIndirectDataLength = static_cast<uint32_t>(sizePerThreadData + crossThreadDataSize);
uint32_t expectedIndirectDataLength = sizePerThreadData + crossThreadDataSize;
expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());

View File

@@ -354,9 +354,9 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
auto localWorkSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
if (threadsPerThreadGroup == 0) {
threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkSize));
threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
}
walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);

View File

@@ -144,6 +144,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -42,7 +42,7 @@ LocalIDHelper::LocalIDHelper() {
LocalIDHelper LocalIDHelper::initializer;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);

View File

@@ -168,6 +168,7 @@ class GfxCoreHelper {
virtual bool isChipsetUniqueUUIDSupported() const = 0;
virtual bool isTimestampShiftRequired() const = 0;
virtual bool isRelaxedOrderingSupported() const = 0;
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
virtual ~GfxCoreHelper() = default;
@@ -380,6 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
bool isChipsetUniqueUUIDSupported() const override;
bool isTimestampShiftRequired() const override;
bool isRelaxedOrderingSupported() const override;
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;
~GfxCoreHelperHw() override = default;

View File

@@ -16,6 +16,7 @@
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/allocation_properties.h"
@@ -685,4 +686,12 @@ template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
return 128u;
}
// Returns the number of threads a single thread group is programmed with.
//
// If the ForceNumberOfThreadsInGpgpuThreadGroup debug flag is set to anything
// other than -1, that value is returned (converted to uint32_t). Otherwise the
// result is getThreadsPerWG(simd, totalWorkItems).
//
// grfSize and isHwLocalGeneration are not read by this implementation.
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
    const auto forcedThreadCount = DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
    if (forcedThreadCount == -1) {
        // No override requested: derive the count from the SIMD width and work-item total.
        return getThreadsPerWG(simd, totalWorkItems);
    }
    return static_cast<uint32_t>(forcedThreadCount);
}
} // namespace NEO

View File

@@ -16,7 +16,7 @@ inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
return (simd == 32 && grfSize == 32) ? 2 : 1;
}
inline size_t getThreadsPerWG(uint32_t simd, size_t lws) {
inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
auto result = lws + simd - 1;
// Original logic:

View File

@@ -20,7 +20,7 @@ struct PerThreadDataHelper {
uint32_t grfSize,
uint32_t numChannels,
size_t localWorkSize) {
return getThreadsPerWG(simd, localWorkSize) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
return getThreadsPerWG(simd, static_cast<uint32_t>(localWorkSize)) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
}
}; // namespace PerThreadDataHelper
} // namespace NEO

View File

@@ -45,7 +45,7 @@ LocalIDHelper LocalIDHelper::initializer;
// traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -33,9 +33,9 @@ std::unique_lock<std::mutex> LocalIdsCache::lock() {
}
size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group) const {
const auto numElementsInGroup = Math::computeTotalElementsCount({group[0], group[1], group[2]});
const auto numElementsInGroup = static_cast<uint32_t>(Math::computeTotalElementsCount({group[0], group[1], group[2]}));
const auto numberOfThreads = getThreadsPerWG(simdSize, numElementsInGroup);
return numberOfThreads * static_cast<size_t>(localIdsSizePerThread);
return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
}
size_t LocalIdsCache::getLocalIdsSizePerThread() const {

View File

@@ -534,4 +534,5 @@ PrintGlobalTimestampInNs = 0
EnableDeviceStateVerification = -1
VfBarResourceAllocationWa = 1
EnableDynamicPostSyncAllocLayout = -1
ForceNumberOfThreadsInGpgpuThreadGroup = -1
# Please don't edit below this line

View File

@@ -1568,3 +1568,34 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT
numGrfRequired = GrfConfig::DefaultGrfNumber;
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize));
}
// Checks calculateNumThreadsPerThreadGroup against a table of SIMT sizes and
// work-item totals, always passing grfSize = 32 and isHwLocalGeneration = true.
HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
    auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

    // Row layout: {SIMT size, total work items, expected threads per thread group}
    std::array<std::array<uint32_t, 3>, 8> values = {{
        {32u, 32u, 1u},
        {32u, 64u, 2u},
        {32u, 128u, 4u},
        {32u, 1024u, 32u},
        {16u, 32u, 2u},
        {16u, 64u, 4u},
        {16u, 128u, 8u},
        {16u, 1024u, 64u},
    }};

    for (auto &row : values) {
        auto &simtSize = row[0];
        auto &totalWgSize = row[1];
        auto &expectedNumThreadsPerThreadGroup = row[2];
        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true));
    }
}
// Checks that, once ForceNumberOfThreadsInGpgpuThreadGroup is set, the helper
// returns the forced value for inputs (simd 32, 100 work items).
HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
    // NOTE(review): presumably restores the debug flags when it goes out of scope - confirm.
    DebugManagerStateRestore dbgRestore;
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

    // Two different overrides, applied one after another.
    const uint32_t forcedThreadCounts[] = {10u, 20u};
    for (const uint32_t expectedNumThreadsPerThreadGroup : forcedThreadCounts) {
        DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
        EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
    }
}

View File

@@ -40,7 +40,7 @@ HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwo
}
TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
size_t lws = 33;
uint32_t lws = 33;
uint32_t simd = 32;
EXPECT_EQ(2u, getThreadsPerWG(simd, lws));
}
@@ -313,7 +313,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
}
TEST_P(LocalIDFixture, WhenThreadsPerWgAreGeneratedThenSizeCalculationAreCorrect) {
auto workItems = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
auto workItems = static_cast<uint32_t>(localWorkSizeX * localWorkSizeY * localWorkSizeZ);
auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd, grfSize);
// Should be multiple of GRFs