mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-25 21:42:53 +08:00
fix: add function to calculate number of threads per tg
Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
987394b27c
commit
7cb3278eb3
@@ -317,10 +317,9 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
}
|
||||
}
|
||||
|
||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
|
||||
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
|
||||
|
||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
|
||||
threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
|
||||
if (!threadExecutionMask) {
|
||||
@@ -328,6 +327,12 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
}
|
||||
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
|
||||
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, kernelRequiresGenerationOfLocalIdsByRuntime);
|
||||
|
||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
|
||||
@@ -29,10 +29,10 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
bool localIdsGenerationByRuntime,
|
||||
bool inlineDataProgrammingRequired,
|
||||
uint32_t requiredWorkgroupOrder) {
|
||||
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
|
||||
auto localWorkSize = static_cast<uint32_t>(localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]);
|
||||
|
||||
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
|
||||
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
|
||||
walkerCmd->setThreadWidthCounterMaximum(threadsPerWorkGroup);
|
||||
|
||||
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
|
||||
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
|
||||
|
||||
@@ -240,9 +240,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
|
||||
device.getRootDeviceEnvironment());
|
||||
}
|
||||
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
|
||||
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, localIdsGenerationByRuntime);
|
||||
|
||||
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||
|
||||
|
||||
@@ -729,7 +729,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
|
||||
// only X is present
|
||||
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf);
|
||||
sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
|
||||
size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData;
|
||||
size_t perThreadTotalDataSize = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * sizePerThreadData;
|
||||
|
||||
uint32_t expectedIndirectDataLength = alignUp(static_cast<uint32_t>(perThreadTotalDataSize), COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());
|
||||
@@ -832,13 +832,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
|
||||
// only X is present
|
||||
uint32_t localIdSizePerThread = getPerThreadSizeLocalIDs(simd, sizeGrf);
|
||||
localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf);
|
||||
auto sizePerThreadData = getThreadsPerWG(simd, lws[0]) * localIdSizePerThread;
|
||||
auto sizePerThreadData = getThreadsPerWG(simd, static_cast<uint32_t>(lws[0])) * localIdSizePerThread;
|
||||
|
||||
auto crossThreadDataSize = kernel->mockKernel->getCrossThreadDataSize();
|
||||
crossThreadDataSize -= std::min(static_cast<uint32_t>(sizeof(INLINE_DATA)), crossThreadDataSize);
|
||||
|
||||
// second GRF in indirect
|
||||
uint32_t expectedIndirectDataLength = static_cast<uint32_t>(sizePerThreadData + crossThreadDataSize);
|
||||
uint32_t expectedIndirectDataLength = sizePerThreadData + crossThreadDataSize;
|
||||
expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());
|
||||
|
||||
|
||||
@@ -354,9 +354,9 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
|
||||
|
||||
walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
|
||||
|
||||
auto localWorkSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
|
||||
auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
|
||||
if (threadsPerThreadGroup == 0) {
|
||||
threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkSize));
|
||||
threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
|
||||
}
|
||||
walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);
|
||||
|
||||
|
||||
@@ -144,6 +144,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -42,7 +42,7 @@ LocalIDHelper::LocalIDHelper() {
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (useLayoutForImages) {
|
||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||
|
||||
@@ -168,6 +168,7 @@ class GfxCoreHelper {
|
||||
virtual bool isChipsetUniqueUUIDSupported() const = 0;
|
||||
virtual bool isTimestampShiftRequired() const = 0;
|
||||
virtual bool isRelaxedOrderingSupported() const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
|
||||
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
|
||||
virtual ~GfxCoreHelper() = default;
|
||||
|
||||
@@ -380,6 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
bool isChipsetUniqueUUIDSupported() const override;
|
||||
bool isTimestampShiftRequired() const override;
|
||||
bool isRelaxedOrderingSupported() const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;
|
||||
|
||||
~GfxCoreHelperHw() override = default;
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/helpers/local_id_gen.h"
|
||||
#include "shared/source/helpers/pipe_control_args.h"
|
||||
#include "shared/source/helpers/timestamp_packet.h"
|
||||
#include "shared/source/memory_manager/allocation_properties.h"
|
||||
@@ -685,4 +686,12 @@ template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
|
||||
return 128u;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
|
||||
if (DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get() != -1) {
|
||||
return DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
|
||||
}
|
||||
return getThreadsPerWG(simd, totalWorkItems);
|
||||
}
|
||||
} // namespace NEO
|
||||
|
||||
@@ -16,7 +16,7 @@ inline uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
|
||||
return (simd == 32 && grfSize == 32) ? 2 : 1;
|
||||
}
|
||||
|
||||
inline size_t getThreadsPerWG(uint32_t simd, size_t lws) {
|
||||
inline uint32_t getThreadsPerWG(uint32_t simd, uint32_t lws) {
|
||||
auto result = lws + simd - 1;
|
||||
|
||||
// Original logic:
|
||||
|
||||
@@ -20,7 +20,7 @@ struct PerThreadDataHelper {
|
||||
uint32_t grfSize,
|
||||
uint32_t numChannels,
|
||||
size_t localWorkSize) {
|
||||
return getThreadsPerWG(simd, localWorkSize) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
|
||||
return getThreadsPerWG(simd, static_cast<uint32_t>(localWorkSize)) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
|
||||
}
|
||||
}; // namespace PerThreadDataHelper
|
||||
} // namespace NEO
|
||||
|
||||
@@ -45,7 +45,7 @@ LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
// traditional function to generate local IDs
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2])));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (useLayoutForImages) {
|
||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -33,9 +33,9 @@ std::unique_lock<std::mutex> LocalIdsCache::lock() {
|
||||
}
|
||||
|
||||
size_t LocalIdsCache::getLocalIdsSizeForGroup(const Vec3<uint16_t> &group) const {
|
||||
const auto numElementsInGroup = Math::computeTotalElementsCount({group[0], group[1], group[2]});
|
||||
const auto numElementsInGroup = static_cast<uint32_t>(Math::computeTotalElementsCount({group[0], group[1], group[2]}));
|
||||
const auto numberOfThreads = getThreadsPerWG(simdSize, numElementsInGroup);
|
||||
return numberOfThreads * static_cast<size_t>(localIdsSizePerThread);
|
||||
return static_cast<size_t>(numberOfThreads * localIdsSizePerThread);
|
||||
}
|
||||
|
||||
size_t LocalIdsCache::getLocalIdsSizePerThread() const {
|
||||
|
||||
@@ -534,4 +534,5 @@ PrintGlobalTimestampInNs = 0
|
||||
EnableDeviceStateVerification = -1
|
||||
VfBarResourceAllocationWa = 1
|
||||
EnableDynamicPostSyncAllocLayout = -1
|
||||
ForceNumberOfThreadsInGpgpuThreadGroup = -1
|
||||
# Please don't edit below this line
|
||||
|
||||
@@ -1568,3 +1568,34 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT
|
||||
numGrfRequired = GrfConfig::DefaultGrfNumber;
|
||||
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize));
|
||||
}
|
||||
|
||||
// Table-driven check of calculateNumThreadsPerThreadGroup for SIMT sizes 16 and 32,
// always called with grfSize = 32 and isHwLocalGeneration = true.
HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

    // Each row: {SIMT size, total work items, expected number of threads per thread group}
    const std::array<std::array<uint32_t, 3>, 8> testCases = {{
        {32u, 32u, 1u},
        {32u, 64u, 2u},
        {32u, 128u, 4u},
        {32u, 1024u, 32u},
        {16u, 32u, 2u},
        {16u, 64u, 4u},
        {16u, 128u, 8u},
        {16u, 1024u, 64u},
    }};

    for (const auto &testCase : testCases) {
        const uint32_t simtSize = testCase[0];
        const uint32_t totalWorkItems = testCase[1];
        const uint32_t expectedThreads = testCase[2];

        const uint32_t actualThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWorkItems, 32u, true);
        EXPECT_EQ(expectedThreads, actualThreads);
    }
}
|
||||
|
||||
// Verifies that a value set through the ForceNumberOfThreadsInGpgpuThreadGroup debug flag is
// what calculateNumThreadsPerThreadGroup returns, for two different forced values.
HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
    DebugManagerStateRestore restorer;
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();

    // Forced values are applied in this order; the call arguments stay fixed across iterations.
    const uint32_t forcedThreadCounts[] = {10u, 20u};

    for (const uint32_t forcedThreadCount : forcedThreadCounts) {
        DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(forcedThreadCount);

        const uint32_t actualThreads = gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true);
        EXPECT_EQ(forcedThreadCount, actualThreads);
    }
}
|
||||
|
||||
@@ -40,7 +40,7 @@ HWTEST_F(LocalIdTests, GivenSimd32AndNon32GrfSizeWhenGettingGrfsPerThreadThenTwo
|
||||
}
|
||||
|
||||
TEST(LocalID, GivenSimd32AndLws33WhenGettingThreadsPerWorkgroupThenTwoIsReturned) {
|
||||
size_t lws = 33;
|
||||
uint32_t lws = 33;
|
||||
uint32_t simd = 32;
|
||||
EXPECT_EQ(2u, getThreadsPerWG(simd, lws));
|
||||
}
|
||||
@@ -313,7 +313,7 @@ HWTEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
|
||||
}
|
||||
|
||||
TEST_P(LocalIDFixture, WhenThreadsPerWgAreGeneratedThenSizeCalculationAreCorrect) {
|
||||
auto workItems = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
||||
auto workItems = static_cast<uint32_t>(localWorkSizeX * localWorkSizeY * localWorkSizeZ);
|
||||
auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd, grfSize);
|
||||
|
||||
// Should be multiple of GRFs
|
||||
|
||||
Reference in New Issue
Block a user