/*
 * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/helpers/local_work_size.h"
|
|
|
|
#include "shared/source/debug_settings/debug_settings_manager.h"
|
|
#include "shared/source/helpers/array_count.h"
|
|
#include "shared/source/helpers/basic_math.h"
|
|
#include "shared/source/helpers/debug_helpers.h"
|
|
#include "shared/source/program/kernel_info.h"
|
|
#include "shared/source/program/work_size_info.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdint>
|
|
|
|
namespace NEO {

// Threshold used to determine what kind of device is underneath:
// big cores like SKL have 8 EUs * 7 HW threads per subslice and are considered highThreadCount devices.
constexpr uint32_t highThreadCountThreshold = 56u;

constexpr uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1};

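// Primes below 2^8, listed in descending order. factor<>() walks this table to grow a
// candidate work-group size by prime factors of the work-item count.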
static const uint32_t primeNumbers[] = {
    251,
    241,
    239, 233,
    229, 227, 223,
    211,
    199, 197, 193, 191,
    181,
    179, 173,
    167, 163,
    157, 151,
    149,
    139, 137, 131,
    127,
    113,
    109, 107, 103, 101,
    97,
    89, 83,
    79, 73, 71,
    67, 61,
    59, 53,
    47, 43, 41,
    37, 31,
    29, 23,
    19, 17, 13, 11,
    7, 5, 3, 2};

static const size_t maxPrimes = sizeof(primeNumbers) / sizeof(primeNumbers[0]);

// Recursive template function to test prime factors
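// Starting from primeNumbers[primeIndex] and recursing down to index 0, each step multiplies
// workSize by the current prime for as long as the product still divides workItems and does
// not exceed maxWorkGroupSize; the resulting workSize is returned.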
template <uint32_t primeIndex>
static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
    auto primeNumber = primeNumbers[primeIndex];

    auto newWorkSize = workSize * primeNumber;
    if (newWorkSize <= workItems) {
        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
            workSize = newWorkSize;
            newWorkSize = workSize * primeNumber;
        }

        workSize = factor<primeIndex - 1>(workItems, workSize, maxWorkGroupSize);
    }

    return workSize;
}

// Terminator of recursive factoring logic
template <>
inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
    uint32_t primeIndex = 0;
    auto primeNumber = primeNumbers[primeIndex];

    auto newWorkSize = workSize * primeNumber;
    if (newWorkSize <= workItems) {
        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
            workSize = newWorkSize;
            newWorkSize = workSize * primeNumber;
        }
    }

    return workSize;
}

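// Builds a power-of-two local work size sized around optimalHardwareThreadCountGeneric[] times
// the SIMD width: X is halved until it fits workItems[0] and the remainder is assigned to Y
// (and Z for 3D). With canUseNx4 the Y dimension is fixed at 4.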
void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) {
    uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0;
    auto simdSize = workGroupInfo.simdSize;

    while (optimalHardwareThreadCountGeneric[targetIndex] > 1 &&
           workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) {
        targetIndex++;
    }

    uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex];

    if (workDim == 2) {
        uint32_t xDim, yDim;
        xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1);
        while (xDim > workItems[0])
            xDim = xDim >> 1;
        yDim = canUseNx4 ? 4 : (uint32_t(optimalLocalThreads * simdSize) / xDim);
        workGroupSize[0] = xDim;
        workGroupSize[1] = yDim;
    } else {
        uint32_t xDim, yDim, zDim;
        xDim = uint32_t(optimalLocalThreads * simdSize);
        while (xDim > workItems[0])
            xDim = xDim >> 1;
        yDim = uint32_t(optimalLocalThreads * simdSize) / xDim;
        while (yDim > workItems[1])
            yDim = yDim >> 1;
        UNRECOVERABLE_IF((xDim * yDim) == 0);
        zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim);
        workGroupSize[0] = xDim;
        workGroupSize[1] = yDim;
        workGroupSize[2] = zDim;
    }
}

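// Scans X factors in descending and Y factors in ascending order, skipping candidates outside
// [minWorkGroupSize, maxWorkGroupSize]. With useStrictRatio the candidate whose log(x) - log(y)
// is closest to targetRatio wins; otherwise the candidate producing the fewest work groups
// wins, with the ratio difference as a tie-breaker.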
void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) {
    float localRatio = std::numeric_limits<float>::max();
    uint64_t localNumWorkgroups = std::numeric_limits<uint64_t>::max();
    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xyzFactorsLen[0]; ++xFactorsIdx) {
        for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) {

            uint32_t xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - xFactorsIdx];
            uint32_t ydim = xyzFactors[1][yFactorsIdx];

            if (enforceDescendingOrder && ydim > xdim) {
                break;
            }

            if ((xdim * ydim) > wsInfo.maxWorkGroupSize) {
                break;
            }
            if ((xdim * ydim) < wsInfo.minWorkGroupSize) {
                continue;
            }

            uint64_t numWorkGroups = Math::divideAndRoundUp(workItems[0], xdim);
            numWorkGroups *= Math::divideAndRoundUp(workItems[1], ydim);

            float ratioDiff = log(static_cast<float>(xdim)) - log(static_cast<float>(ydim));
            ratioDiff = fabs(wsInfo.targetRatio - ratioDiff);

            bool setWorkGroupSize = wsInfo.useStrictRatio
                                        ? (ratioDiff < localRatio)
                                        : (numWorkGroups < localNumWorkgroups) || ((numWorkGroups == localNumWorkgroups) && (ratioDiff < localRatio));
            if (setWorkGroupSize) {
                workGroupSize[0] = xdim;
                workGroupSize[1] = ydim;
                localRatio = ratioDiff;
                localNumWorkgroups = numWorkGroups;
            }
        }
    }
}

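// Scans X, Y, and Z factor combinations in descending order within
// [minWorkGroupSize, maxWorkGroupSize] and keeps the combination that dispatches the fewest
// EU threads overall (threads per work group times number of work groups).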
void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) {
    uint64_t localEuThrdsDispatched = std::numeric_limits<uint64_t>::max();

    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xyzFactorsLen[0]; ++xFactorsIdx) {
        for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) {
            for (uint32_t zFactorsIdx = 0; zFactorsIdx < xyzFactorsLen[2]; ++zFactorsIdx) {

                uint32_t xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - xFactorsIdx];
                uint32_t ydim = xyzFactors[1][xyzFactorsLen[1] - 1 - yFactorsIdx];
                uint32_t zdim = xyzFactors[2][xyzFactorsLen[2] - 1 - zFactorsIdx];

                if (enforceDescendingOrder) {
                    if (ydim > xdim) {
                        break;
                    } else if (zdim > ydim) {
                        continue;
                    }
                }

                uint32_t numItemsInWorkGroup = xdim * ydim * zdim;
                if (numItemsInWorkGroup > wsInfo.maxWorkGroupSize) {
                    continue;
                }
                if (numItemsInWorkGroup < wsInfo.minWorkGroupSize) {
                    break;
                }

                uint64_t numWorkGroups = Math::divideAndRoundUp(workItems[0], xdim);
                numWorkGroups *= Math::divideAndRoundUp(workItems[1], ydim);
                numWorkGroups *= Math::divideAndRoundUp(workItems[2], zdim);
                uint64_t numThreadsPerWorkGroup = Math::divideAndRoundUp(numItemsInWorkGroup, wsInfo.simdSize);
                uint64_t euThrdsDispatched = numThreadsPerWorkGroup * numWorkGroups;
                if (euThrdsDispatched < localEuThrdsDispatched) {
                    localEuThrdsDispatched = euThrdsDispatched;
                    workGroupSize[0] = xdim;
                    workGroupSize[1] = ydim;
                    workGroupSize[2] = zdim;
                }
            }
        }
    }
}

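// Chooses a 1D local work size: the largest power of two dividing workItems[0] (clamped to
// maxWorkGroupSize) seeds the result, then factor<>() grows it by the remaining prime factors.
// Worked example: for workItems[0] == 192 and maxWorkGroupSize == 256, the power-of-two pass
// yields 64 and the prime pass multiplies by 3, giving a work-group size of {192, 1, 1}.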
void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
    auto items = workItems[0];

    // Determine the lowest set bit to quickly handle factors of 2
    auto numBits = Math::getMinLsbSet(static_cast<uint32_t>(items));

    // Clamp the power-of-two result to maxWorkGroupSize
    uint32_t workSize = 1u << numBits;

    // Assumes maxWorkGroupSize is a power of two.
    DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0);
    workSize = std::min(workSize, maxWorkGroupSize);

    // Try all primes as potential factors
    workSize = factor<maxPrimes - 1>(items, workSize, maxWorkGroupSize);

    workGroupSize[0] = workSize;
    workGroupSize[1] = 1;
    workGroupSize[2] = 1;
}

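// Dispatches between the ratio-based and the ratio-free selection above. If the strict-ratio
// pass produces a work group no larger than half the SIMD width, it retries with the strict
// ratio requirement relaxed.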
void choosePreferredWorkgroupSize(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) {
    // Check whether the algorithm should use the ratio-based heuristic
    wsInfo.checkRatio(workItems);

    if (wsInfo.useRatio) {
        choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder);
        if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) {
            wsInfo.useStrictRatio = false;
            choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder);
        }
    } else {
        choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder);
    }
}

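// Builds per-dimension divisor tables (every j in [2, maxWorkGroupSize) that divides the
// work-item count, plus the trivial divisor 1) and runs the selection above, first with
// descending-order enforcement and, if the result stays at or below half the SIMD width,
// once more without it.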
void choosePrefferedWorkgroupSize(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
    // Find all divisors for all dimensions
    uint32_t xyzFactors[3][1024];
    uint32_t xyzFactorsLen[3] = {};
    for (int i = 0; i < 3; i++)
        xyzFactors[i][xyzFactorsLen[i]++] = 1;
    for (auto i = 0u; i < workDim; i++) {
        for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) {
            if ((workItems[i] % j) == 0) {
                xyzFactors[i][xyzFactorsLen[i]++] = j;
            }
        }
    }

    choosePreferredWorkgroupSize(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, true);
    size_t wgs = workGroupSize[0] * workGroupSize[1] * workGroupSize[2];
    if (wgs * 2 <= wsInfo.simdSize) {
        choosePreferredWorkgroupSize(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, false);
    }
}

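// Enumerates divisor pairs of workItems[0] and workItems[1] up to maxWorkGroupSize and keeps
// the pair that dispatches the fewest EU threads, using wasted SIMD channels as a tie-breaker.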
void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
    uint32_t xFactors[1024];
    uint32_t yFactors[1024];
    uint32_t xFactorsLen = 0;
    uint32_t yFactorsLen = 0;
    uint64_t waste;
    uint64_t localWSWaste = 0xffffffffffffffff;
    uint64_t euThrdsDispatched;
    uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
    uint64_t workGroups;
    uint32_t xDim;
    uint32_t yDim;

    for (int i = 0; i < 3; i++)
        workGroupSize[i] = 1;

    for (uint32_t i = 2; i <= maxWorkGroupSize; i++) {
        if ((workItems[0] % i) == 0) {
            xFactors[xFactorsLen++] = i;
        }
        if ((workItems[1] % i) == 0) {
            yFactors[yFactorsLen++] = i;
        }
    }

    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) {
        for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) {
            // Pick a local work size that evenly divides the work items and stays in range:
            // 1 <= workGroupSize[0] <= workItems[0]
            // 1 <= workGroupSize[1] <= workItems[1]
            xDim = xFactors[xFactorsLen - 1 - xFactorsIdx];
            yDim = yFactors[yFactorsIdx];

            if ((xDim * yDim) > maxWorkGroupSize) {
                // The yDim value is too big, so break out of this loop.
                // No other entries will work.
                break;
            }

            // Find the wasted channels.
            workGroups = Math::divideAndRoundUp(workItems[0], xDim);
            workGroups *= Math::divideAndRoundUp(workItems[1], yDim);

            // Compaction mode: count the EU threads this choice would dispatch.
            euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize);
            euThrdsDispatched *= workGroups;

            waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1));
            waste *= workGroups;

            if (((euThrdsDispatched < localEuThrdsDispatched) ||
                 ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) {
                localWSWaste = waste;
                localEuThrdsDispatched = euThrdsDispatched;
                workGroupSize[0] = xDim;
                workGroupSize[1] = yDim;
            }
        }
    }
}

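// Prefers roughly square power-of-two work groups for 2D dispatches: the largest power-of-two
// divisor of each dimension is taken and the bigger side is halved until the product fits
// maxWorkGroupSize; otherwise it falls back to computeWorkgroupSize2D, or uses the whole
// dispatch when it already fits in a single work group.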
void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
    for (int i = 0; i < 3; i++)
        workGroupSize[i] = 1;
    size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
    for (auto i = 0u; i < workDim; i++) {
        uint32_t requiredWorkItemsCount = maxWorkGroupSize;
        while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
            requiredWorkItemsCount >>= 1;
        itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
    }
    if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
        while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
            if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
                itemsPowerOfTwoDivisors[0] >>= 1;
            else
                itemsPowerOfTwoDivisors[1] >>= 1;
        }
        for (auto i = 0u; i < 3; i++)
            workGroupSize[i] = itemsPowerOfTwoDivisors[i];
        return;
    } else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
        computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
        return;
    } else {
        for (auto i = 0u; i < workDim; i++)
            workGroupSize[i] = workItems[i];
        return;
    }
}

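// Top-level N-dimensional heuristic. Without SLM usage or barriers it first tries power-of-two
// divisors (optionally the squared strategy when EnableComputeWorkSizeSquared is set); small
// dispatches that fit a single work group are taken whole; 1D falls back to
// computeWorkgroupSize1D and everything else to the divisor-table selection in
// choosePrefferedWorkgroupSize.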
void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
    for (int i = 0; i < 3; i++)
        workGroupSize[i] = 1;

    UNRECOVERABLE_IF(wsInfo.simdSize == 0);
    uint64_t totalNumberOfItems = workItems[0] * workItems[1] * workItems[2];
    auto optimalWgThreadCount = optimalHardwareThreadCountGeneric[0];
    bool totalRequiredThreadGroupsMoreThanSingleThreadGroup = totalNumberOfItems > wsInfo.simdSize * optimalWgThreadCount;

    // Find the biggest power of two which divides each dimension size
    if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
        if (debugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
            return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
        }

        if (wsInfo.preferredWgCountPerSubSlice != 0 && wsInfo.simdSize == 32 && totalRequiredThreadGroupsMoreThanSingleThreadGroup) {
            optimalWgThreadCount = std::min(optimalWgThreadCount, wsInfo.numThreadsPerSubSlice / wsInfo.preferredWgCountPerSubSlice);
            wsInfo.maxWorkGroupSize = wsInfo.simdSize * optimalWgThreadCount;
        }

        size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
        for (auto i = 0u; i < workDim; i++) {
            uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalWgThreadCount);
            while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
                requiredWorkItemsCount >>= 1;
            itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
        }

        bool canUseNx4 = (wsInfo.imgUsed &&
                          (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) &&
                          itemsPowerOfTwoDivisors[1] >= 4);

        // If the computed power-of-two dimension sizes would create a group that is bigger than
        // maxWorkGroupSize, or would use more than the optimal number of hardware threads, downsize it
        uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
        if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalWgThreadCount)) {
            return computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
        }
        // If the computed work group already has a suitable size at this point, use it
        else if (allItems >= wsInfo.simdSize) {
            itemsPowerOfTwoDivisors[1] = canUseNx4 ? 4 : itemsPowerOfTwoDivisors[1];
            for (auto i = 0u; i < workDim; i++)
                workGroupSize[i] = itemsPowerOfTwoDivisors[i];
            return;
        }
    }

    // If dimensions are not powers of two but the total number of items fits the max work group size
    if (totalNumberOfItems <= wsInfo.maxWorkGroupSize) {
        for (auto i = 0u; i < workDim; i++)
            workGroupSize[i] = workItems[i];
        return;
    }

    if (workDim == 1) {
        return computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize);
    }

    choosePrefferedWorkgroupSize(wsInfo, workGroupSize, workItems, workDim);
}

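// Number of work groups per dimension is the ceiling of gws / lws.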
Vec3<size_t> computeWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
    return (Vec3<size_t>(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0),
                         gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0),
                         gws.z / lws.z + ((gws.z % lws.z) ? 1 : 0)));
}

Vec3<size_t> generateWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
    return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3<size_t>(0, 0, 0);
}

Vec3<size_t> canonizeWorkgroup(const Vec3<size_t> &workgroup) {
    return ((workgroup.x > 0) ? Vec3<size_t>({workgroup.x, std::max(workgroup.y, static_cast<size_t>(1)), std::max(workgroup.z, static_cast<size_t>(1))})
                              : Vec3<size_t>(0, 0, 0));
}

} // namespace NEO