/*
 * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/helpers/local_work_size.h"

#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/program/kernel_info.h"
#include "shared/source/program/work_size_info.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

namespace NEO {

// threshold used to determine what kind of device is underneath
// big cores like SKL have 8EU * 7 HW threads per subslice and are considered as highThreadCount devices
constexpr uint32_t highThreadCountThreshold = 56u;

// Candidate hardware-thread counts per work group, ordered from largest to smallest.
static const uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1};

// Prime numbers below 256 in descending order; index 0 holds the largest prime, the last index holds 2.
static const uint32_t primeNumbers[] = {
    251, 241, 239, 233, 229, 227, 223, 211, 199, 197, 193, 191, 181, 179, 173, 167, 163, 157,
    151, 149, 139, 137, 131, 127, 113, 109, 107, 103, 101, 97, 89, 83, 79, 73, 71, 67,
    61, 59, 53, 47, 43, 41, 37, 31, 29, 23, 19, 17, 13, 11, 7, 5, 3, 2};

static const size_t MAX_PRIMES = sizeof(primeNumbers) / sizeof(primeNumbers[0]);

// Recursive template function to test prime factors.
// Starting from primeNumbers[primeIndex], multiplies workSize by that prime for as long as the product
// stays within maxWorkGroupSize and still divides workItems, then recurses to primeIndex - 1.
// The recursion is only entered when workSize * prime does not exceed workItems.
template <uint32_t primeIndex>
static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
    auto primeNumber = primeNumbers[primeIndex];

    auto newWorkSize = workSize * primeNumber;
    if (newWorkSize <= workItems) {
        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
            workSize = newWorkSize;
            newWorkSize = workSize * primeNumber;
        }
        workSize = factor<primeIndex - 1>(workItems, workSize, maxWorkGroupSize);
    }

    return workSize;
}

// Terminator of recursive factoring logic: same test for primeNumbers[0], without further recursion.
template <>
inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
    uint32_t primeIndex = 0;
    auto primeNumber = primeNumbers[primeIndex];

    auto newWorkSize = workSize * primeNumber;
    if (newWorkSize <= workItems) {
        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
            workSize = newWorkSize;
            newWorkSize = workSize * primeNumber;
        }
    }

    return workSize;
}

// Picks a power-of-two local work size of (thread count * simdSize) items and spreads it over the dimensions.
//  workItems      - per-dimension upper bounds for the local size (the caller in this file passes power-of-two divisors)
//  workGroupInfo  - supplies numThreadsPerSubSlice, simdSize and maxWorkGroupSize
//  workGroupSize  - output; [0] and [1] are written for workDim == 2, all three entries otherwise
//  canUseNx4      - when true, the 2D variant forces the Y dimension to 4
void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) {
    // Start from 8 threads (index 2) for Nx4 or low-thread-count devices, otherwise from 32 threads (index 0).
    uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0;
    auto arraySize = arrayCount(optimalHardwareThreadCountGeneric);
    auto simdSize = workGroupInfo.simdSize;

    // Step down to smaller thread counts until threads * simdSize fits in maxWorkGroupSize.
    // The "> 1" test stops on the last table entry, so targetIndex stays a valid index.
    while (targetIndex < arraySize &&
           optimalHardwareThreadCountGeneric[targetIndex] > 1 &&
           workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) {
        targetIndex++;
    }
    uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex];

    if (workDim == 2) {
        uint32_t xDim, yDim;
        xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1);
        while (xDim > workItems[0])
            xDim = xDim >> 1;
        yDim = canUseNx4 ? 4 : (uint32_t(optimalLocalThreads * simdSize) / xDim);
        workGroupSize[0] = xDim;
        workGroupSize[1] = yDim;
    } else {
        uint32_t xDim, yDim, zDim;
        xDim = uint32_t(optimalLocalThreads * simdSize);
        while (xDim > workItems[0])
            xDim = xDim >> 1;
        yDim = uint32_t(optimalLocalThreads * simdSize) / xDim;
        while (yDim > workItems[1])
            yDim = yDim >> 1;
        UNRECOVERABLE_IF((xDim * yDim) == 0);
        zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim);
        workGroupSize[0] = xDim;
        workGroupSize[1] = yDim;
        workGroupSize[2] = zDim;
    }
}

// Searches X/Y factor pairs and keeps the one preferred by the ratio heuristic.
// X factors are visited from the end of their table towards the front, Y factors from the front.
// With wsInfo.useStrictRatio the pair whose log(x) - log(y) is closest to wsInfo.targetRatio wins;
// otherwise the pair with the fewest work groups wins and the ratio distance breaks ties.
// workGroupSize[0] and [1] are only written when a candidate is accepted.
void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) {
    float localRatio = std::numeric_limits<float>::max();
    uint64_t localNumWorkgroups = std::numeric_limits<uint64_t>::max();

    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xyzFactorsLen[0]; ++xFactorsIdx) {
        for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) {

            uint32_t xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - xFactorsIdx];
            uint32_t ydim = xyzFactors[1][yFactorsIdx];

            if (enforceDescendingOrder && ydim > xdim) {
                break;
            }
            if ((xdim * ydim) > wsInfo.maxWorkGroupSize) {
                break;
            }
            if ((xdim * ydim) < wsInfo.minWorkGroupSize) {
                continue;
            }

            uint64_t numWorkGroups = Math::divideAndRoundUp(workItems[0], xdim);
            numWorkGroups *= Math::divideAndRoundUp(workItems[1], ydim);

            float ratioDiff = log(static_cast<float>(xdim)) - log(static_cast<float>(ydim));
            ratioDiff = fabs(wsInfo.targetRatio - ratioDiff);

            // Note: the conditional operator binds weaker than ||, so the whole "fewest groups, then ratio"
            // expression is the non-strict branch.
            bool setWorkGroupSize = wsInfo.useStrictRatio
                                        ? (ratioDiff < localRatio)
                                        : (numWorkGroups < localNumWorkgroups) || ((numWorkGroups == localNumWorkgroups) && (ratioDiff < localRatio));

            if (setWorkGroupSize) {
                workGroupSize[0] = xdim;
                workGroupSize[1] = ydim;
                localRatio = ratioDiff;
                localNumWorkgroups = numWorkGroups;
            }
        }
    }
}

// Searches X/Y/Z factor triples (each table visited from its end towards the front) and keeps the triple
// that dispatches the fewest EU threads, computed as ceil(items per group / simdSize) * number of groups.
// Triples larger than wsInfo.maxWorkGroupSize are skipped; the Z loop stops at the first triple smaller than
// wsInfo.minWorkGroupSize. workGroupSize is only written when a candidate is accepted.
void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) {
    uint64_t localEuThrdsDispatched = std::numeric_limits<uint64_t>::max();

    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xyzFactorsLen[0]; ++xFactorsIdx) {
        for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) {
            for (uint32_t zFactorsIdx = 0; zFactorsIdx < xyzFactorsLen[2]; ++zFactorsIdx) {

                uint32_t xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - xFactorsIdx];
                uint32_t ydim = xyzFactors[1][xyzFactorsLen[1] - 1 - yFactorsIdx];
                uint32_t zdim = xyzFactors[2][xyzFactorsLen[2] - 1 - zFactorsIdx];

                if (enforceDescendingOrder) {
                    if (ydim > xdim) {
                        break;
                    } else if (zdim > ydim) {
                        continue;
                    }
                }

                uint32_t numItemsInWorkGroup = xdim * ydim * zdim;
                if (numItemsInWorkGroup > wsInfo.maxWorkGroupSize) {
                    continue;
                }
                if (numItemsInWorkGroup < wsInfo.minWorkGroupSize) {
                    break;
                }

                uint64_t numWorkGroups = Math::divideAndRoundUp(workItems[0], xdim);
                numWorkGroups *= Math::divideAndRoundUp(workItems[1], ydim);
                numWorkGroups *= Math::divideAndRoundUp(workItems[2], zdim);

                uint64_t numThreadsPerWorkGroup = Math::divideAndRoundUp(numItemsInWorkGroup, wsInfo.simdSize);
                uint64_t euThrdsDispatched = numThreadsPerWorkGroup * numWorkGroups;

                if (euThrdsDispatched < localEuThrdsDispatched) {
                    localEuThrdsDispatched = euThrdsDispatched;
                    workGroupSize[0] = xdim;
                    workGroupSize[1] = ydim;
                    workGroupSize[2] = zdim;
                }
            }
        }
    }
}

// Computes a 1D local work size that divides workItems[0] and does not exceed maxWorkGroupSize.
// The power-of-two part is taken from the lowest set bit, the remaining part from the prime table.
// workGroupSize[1] and [2] are set to 1. simdSize is not used by this function.
void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
    auto items = workItems[0];

    // Determine the LSB set to quickly handle factors of 2
    auto numBits = Math::getMinLsbSet(static_cast<uint32_t>(items));

    // Clamp power of 2 result to maxWorkGroupSize
    uint32_t workSize = 1u << numBits;

    // Assumes maxWorkGroupSize is a power of two.
    DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0);
    workSize = std::min(workSize, maxWorkGroupSize);

    // Try all primes as potential factors
    workSize = factor<MAX_PRIMES - 1>(items, workSize, maxWorkGroupSize);

    workGroupSize[0] = workSize;
    workGroupSize[1] = 1;
    workGroupSize[2] = 1;
}

// Dispatches to the ratio-based or the thread-count-based search, depending on wsInfo.useRatio after
// wsInfo.checkRatio(workItems) has been called. When the strict-ratio result holds at most half a SIMD
// of work items, strict mode is switched off on wsInfo and the ratio search is repeated.
void choosePreferredWorkgroupSize(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) {
    // check if algorithm should use ratio
    wsInfo.checkRatio(workItems);

    if (wsInfo.useRatio) {
        choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder);
        if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) {
            wsInfo.useStrictRatio = false;
            choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder);
        }
    } else {
        choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder);
    }
}

// Builds the divisor tables for the first workDim dimensions and runs the preferred-size search.
// The search first runs with descending-order enforcement (X >= Y >= Z); if the result holds at most
// half a SIMD of work items, it is repeated without that restriction.
void choosePrefferedWorkgroupSize(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
    // find all divisors for all dimensions
    uint32_t xyzFactors[3][1024];
    uint32_t xyzFactorsLen[3] = {};
    for (int i = 0; i < 3; i++)
        xyzFactors[i][xyzFactorsLen[i]++] = 1;
    for (auto i = 0u; i < workDim; i++) {
        // NOTE(review): the bound is exclusive here while computeWorkgroupSize2D uses "<=" - confirm this is intended.
        for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) {
            // The table has a fixed capacity that is independent of maxWorkGroupSize, so never write past it.
            if ((workItems[i] % j) == 0 && xyzFactorsLen[i] < arrayCount(xyzFactors[i])) {
                xyzFactors[i][xyzFactorsLen[i]++] = j;
            }
        }
    }

    choosePreferredWorkgroupSize(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, true);

    size_t wgs = workGroupSize[0] * workGroupSize[1] * workGroupSize[2];
    if (wgs * 2 <= wsInfo.simdSize) {
        choosePreferredWorkgroupSize(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, false);
    }
}

// Computes a 2D local work size from the divisors (2..maxWorkGroupSize) of workItems[0] and workItems[1].
// The pair that dispatches the fewest EU threads wins; ties are broken by the smaller "waste" value.
// workGroupSize starts as {1, 1, 1} and keeps that value when no factor pair is accepted.
void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
    uint32_t xFactors[1024];
    uint32_t yFactors[1024];
    uint32_t xFactorsLen = 0;
    uint32_t yFactorsLen = 0;
    uint64_t waste;
    uint64_t localWSWaste = 0xffffffffffffffff;
    uint64_t euThrdsDispatched;
    uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
    uint64_t workGroups;
    uint32_t xDim;
    uint32_t yDim;

    for (int i = 0; i < 3; i++)
        workGroupSize[i] = 1;

    for (uint32_t i = 2; i <= maxWorkGroupSize; i++) {
        // The tables have a fixed capacity that is independent of maxWorkGroupSize, so never write past them.
        if ((workItems[0] % i) == 0 && xFactorsLen < arrayCount(xFactors)) {
            xFactors[xFactorsLen++] = i;
        }
        if (((workItems[1] % i) == 0) && yFactorsLen < arrayCount(yFactors)) {
            yFactors[yFactorsLen++] = i;
        }
    }

    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) {
        for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) {
            // Pick a LocalWorkSize that is a multiple as well as appropriate:
            // 1 <= workGroupSize[ 0 ] <= workItems[ 0 ]
            // 1 <= workGroupSize[ 1 ] <= workItems[ 1 ]
            xDim = xFactors[xFactorsLen - 1 - xFactorsIdx];
            yDim = yFactors[yFactorsIdx];

            if ((xDim * yDim) > maxWorkGroupSize) {
                // The yDim value is too big, so break out of this loop.
                // No other entries will work.
                break;
            }

            // Find the wasted channels.
            workGroups = Math::divideAndRoundUp(workItems[0], xDim);
            workGroups *= Math::divideAndRoundUp(workItems[1], yDim);

            // Compaction Mode!
            euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize);
            euThrdsDispatched *= workGroups;

            waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1));
            waste *= workGroups;

            if (((euThrdsDispatched < localEuThrdsDispatched) ||
                 ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) {
                localWSWaste = waste;
                localEuThrdsDispatched = euThrdsDispatched;
                workGroupSize[0] = xDim;
                workGroupSize[1] = yDim;
            }
        }
    }
}

// Computes a local work size that tries to keep the X and Y extents balanced.
//  1. If the power-of-two divisors of X and Y together reach maxWorkGroupSize, the larger of the two is
//     halved until the product fits, and the divisors become the local size.
//  2. Otherwise, if X * Y work items exceed maxWorkGroupSize, computeWorkgroupSize2D is used.
//  3. Otherwise the work item counts themselves become the local size.
void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
    for (int i = 0; i < 3; i++)
        workGroupSize[i] = 1;

    size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
    for (auto i = 0u; i < workDim; i++) {
        uint32_t requiredWorkItemsCount = maxWorkGroupSize;
        while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
            requiredWorkItemsCount >>= 1;
        itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
    }

    if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
        while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
            if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
                itemsPowerOfTwoDivisors[0] >>= 1;
            else
                itemsPowerOfTwoDivisors[1] >>= 1;
        }
        for (auto i = 0u; i < 3; i++)
            workGroupSize[i] = itemsPowerOfTwoDivisors[i];
        return;
    } else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
        computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
        return;
    } else {
        for (auto i = 0u; i < workDim; i++)
            workGroupSize[i] = workItems[i];
        return;
    }
}

// Entry point: computes the local work size for a 1D/2D/3D dispatch and writes it to workGroupSize.
// Kernels without SLM and barriers first try power-of-two sizes; everything else (and every case the
// power-of-two path does not settle) falls through to the whole-range, 1D or divisor-search paths.
// wsInfo.simdSize must be non-zero.
void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
    for (int i = 0; i < 3; i++)
        workGroupSize[i] = 1;

    UNRECOVERABLE_IF(wsInfo.simdSize == 0);

    // Find biggest power of two which divides each dimension size
    if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
        if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
            return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
        }

        size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
        for (auto i = 0u; i < workDim; i++) {
            uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
            while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
                requiredWorkItemsCount >>= 1;
            itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
        }

        bool canUseNx4 = (wsInfo.imgUsed &&
                          (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) &&
                          itemsPowerOfTwoDivisors[1] >= 4);

        // If computed dimension sizes which are powers of two are creating group which is
        // bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
        uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
        if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) {
            return computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
        }
        // If computed workgroup is at this point in correct size
        else if (allItems >= wsInfo.simdSize) {
            itemsPowerOfTwoDivisors[1] = canUseNx4 ? 4 : itemsPowerOfTwoDivisors[1];
            for (auto i = 0u; i < workDim; i++)
                workGroupSize[i] = itemsPowerOfTwoDivisors[i];
            return;
        }
    }

    uint64_t totalNumberOfItems = workItems[0] * workItems[1] * workItems[2];

    // If dimensions are not powers of two but total number of items is less than max work group size
    if (totalNumberOfItems <= wsInfo.maxWorkGroupSize) {
        for (auto i = 0u; i < workDim; i++)
            workGroupSize[i] = workItems[i];
        return;
    }

    if (workDim == 1) {
        return computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize);
    }

    choosePrefferedWorkgroupSize(wsInfo, workGroupSize, workItems, workDim);
}

// Returns, per dimension, gws / lws rounded up. Every component of lws must be non-zero.
Vec3<size_t> computeWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
    return (Vec3<size_t>(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0),
                         gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0),
                         gws.z / lws.z + ((gws.z % lws.z) ? 1 : 0)));
}

// Returns the number of work groups per dimension, or (0, 0, 0) when lws.x is zero.
// Only lws.x is checked here; lws.y and lws.z are used as divisors without a check.
Vec3<size_t> generateWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
    return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3<size_t>(0, 0, 0);
}

// Returns the work group with Y and Z raised to at least 1, or (0, 0, 0) when X is zero.
Vec3<size_t> canonizeWorkgroup(const Vec3<size_t> &workgroup) {
    return ((workgroup.x > 0) ? Vec3<size_t>({workgroup.x, std::max(workgroup.y, static_cast<size_t>(1)), std::max(workgroup.z, static_cast<size_t>(1))})
                              : Vec3<size_t>(0, 0, 0));
}

} // namespace NEO