diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 9fa5731ad1..4607058e0f 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -11,9 +11,12 @@ #include "shared/source/helpers/blit_commands_helper.h" #include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/kernel_helpers.h" +#include "shared/source/helpers/local_work_size.h" +#include "shared/source/helpers/per_thread_data.h" #include "shared/source/helpers/register_offsets.h" #include "shared/source/helpers/string.h" #include "shared/source/helpers/surface_format_info.h" +#include "shared/source/kernel/implicit_args.h" #include "shared/source/kernel/kernel_arg_descriptor.h" #include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/memory_manager/memory_manager.h" @@ -22,7 +25,6 @@ #include "shared/source/program/kernel_info.h" #include "shared/source/utilities/arrayref.h" -#include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/mem_obj/buffer.h" #include "level_zero/core/source/debugger/debugger_l0.h" diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 649595547a..b3e2a8c28e 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -10,6 +10,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/helpers/array_count.h" #include "shared/source/helpers/engine_node_helper.h" +#include "shared/source/helpers/local_work_size.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/memory_manager/surface.h" diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h index 4431e1dd6d..4410db55d7 100644 --- a/opencl/source/command_queue/gpgpu_walker.h +++ 
b/opencl/source/command_queue/gpgpu_walker.h @@ -14,7 +14,6 @@ #include "shared/source/helpers/timestamp_packet.h" #include "shared/source/helpers/vec.h" #include "shared/source/indirect_heap/indirect_heap.h" -#include "shared/source/program/kernel_info.h" #include "shared/source/utilities/hw_timestamps.h" #include "shared/source/utilities/perf_counter.h" #include "shared/source/utilities/tag_allocator.h" @@ -36,64 +35,17 @@ using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; template using MI_STORE_REG_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM_CMD; -void computeWorkgroupSize1D( - uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize); - -void computeWorkgroupSizeND( - WorkSizeInfo &wsInfo, - size_t workGroupSize[3], - const size_t workItems[3], - const uint32_t workDim); - -void computeWorkgroupSize2D( - uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize); - -void computeWorkgroupSizeSquared( - uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize, - const uint32_t workDim); - Vec3 computeWorkgroupSize( const DispatchInfo &dispatchInfo); Vec3 generateWorkgroupSize( const DispatchInfo &dispatchInfo); -Vec3 computeWorkgroupsNumber( - const Vec3 &gws, - const Vec3 &lws); - -Vec3 generateWorkgroupsNumber( - const Vec3 &gws, - const Vec3 &lws); - Vec3 generateWorkgroupsNumber( const DispatchInfo &dispatchInfo); -inline uint32_t calculateDispatchDim(const Vec3 &dispatchSize, const Vec3 &dispatchOffset) { - return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim())); -} - -Vec3 canonizeWorkgroup( - const Vec3 &workgroup); - void provideLocalWorkGroupSizeHints(Context *context, DispatchInfo dispatchInfo); -void setSpecialWorkgroupSize(size_t workgroupSize[3]); - -inline cl_uint computeDimensions(const size_t workItems[3]) { - return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 
2 - : 1; -} - WorkSizeInfo createWorkSizeInfoFromDispatchInfo(const DispatchInfo &dispatchInfo); template diff --git a/opencl/source/command_queue/local_work_size.cpp b/opencl/source/command_queue/local_work_size.cpp index 05776ca5d0..82e9bd5361 100644 --- a/opencl/source/command_queue/local_work_size.cpp +++ b/opencl/source/command_queue/local_work_size.cpp @@ -5,11 +5,14 @@ * */ +#include "shared/source/helpers/local_work_size.h" + #include "shared/source/device/device.h" #include "shared/source/helpers/array_count.h" #include "shared/source/helpers/basic_math.h" #include "shared/source/helpers/debug_helpers.h" #include "shared/source/helpers/hw_helper.h" +#include "shared/source/program/kernel_info.h" #include "opencl/source/cl_device/cl_device.h" #include "opencl/source/command_queue/gpgpu_walker.h" @@ -17,400 +20,12 @@ #include "opencl/source/helpers/dispatch_info.h" #include "opencl/source/kernel/kernel.h" -#include #include #include #include namespace NEO { -//threshold used to determine what kind of device is underneath -//big cores like SKL have 8EU * 7 HW threads per subslice and are considered as highThreadCount devices -constexpr uint32_t highThreadCountThreshold = 56u; - -static const uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1}; - -static const uint32_t primeNumbers[] = { - 251, - 241, - 239, 233, - 229, 227, 223, - 211, - 199, 197, 193, 191, - 181, - 179, 173, - 167, 163, - 157, 151, - 149, - 139, 137, 131, - 127, - 113, - 109, 107, 103, 101, - 97, - 89, 83, - 79, 73, 71, - 67, 61, - 59, 53, - 47, 43, 41, - 37, 31, - 29, 23, - 19, 17, 13, 11, - 7, 5, 3, 2}; - -static const size_t MAX_PRIMES = sizeof(primeNumbers) / sizeof(primeNumbers[0]); - -// Recursive template function to test prime factors -template -static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) { - auto primeNumber = primeNumbers[primeIndex]; - - auto newWorkSize = workSize * primeNumber; - if (newWorkSize <= workItems) { - 
while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) { - workSize = newWorkSize; - newWorkSize = workSize * primeNumber; - } - - workSize = factor(workItems, workSize, maxWorkGroupSize); - } - - return workSize; -} - -// Terminator of recursive factoring logic -template <> -inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) { - uint32_t primeIndex = 0; - auto primeNumber = primeNumbers[primeIndex]; - - auto newWorkSize = workSize * primeNumber; - if (newWorkSize <= workItems) { - while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) { - workSize = newWorkSize; - newWorkSize = workSize * primeNumber; - } - } - - return workSize; -} - -void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) { - uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0; - auto arraySize = arrayCount(optimalHardwareThreadCountGeneric); - auto simdSize = workGroupInfo.simdSize; - - while (targetIndex < arraySize && - optimalHardwareThreadCountGeneric[targetIndex] > 1 && - workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) { - targetIndex++; - } - uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex]; - - if (workDim == 2) { - uint32_t xDim, yDim; - xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1); - while (xDim > workItems[0]) - xDim = xDim >> 1; - yDim = canUseNx4 ? 
4 : (uint32_t(optimalLocalThreads * simdSize) / xDim); - workGroupSize[0] = xDim; - workGroupSize[1] = yDim; - } else { - uint32_t xDim, yDim, zDim; - xDim = uint32_t(optimalLocalThreads * simdSize); - while (xDim > workItems[0]) - xDim = xDim >> 1; - yDim = uint32_t(optimalLocalThreads * simdSize) / xDim; - while (yDim > workItems[1]) - yDim = yDim >> 1; - UNRECOVERABLE_IF((xDim * yDim) == 0); - zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim); - workGroupSize[0] = xDim; - workGroupSize[1] = yDim; - workGroupSize[2] = zDim; - } -} - -void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo) { - float ratioDiff = 0; - float localRatio = float(0xffffffff); - ulong localWkgs = 0xffffffff; - ulong workGroups; - for (cl_uint XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) { - for (cl_uint YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) { - - uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx]; - uint32_t Ydim = xyzFactors[1][YFactorsIdx]; - - if ((Xdim * Ydim) > wsInfo.maxWorkGroupSize) { - break; - } - if ((Xdim * Ydim) < wsInfo.minWorkGroupSize) { - continue; - } - - workGroups = Math::divideAndRoundUp(workItems[0], Xdim); - workGroups *= Math::divideAndRoundUp(workItems[1], Ydim); - - ratioDiff = log((float)Xdim) - log((float)Ydim); - ratioDiff = fabs(wsInfo.targetRatio - ratioDiff); - - if (wsInfo.useStrictRatio == CL_TRUE) { - if (ratioDiff < localRatio) { - workGroupSize[0] = Xdim; - workGroupSize[1] = Ydim; - localRatio = ratioDiff; - localWkgs = workGroups; - } - } else { - if ((workGroups < localWkgs) || - ((workGroups == localWkgs) && (ratioDiff < localRatio))) { - workGroupSize[0] = Xdim; - workGroupSize[1] = Ydim; - localRatio = ratioDiff; - localWkgs = workGroups; - } - } - } - } -} -void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t 
xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, uint32_t workdim) { - uint64_t localEuThrdsDispatched = 0xffffffffffffffff; - uint64_t workGroups; - for (uint32_t ZFactorsIdx = 0; ZFactorsIdx < xyzFactorsLen[2]; ++ZFactorsIdx) { - for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) { - for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) { - - uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx]; - uint32_t Ydim = xyzFactors[1][YFactorsIdx]; - uint32_t Zdim = xyzFactors[2][ZFactorsIdx]; - - if ((Xdim * Ydim * Zdim) > wsInfo.maxWorkGroupSize) { - break; - } - if ((Xdim * Ydim * Zdim) < wsInfo.minWorkGroupSize) { - continue; - } - - workGroups = Math::divideAndRoundUp(workItems[0], Xdim); - workGroups *= Math::divideAndRoundUp(workItems[1], Ydim); - workGroups *= Math::divideAndRoundUp(workItems[2], Zdim); - uint64_t euThrdsDispatched; - - euThrdsDispatched = Math::divideAndRoundUp(Xdim * Ydim * Zdim, wsInfo.simdSize); - euThrdsDispatched *= workGroups; - - if (euThrdsDispatched < localEuThrdsDispatched) { - localEuThrdsDispatched = euThrdsDispatched; - workGroupSize[0] = Xdim; - workGroupSize[1] = Ydim; - workGroupSize[2] = Zdim; - } - } - } - } -} - -void setSpecialWorkgroupSize(size_t workgroupSize[3]) { - workgroupSize[0] = 1; - workgroupSize[1] = 1; - workgroupSize[2] = 1; -} - -void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize) { - auto items = workItems[0]; - - // Determine the LSB set to quickly handle factors of 2 - auto numBits = Math::getMinLsbSet(static_cast(items)); - - // Clamp power of 2 result to maxWorkGroupSize - uint32_t workSize = 1u << numBits; - - //Assumes maxWorkGroupSize is a power of two. 
- DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0); - workSize = std::min(workSize, maxWorkGroupSize); - - // Try all primes as potential factors - workSize = factor(items, workSize, maxWorkGroupSize); - - workGroupSize[0] = workSize; - workGroupSize[1] = 1; - workGroupSize[2] = 1; -} - -void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) { - uint32_t xFactors[1024]; - uint32_t yFactors[1024]; - uint32_t xFactorsLen = 0; - uint32_t yFactorsLen = 0; - uint64_t waste; - uint64_t localWSWaste = 0xffffffffffffffff; - uint64_t euThrdsDispatched; - uint64_t localEuThrdsDispatched = 0xffffffffffffffff; - uint64_t workGroups; - uint32_t xDim; - uint32_t yDim; - - for (int i = 0; i < 3; i++) - workGroupSize[i] = 1; - - for (uint32_t i = 2; i <= maxWorkGroupSize; i++) { - if ((workItems[0] % i) == 0) { - xFactors[xFactorsLen++] = i; - } - if (((workItems[1] % i) == 0)) { - yFactors[yFactorsLen++] = i; - } - } - - for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) { - for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) { - // Pick a LocalWorkSize that is a multiple as well as appropriate: - // 1 <= workGroupSize[ 0 ] <= workItems[ 0 ] - // 1 <= workGroupSize[ 1 ] <= workItems[ 1 ] - xDim = xFactors[xFactorsLen - 1 - xFactorsIdx]; - yDim = yFactors[yFactorsIdx]; - - if ((xDim * yDim) > maxWorkGroupSize) { - // The yDim value is too big, so break out of this loop. - // No other entries will work. - break; - } - - // Find the wasted channels. - workGroups = Math::divideAndRoundUp(workItems[0], xDim); - workGroups *= Math::divideAndRoundUp(workItems[1], yDim); - - // Compaction Mode! 
- euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize); - euThrdsDispatched *= workGroups; - - waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1)); - waste *= workGroups; - - if (((euThrdsDispatched < localEuThrdsDispatched) || - ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) { - localWSWaste = waste; - localEuThrdsDispatched = euThrdsDispatched; - workGroupSize[0] = xDim; - workGroupSize[1] = yDim; - } - } - } -} - -void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) { - for (int i = 0; i < 3; i++) - workGroupSize[i] = 1; - size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; - for (auto i = 0u; i < workDim; i++) { - uint32_t requiredWorkItemsCount = maxWorkGroupSize; - while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount))) - requiredWorkItemsCount >>= 1; - itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount; - } - if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) { - while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) { - if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1]) - itemsPowerOfTwoDivisors[0] >>= 1; - else - itemsPowerOfTwoDivisors[1] >>= 1; - } - for (auto i = 0u; i < 3; i++) - workGroupSize[i] = itemsPowerOfTwoDivisors[i]; - return; - - } else if (workItems[0] * workItems[1] > maxWorkGroupSize) { - computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize); - return; - } else { - for (auto i = 0u; i < workDim; i++) - workGroupSize[i] = workItems[i]; - return; - } -} - -void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) { - for (int i = 0; i < 3; i++) - workGroupSize[i] = 1; - - uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2]; - - 
UNRECOVERABLE_IF(wsInfo.simdSize == 0); - - //Find biggest power of two which devide each dimension size - if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) { - if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) { - computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim); - return; - } - - size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; - for (auto i = 0u; i < workDim; i++) { - uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]); - while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount))) - requiredWorkItemsCount >>= 1; - itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount; - } - - bool canUseNx4 = (wsInfo.imgUsed && - (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) && - itemsPowerOfTwoDivisors[1] >= 4); - - //If computed dimension sizes which are powers of two are creating group which is - //bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it - uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2]; - if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) { - computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4); - return; - } - //If coputed workgroup is at this point in correct size - else if (allItems >= wsInfo.simdSize) { - itemsPowerOfTwoDivisors[1] = canUseNx4 ? 
4 : itemsPowerOfTwoDivisors[1]; - for (auto i = 0u; i < workDim; i++) - workGroupSize[i] = itemsPowerOfTwoDivisors[i]; - return; - } - } - //If dimensions are not powers of two but total number of items is less than max work group size - if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) { - for (auto i = 0u; i < workDim; i++) - workGroupSize[i] = workItems[i]; - return; - } else { - if (workDim == 1) - computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize); - else { - uint32_t xyzFactors[3][1024]; - uint32_t xyzFactorsLen[3] = {}; - - //check if algorithm should use ratio - wsInfo.checkRatio(workItems); - - //find all divisors for all dimensions - for (int i = 0; i < 3; i++) - xyzFactors[i][xyzFactorsLen[i]++] = 1; - for (auto i = 0u; i < workDim; i++) { - for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) { - if ((workItems[i] % j) == 0) { - xyzFactors[i][xyzFactorsLen[i]++] = j; - } - } - } - if (wsInfo.useRatio) { - choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo); - if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) { - wsInfo.useStrictRatio = false; - choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo); - } - } else - choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, workDim); - } - } -} - Vec3 computeWorkgroupSize(const DispatchInfo &dispatchInfo) { size_t workGroupSize[3] = {}; auto kernel = dispatchInfo.getKernel(); @@ -453,25 +68,10 @@ Vec3 generateWorkgroupSize(const DispatchInfo &dispatchInfo) { return (dispatchInfo.getEnqueuedWorkgroupSize().x == 0) ? computeWorkgroupSize(dispatchInfo) : dispatchInfo.getEnqueuedWorkgroupSize(); } -Vec3 computeWorkgroupsNumber(const Vec3 &gws, const Vec3 &lws) { - return (Vec3(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0), - gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0), - gws.z / lws.z + ((gws.z % lws.z) ? 
1 : 0))); -} - -Vec3 generateWorkgroupsNumber(const Vec3 &gws, const Vec3 &lws) { - return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3(0, 0, 0); -} - Vec3 generateWorkgroupsNumber(const DispatchInfo &dispatchInfo) { return generateWorkgroupsNumber(dispatchInfo.getGWS(), dispatchInfo.getLocalWorkgroupSize()); } -Vec3 canonizeWorkgroup(const Vec3 &workgroup) { - return ((workgroup.x > 0) ? Vec3({workgroup.x, std::max(workgroup.y, static_cast(1)), std::max(workgroup.z, static_cast(1))}) - : Vec3(0, 0, 0)); -} - void provideLocalWorkGroupSizeHints(Context *context, DispatchInfo dispatchInfo) { if (context != nullptr && context->isProvidingPerformanceHints() && dispatchInfo.getDim() <= 3) { size_t preferredWorkGroupSize[3]; diff --git a/opencl/source/helpers/dispatch_info_builder.h b/opencl/source/helpers/dispatch_info_builder.h index e1fd7db2d6..67a458738e 100644 --- a/opencl/source/helpers/dispatch_info_builder.h +++ b/opencl/source/helpers/dispatch_info_builder.h @@ -6,6 +6,7 @@ */ #pragma once +#include "shared/source/helpers/local_work_size.h" #include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/helpers/dispatch_info.h" diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index 6f8fe78814..0365f0cc85 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -7,6 +7,7 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/hw_helper.h" +#include "shared/source/helpers/local_work_size.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/utilities/perf_counter.h" #include "shared/source/utilities/tag_allocator.h" diff --git a/opencl/test/unit_test/command_queue/local_work_size_tests.cpp b/opencl/test/unit_test/command_queue/local_work_size_tests.cpp index d2e3d56f43..6b2b87419d 100644 --- 
a/opencl/test/unit_test/command_queue/local_work_size_tests.cpp +++ b/opencl/test/unit_test/command_queue/local_work_size_tests.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/helpers/local_work_size.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/mocks/mock_device.h" diff --git a/opencl/test/unit_test/command_queue/work_group_size_tests.cpp b/opencl/test/unit_test/command_queue/work_group_size_tests.cpp index c4656e4c21..15b21afb62 100644 --- a/opencl/test/unit_test/command_queue/work_group_size_tests.cpp +++ b/opencl/test/unit_test/command_queue/work_group_size_tests.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/helpers/local_work_size.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "opencl/source/command_queue/gpgpu_walker.h" @@ -58,10 +59,11 @@ struct WorkGroupSizeBase { WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, simdSize, 0u, ::defaultHwInfo.get(), 32u, 0u, false, false); computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dims); } else { - if (dims == 1) + if (dims == 1) { computeWorkgroupSize1D(maxWorkGroupSize, workGroupSize, workItems, simdSize); - else + } else { computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize); + } } auto totalWorkItems = workItems[0] * workItems[1] * workItems[2]; auto localWorkItems = workGroupSize[0] * workGroupSize[1] * workGroupSize[2]; diff --git a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp index 8f462f6749..b40c91f5b7 100644 --- a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp +++ b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/helpers/local_work_size.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" diff --git a/shared/source/helpers/CMakeLists.txt 
b/shared/source/helpers/CMakeLists.txt index bf1d250499..358afc75df 100644 --- a/shared/source/helpers/CMakeLists.txt +++ b/shared/source/helpers/CMakeLists.txt @@ -77,6 +77,8 @@ set(NEO_CORE_HELPERS ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_special.inl ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_sse4.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/local_work_size.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/local_work_size.h ${CMAKE_CURRENT_SOURCE_DIR}/neo_driver_version.h ${CMAKE_CURRENT_SOURCE_DIR}/non_copyable_or_moveable.h ${CMAKE_CURRENT_SOURCE_DIR}/options.h diff --git a/shared/source/helpers/local_work_size.cpp b/shared/source/helpers/local_work_size.cpp new file mode 100644 index 0000000000..f0c882e364 --- /dev/null +++ b/shared/source/helpers/local_work_size.cpp @@ -0,0 +1,425 @@ +/* + * Copyright (C) 2018-2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/local_work_size.h" + +#include "shared/source/debug_settings/debug_settings_manager.h" +#include "shared/source/helpers/array_count.h" +#include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/debug_helpers.h" +#include "shared/source/helpers/hw_helper.h" +#include "shared/source/program/kernel_info.h" + +#include +#include +#include + +namespace NEO { + +//threshold used to determine what kind of device is underneath +//big cores like SKL have 8EU * 7 HW threads per subslice and are considered as highThreadCount devices +constexpr uint32_t highThreadCountThreshold = 56u; + +static const uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1}; + +static const uint32_t primeNumbers[] = { + 251, + 241, + 239, 233, + 229, 227, 223, + 211, + 199, 197, 193, 191, + 181, + 179, 173, + 167, 163, + 157, 151, + 149, + 139, 137, 131, + 127, + 113, + 109, 107, 103, 101, + 97, + 89, 83, + 79, 73, 71, + 67, 61, + 59, 53, + 47, 43, 41, + 37, 31, + 29, 23, + 19, 17, 13, 11, + 7, 5, 3, 2}; + +static const size_t 
MAX_PRIMES = sizeof(primeNumbers) / sizeof(primeNumbers[0]); + +// Recursive template function to test prime factors +template +static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) { + auto primeNumber = primeNumbers[primeIndex]; + + auto newWorkSize = workSize * primeNumber; + if (newWorkSize <= workItems) { + while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) { + workSize = newWorkSize; + newWorkSize = workSize * primeNumber; + } + + workSize = factor(workItems, workSize, maxWorkGroupSize); + } + + return workSize; +} + +// Terminator of recursive factoring logic +template <> +inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) { + uint32_t primeIndex = 0; + auto primeNumber = primeNumbers[primeIndex]; + + auto newWorkSize = workSize * primeNumber; + if (newWorkSize <= workItems) { + while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) { + workSize = newWorkSize; + newWorkSize = workSize * primeNumber; + } + } + + return workSize; +} + +void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) { + uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0; + auto arraySize = arrayCount(optimalHardwareThreadCountGeneric); + auto simdSize = workGroupInfo.simdSize; + + while (targetIndex < arraySize && + optimalHardwareThreadCountGeneric[targetIndex] > 1 && + workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) { + targetIndex++; + } + uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex]; + + if (workDim == 2) { + uint32_t xDim, yDim; + xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1); + while (xDim > workItems[0]) + xDim = xDim >> 1; + yDim = canUseNx4 ? 
4 : (uint32_t(optimalLocalThreads * simdSize) / xDim); + workGroupSize[0] = xDim; + workGroupSize[1] = yDim; + } else { + uint32_t xDim, yDim, zDim; + xDim = uint32_t(optimalLocalThreads * simdSize); + while (xDim > workItems[0]) + xDim = xDim >> 1; + yDim = uint32_t(optimalLocalThreads * simdSize) / xDim; + while (yDim > workItems[1]) + yDim = yDim >> 1; + UNRECOVERABLE_IF((xDim * yDim) == 0); + zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim); + workGroupSize[0] = xDim; + workGroupSize[1] = yDim; + workGroupSize[2] = zDim; + } +} + +void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo) { + float ratioDiff = 0; + float localRatio = float(0xffffffff); + uint64_t localWkgs = 0xffffffff; + uint64_t workGroups; + for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) { + for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) { + + uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx]; + uint32_t Ydim = xyzFactors[1][YFactorsIdx]; + + if ((Xdim * Ydim) > wsInfo.maxWorkGroupSize) { + break; + } + if ((Xdim * Ydim) < wsInfo.minWorkGroupSize) { + continue; + } + + workGroups = Math::divideAndRoundUp(workItems[0], Xdim); + workGroups *= Math::divideAndRoundUp(workItems[1], Ydim); + + ratioDiff = log((float)Xdim) - log((float)Ydim); + ratioDiff = fabs(wsInfo.targetRatio - ratioDiff); + + if (wsInfo.useStrictRatio == true) { + if (ratioDiff < localRatio) { + workGroupSize[0] = Xdim; + workGroupSize[1] = Ydim; + localRatio = ratioDiff; + localWkgs = workGroups; + } + } else { + if ((workGroups < localWkgs) || + ((workGroups == localWkgs) && (ratioDiff < localRatio))) { + workGroupSize[0] = Xdim; + workGroupSize[1] = Ydim; + localRatio = ratioDiff; + localWkgs = workGroups; + } + } + } + } +} +void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t 
xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, uint32_t workdim) { + uint64_t localEuThrdsDispatched = 0xffffffffffffffff; + uint64_t workGroups; + for (uint32_t ZFactorsIdx = 0; ZFactorsIdx < xyzFactorsLen[2]; ++ZFactorsIdx) { + for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) { + for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) { + + uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx]; + uint32_t Ydim = xyzFactors[1][YFactorsIdx]; + uint32_t Zdim = xyzFactors[2][ZFactorsIdx]; + + if ((Xdim * Ydim * Zdim) > wsInfo.maxWorkGroupSize) { + break; + } + if ((Xdim * Ydim * Zdim) < wsInfo.minWorkGroupSize) { + continue; + } + + workGroups = Math::divideAndRoundUp(workItems[0], Xdim); + workGroups *= Math::divideAndRoundUp(workItems[1], Ydim); + workGroups *= Math::divideAndRoundUp(workItems[2], Zdim); + uint64_t euThrdsDispatched; + + euThrdsDispatched = Math::divideAndRoundUp(Xdim * Ydim * Zdim, wsInfo.simdSize); + euThrdsDispatched *= workGroups; + + if (euThrdsDispatched < localEuThrdsDispatched) { + localEuThrdsDispatched = euThrdsDispatched; + workGroupSize[0] = Xdim; + workGroupSize[1] = Ydim; + workGroupSize[2] = Zdim; + } + } + } + } +} + +void setSpecialWorkgroupSize(size_t workgroupSize[3]) { + workgroupSize[0] = 1; + workgroupSize[1] = 1; + workgroupSize[2] = 1; +} + +void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize) { + auto items = workItems[0]; + + // Determine the LSB set to quickly handle factors of 2 + auto numBits = Math::getMinLsbSet(static_cast(items)); + + // Clamp power of 2 result to maxWorkGroupSize + uint32_t workSize = 1u << numBits; + + //Assumes maxWorkGroupSize is a power of two. 
+ DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0); + workSize = std::min(workSize, maxWorkGroupSize); + + // Try all primes as potential factors + workSize = factor(items, workSize, maxWorkGroupSize); + + workGroupSize[0] = workSize; + workGroupSize[1] = 1; + workGroupSize[2] = 1; +} + +void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) { + uint32_t xFactors[1024]; + uint32_t yFactors[1024]; + uint32_t xFactorsLen = 0; + uint32_t yFactorsLen = 0; + uint64_t waste; + uint64_t localWSWaste = 0xffffffffffffffff; + uint64_t euThrdsDispatched; + uint64_t localEuThrdsDispatched = 0xffffffffffffffff; + uint64_t workGroups; + uint32_t xDim; + uint32_t yDim; + + for (int i = 0; i < 3; i++) + workGroupSize[i] = 1; + + for (uint32_t i = 2; i <= maxWorkGroupSize; i++) { + if ((workItems[0] % i) == 0) { + xFactors[xFactorsLen++] = i; + } + if (((workItems[1] % i) == 0)) { + yFactors[yFactorsLen++] = i; + } + } + + for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) { + for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) { + // Pick a LocalWorkSize that is a multiple as well as appropriate: + // 1 <= workGroupSize[ 0 ] <= workItems[ 0 ] + // 1 <= workGroupSize[ 1 ] <= workItems[ 1 ] + xDim = xFactors[xFactorsLen - 1 - xFactorsIdx]; + yDim = yFactors[yFactorsIdx]; + + if ((xDim * yDim) > maxWorkGroupSize) { + // The yDim value is too big, so break out of this loop. + // No other entries will work. + break; + } + + // Find the wasted channels. + workGroups = Math::divideAndRoundUp(workItems[0], xDim); + workGroups *= Math::divideAndRoundUp(workItems[1], yDim); + + // Compaction Mode! 
+ euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize); + euThrdsDispatched *= workGroups; + + waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1)); + waste *= workGroups; + + if (((euThrdsDispatched < localEuThrdsDispatched) || + ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) { + localWSWaste = waste; + localEuThrdsDispatched = euThrdsDispatched; + workGroupSize[0] = xDim; + workGroupSize[1] = yDim; + } + } + } +} + +void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) { + for (int i = 0; i < 3; i++) + workGroupSize[i] = 1; + size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; + for (auto i = 0u; i < workDim; i++) { + uint32_t requiredWorkItemsCount = maxWorkGroupSize; + while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount))) + requiredWorkItemsCount >>= 1; + itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount; + } + if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) { + while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) { + if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1]) + itemsPowerOfTwoDivisors[0] >>= 1; + else + itemsPowerOfTwoDivisors[1] >>= 1; + } + for (auto i = 0u; i < 3; i++) + workGroupSize[i] = itemsPowerOfTwoDivisors[i]; + return; + + } else if (workItems[0] * workItems[1] > maxWorkGroupSize) { + computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize); + return; + } else { + for (auto i = 0u; i < workDim; i++) + workGroupSize[i] = workItems[i]; + return; + } +} + +void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) { + for (int i = 0; i < 3; i++) + workGroupSize[i] = 1; + + uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2]; + + 
UNRECOVERABLE_IF(wsInfo.simdSize == 0); + + //Find the biggest power of two which divides each dimension size + if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) { + if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) { + computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim); + return; + } + + size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; + for (auto i = 0u; i < workDim; i++) { + uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]); + while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount))) + requiredWorkItemsCount >>= 1; + itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount; + } + + bool canUseNx4 = (wsInfo.imgUsed && + (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) && + itemsPowerOfTwoDivisors[1] >= 4); + + //If the computed power-of-two dimension sizes create a group which is + //bigger than maxWorkGroupSize, or which would create more than the optimal number of hardware threads, then downsize it + uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2]; + if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) { + computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4); + return; + } + //If the computed workgroup already has a correct size at this point + else if (allItems >= wsInfo.simdSize) { + itemsPowerOfTwoDivisors[1] = canUseNx4 ? 
4 : itemsPowerOfTwoDivisors[1]; + for (auto i = 0u; i < workDim; i++) + workGroupSize[i] = itemsPowerOfTwoDivisors[i]; + return; + } + } + //If dimensions are not powers of two but the total number of items does not exceed the max work group size + if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) { + for (auto i = 0u; i < workDim; i++) + workGroupSize[i] = workItems[i]; + return; + } else { + if (workDim == 1) + computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize); + else { + uint32_t xyzFactors[3][1024]; + uint32_t xyzFactorsLen[3] = {}; + + //check if the algorithm should use ratio + wsInfo.checkRatio(workItems); + + //find all divisors smaller than maxWorkGroupSize for each dimension + for (int i = 0; i < 3; i++) + xyzFactors[i][xyzFactorsLen[i]++] = 1; + for (auto i = 0u; i < workDim; i++) { + for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) { + if ((workItems[i] % j) == 0) { + xyzFactors[i][xyzFactorsLen[i]++] = j; + } + } + } + if (wsInfo.useRatio) { + choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo); + if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) { + wsInfo.useStrictRatio = false; + choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo); + } + } else + choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, workDim); + } + } +} + +Vec3 computeWorkgroupsNumber(const Vec3 &gws, const Vec3 &lws) { + return (Vec3(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0), + gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0), + gws.z / lws.z + ((gws.z % lws.z) ? 1 : 0))); +} + +Vec3 generateWorkgroupsNumber(const Vec3 &gws, const Vec3 &lws) { + return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3(0, 0, 0); +} + +Vec3 canonizeWorkgroup(const Vec3 &workgroup) { + return ((workgroup.x > 0) ? 
Vec3({workgroup.x, std::max(workgroup.y, static_cast(1)), std::max(workgroup.z, static_cast(1))}) + : Vec3(0, 0, 0)); +} + +} // namespace NEO diff --git a/shared/source/helpers/local_work_size.h b/shared/source/helpers/local_work_size.h new file mode 100644 index 0000000000..da4d8f230e --- /dev/null +++ b/shared/source/helpers/local_work_size.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2018-2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/helpers/vec.h" + +#include + +namespace NEO { +struct WorkSizeInfo; + +void computeWorkgroupSize1D( + uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize); + +void computeWorkgroupSizeND( + WorkSizeInfo &wsInfo, + size_t workGroupSize[3], + const size_t workItems[3], + const uint32_t workDim); + +void computeWorkgroupSize2D( + uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize); + +void computeWorkgroupSizeSquared( + uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize, + const uint32_t workDim); + +Vec3 computeWorkgroupsNumber( + const Vec3 &gws, + const Vec3 &lws); + +Vec3 generateWorkgroupsNumber( + const Vec3 &gws, + const Vec3 &lws); + +inline uint32_t calculateDispatchDim(const Vec3 &dispatchSize, const Vec3 &dispatchOffset) { + return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim())); +} + +Vec3 canonizeWorkgroup( + const Vec3 &workgroup); + +void setSpecialWorkgroupSize(size_t workgroupSize[3]); + +inline uint32_t computeDimensions(const size_t workItems[3]) { + return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 + : 1; +} + +} // namespace NEO