diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp
index 9fa5731ad1..4607058e0f 100644
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -11,9 +11,12 @@
 #include "shared/source/helpers/blit_commands_helper.h"
 #include "shared/source/helpers/hw_info.h"
 #include "shared/source/helpers/kernel_helpers.h"
+#include "shared/source/helpers/local_work_size.h"
+#include "shared/source/helpers/per_thread_data.h"
 #include "shared/source/helpers/register_offsets.h"
 #include "shared/source/helpers/string.h"
 #include "shared/source/helpers/surface_format_info.h"
+#include "shared/source/kernel/implicit_args.h"
 #include "shared/source/kernel/kernel_arg_descriptor.h"
 #include "shared/source/kernel/kernel_descriptor.h"
 #include "shared/source/memory_manager/memory_manager.h"
@@ -22,7 +25,6 @@
 #include "shared/source/program/kernel_info.h"
 #include "shared/source/utilities/arrayref.h"
 
-#include "opencl/source/command_queue/gpgpu_walker.h"
 #include "opencl/source/mem_obj/buffer.h"
 
 #include "level_zero/core/source/debugger/debugger_l0.h"
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 649595547a..b3e2a8c28e 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -10,6 +10,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/helpers/array_count.h"
 #include "shared/source/helpers/engine_node_helper.h"
+#include "shared/source/helpers/local_work_size.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/source/memory_manager/memory_manager.h"
 #include "shared/source/memory_manager/surface.h"
diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h
index 4431e1dd6d..4410db55d7 100644
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@@ -14,7 +14,6 @@
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/source/helpers/vec.h"
 #include "shared/source/indirect_heap/indirect_heap.h"
-#include "shared/source/program/kernel_info.h"
 #include "shared/source/utilities/hw_timestamps.h"
 #include "shared/source/utilities/perf_counter.h"
 #include "shared/source/utilities/tag_allocator.h"
@@ -36,64 +35,17 @@ using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
 template <typename GfxFamily>
 using MI_STORE_REG_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM_CMD;
 
-void computeWorkgroupSize1D(
-    uint32_t maxWorkGroupSize,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    size_t simdSize);
-
-void computeWorkgroupSizeND(
-    WorkSizeInfo &wsInfo,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    const uint32_t workDim);
-
-void computeWorkgroupSize2D(
-    uint32_t maxWorkGroupSize,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    size_t simdSize);
-
-void computeWorkgroupSizeSquared(
-    uint32_t maxWorkGroupSize,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    size_t simdSize,
-    const uint32_t workDim);
-
 Vec3<size_t> computeWorkgroupSize(
     const DispatchInfo &dispatchInfo);
 
 Vec3<size_t> generateWorkgroupSize(
     const DispatchInfo &dispatchInfo);
 
-Vec3<size_t> computeWorkgroupsNumber(
-    const Vec3<size_t> &gws,
-    const Vec3<size_t> &lws);
-
-Vec3<size_t> generateWorkgroupsNumber(
-    const Vec3<size_t> &gws,
-    const Vec3<size_t> &lws);
-
 Vec3<size_t> generateWorkgroupsNumber(
     const DispatchInfo &dispatchInfo);
 
-inline uint32_t calculateDispatchDim(const Vec3<size_t> &dispatchSize, const Vec3<size_t> &dispatchOffset) {
-    return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
-}
-
-Vec3<size_t> canonizeWorkgroup(
-    const Vec3<size_t> &workgroup);
-
 void provideLocalWorkGroupSizeHints(Context *context, DispatchInfo dispatchInfo);
 
-void setSpecialWorkgroupSize(size_t workgroupSize[3]);
-
-inline cl_uint computeDimensions(const size_t workItems[3]) {
-    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2
-                                                       : 1;
-}
-
 WorkSizeInfo createWorkSizeInfoFromDispatchInfo(const DispatchInfo &dispatchInfo);
 
 template <typename GfxFamily>
diff --git a/opencl/source/command_queue/local_work_size.cpp b/opencl/source/command_queue/local_work_size.cpp
index 05776ca5d0..82e9bd5361 100644
--- a/opencl/source/command_queue/local_work_size.cpp
+++ b/opencl/source/command_queue/local_work_size.cpp
@@ -5,11 +5,14 @@
  *
  */
 
+#include "shared/source/helpers/local_work_size.h"
+
 #include "shared/source/device/device.h"
 #include "shared/source/helpers/array_count.h"
 #include "shared/source/helpers/basic_math.h"
 #include "shared/source/helpers/debug_helpers.h"
 #include "shared/source/helpers/hw_helper.h"
+#include "shared/source/program/kernel_info.h"
 
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/command_queue/gpgpu_walker.h"
@@ -17,400 +20,12 @@
 #include "opencl/source/helpers/dispatch_info.h"
 #include "opencl/source/kernel/kernel.h"
 
-#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <ctime>
 
 namespace NEO {
 
-//threshold used to determine what kind of device is underneath
-//big cores like SKL have 8EU * 7 HW threads per subslice and are considered as highThreadCount devices
-constexpr uint32_t highThreadCountThreshold = 56u;
-
-static const uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1};
-
-static const uint32_t primeNumbers[] = {
-    251,
-    241,
-    239, 233,
-    229, 227, 223,
-    211,
-    199, 197, 193, 191,
-    181,
-    179, 173,
-    167, 163,
-    157, 151,
-    149,
-    139, 137, 131,
-    127,
-    113,
-    109, 107, 103, 101,
-    97,
-    89, 83,
-    79, 73, 71,
-    67, 61,
-    59, 53,
-    47, 43, 41,
-    37, 31,
-    29, 23,
-    19, 17, 13, 11,
-    7, 5, 3, 2};
-
-static const size_t MAX_PRIMES = sizeof(primeNumbers) / sizeof(primeNumbers[0]);
-
-// Recursive template function to test prime factors
-template <uint32_t primeIndex>
-static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
-    auto primeNumber = primeNumbers[primeIndex];
-
-    auto newWorkSize = workSize * primeNumber;
-    if (newWorkSize <= workItems) {
-        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
-            workSize = newWorkSize;
-            newWorkSize = workSize * primeNumber;
-        }
-
-        workSize = factor<primeIndex - 1>(workItems, workSize, maxWorkGroupSize);
-    }
-
-    return workSize;
-}
-
-// Terminator of recursive factoring logic
-template <>
-inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
-    uint32_t primeIndex = 0;
-    auto primeNumber = primeNumbers[primeIndex];
-
-    auto newWorkSize = workSize * primeNumber;
-    if (newWorkSize <= workItems) {
-        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
-            workSize = newWorkSize;
-            newWorkSize = workSize * primeNumber;
-        }
-    }
-
-    return workSize;
-}
-
-void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) {
-    uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0;
-    auto arraySize = arrayCount(optimalHardwareThreadCountGeneric);
-    auto simdSize = workGroupInfo.simdSize;
-
-    while (targetIndex < arraySize &&
-           optimalHardwareThreadCountGeneric[targetIndex] > 1 &&
-           workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) {
-        targetIndex++;
-    }
-    uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex];
-
-    if (workDim == 2) {
-        uint32_t xDim, yDim;
-        xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1);
-        while (xDim > workItems[0])
-            xDim = xDim >> 1;
-        yDim = canUseNx4 ? 4 : (uint32_t(optimalLocalThreads * simdSize) / xDim);
-        workGroupSize[0] = xDim;
-        workGroupSize[1] = yDim;
-    } else {
-        uint32_t xDim, yDim, zDim;
-        xDim = uint32_t(optimalLocalThreads * simdSize);
-        while (xDim > workItems[0])
-            xDim = xDim >> 1;
-        yDim = uint32_t(optimalLocalThreads * simdSize) / xDim;
-        while (yDim > workItems[1])
-            yDim = yDim >> 1;
-        UNRECOVERABLE_IF((xDim * yDim) == 0);
-        zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim);
-        workGroupSize[0] = xDim;
-        workGroupSize[1] = yDim;
-        workGroupSize[2] = zDim;
-    }
-}
-
-void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo) {
-    float ratioDiff = 0;
-    float localRatio = float(0xffffffff);
-    ulong localWkgs = 0xffffffff;
-    ulong workGroups;
-    for (cl_uint XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) {
-        for (cl_uint YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) {
-
-            uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx];
-            uint32_t Ydim = xyzFactors[1][YFactorsIdx];
-
-            if ((Xdim * Ydim) > wsInfo.maxWorkGroupSize) {
-                break;
-            }
-            if ((Xdim * Ydim) < wsInfo.minWorkGroupSize) {
-                continue;
-            }
-
-            workGroups = Math::divideAndRoundUp(workItems[0], Xdim);
-            workGroups *= Math::divideAndRoundUp(workItems[1], Ydim);
-
-            ratioDiff = log((float)Xdim) - log((float)Ydim);
-            ratioDiff = fabs(wsInfo.targetRatio - ratioDiff);
-
-            if (wsInfo.useStrictRatio == CL_TRUE) {
-                if (ratioDiff < localRatio) {
-                    workGroupSize[0] = Xdim;
-                    workGroupSize[1] = Ydim;
-                    localRatio = ratioDiff;
-                    localWkgs = workGroups;
-                }
-            } else {
-                if ((workGroups < localWkgs) ||
-                    ((workGroups == localWkgs) && (ratioDiff < localRatio))) {
-                    workGroupSize[0] = Xdim;
-                    workGroupSize[1] = Ydim;
-                    localRatio = ratioDiff;
-                    localWkgs = workGroups;
-                }
-            }
-        }
-    }
-}
-void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, uint32_t workdim) {
-    uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
-    uint64_t workGroups;
-    for (uint32_t ZFactorsIdx = 0; ZFactorsIdx < xyzFactorsLen[2]; ++ZFactorsIdx) {
-        for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) {
-            for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) {
-
-                uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx];
-                uint32_t Ydim = xyzFactors[1][YFactorsIdx];
-                uint32_t Zdim = xyzFactors[2][ZFactorsIdx];
-
-                if ((Xdim * Ydim * Zdim) > wsInfo.maxWorkGroupSize) {
-                    break;
-                }
-                if ((Xdim * Ydim * Zdim) < wsInfo.minWorkGroupSize) {
-                    continue;
-                }
-
-                workGroups = Math::divideAndRoundUp(workItems[0], Xdim);
-                workGroups *= Math::divideAndRoundUp(workItems[1], Ydim);
-                workGroups *= Math::divideAndRoundUp(workItems[2], Zdim);
-                uint64_t euThrdsDispatched;
-
-                euThrdsDispatched = Math::divideAndRoundUp(Xdim * Ydim * Zdim, wsInfo.simdSize);
-                euThrdsDispatched *= workGroups;
-
-                if (euThrdsDispatched < localEuThrdsDispatched) {
-                    localEuThrdsDispatched = euThrdsDispatched;
-                    workGroupSize[0] = Xdim;
-                    workGroupSize[1] = Ydim;
-                    workGroupSize[2] = Zdim;
-                }
-            }
-        }
-    }
-}
-
-void setSpecialWorkgroupSize(size_t workgroupSize[3]) {
-    workgroupSize[0] = 1;
-    workgroupSize[1] = 1;
-    workgroupSize[2] = 1;
-}
-
-void computeWorkgroupSize1D(uint32_t maxWorkGroupSize,
-                            size_t workGroupSize[3],
-                            const size_t workItems[3],
-                            size_t simdSize) {
-    auto items = workItems[0];
-
-    // Determine the LSB set to quickly handle factors of 2
-    auto numBits = Math::getMinLsbSet(static_cast<uint32_t>(items));
-
-    // Clamp power of 2 result to maxWorkGroupSize
-    uint32_t workSize = 1u << numBits;
-
-    //Assumes maxWorkGroupSize is a power of two.
-    DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0);
-    workSize = std::min(workSize, maxWorkGroupSize);
-
-    // Try all primes as potential factors
-    workSize = factor<MAX_PRIMES - 1>(items, workSize, maxWorkGroupSize);
-
-    workGroupSize[0] = workSize;
-    workGroupSize[1] = 1;
-    workGroupSize[2] = 1;
-}
-
-void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
-    uint32_t xFactors[1024];
-    uint32_t yFactors[1024];
-    uint32_t xFactorsLen = 0;
-    uint32_t yFactorsLen = 0;
-    uint64_t waste;
-    uint64_t localWSWaste = 0xffffffffffffffff;
-    uint64_t euThrdsDispatched;
-    uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
-    uint64_t workGroups;
-    uint32_t xDim;
-    uint32_t yDim;
-
-    for (int i = 0; i < 3; i++)
-        workGroupSize[i] = 1;
-
-    for (uint32_t i = 2; i <= maxWorkGroupSize; i++) {
-        if ((workItems[0] % i) == 0) {
-            xFactors[xFactorsLen++] = i;
-        }
-        if (((workItems[1] % i) == 0)) {
-            yFactors[yFactorsLen++] = i;
-        }
-    }
-
-    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) {
-        for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) {
-            // Pick a LocalWorkSize that is a multiple as well as appropriate:
-            // 1 <= workGroupSize[ 0 ] <= workItems[ 0 ]
-            // 1 <= workGroupSize[ 1 ] <= workItems[ 1 ]
-            xDim = xFactors[xFactorsLen - 1 - xFactorsIdx];
-            yDim = yFactors[yFactorsIdx];
-
-            if ((xDim * yDim) > maxWorkGroupSize) {
-                // The yDim value is too big, so break out of this loop.
-                // No other entries will work.
-                break;
-            }
-
-            // Find the wasted channels.
-            workGroups = Math::divideAndRoundUp(workItems[0], xDim);
-            workGroups *= Math::divideAndRoundUp(workItems[1], yDim);
-
-            // Compaction Mode!
-            euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize);
-            euThrdsDispatched *= workGroups;
-
-            waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1));
-            waste *= workGroups;
-
-            if (((euThrdsDispatched < localEuThrdsDispatched) ||
-                 ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) {
-                localWSWaste = waste;
-                localEuThrdsDispatched = euThrdsDispatched;
-                workGroupSize[0] = xDim;
-                workGroupSize[1] = yDim;
-            }
-        }
-    }
-}
-
-void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
-    for (int i = 0; i < 3; i++)
-        workGroupSize[i] = 1;
-    size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
-    for (auto i = 0u; i < workDim; i++) {
-        uint32_t requiredWorkItemsCount = maxWorkGroupSize;
-        while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
-            requiredWorkItemsCount >>= 1;
-        itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
-    }
-    if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
-        while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
-            if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
-                itemsPowerOfTwoDivisors[0] >>= 1;
-            else
-                itemsPowerOfTwoDivisors[1] >>= 1;
-        }
-        for (auto i = 0u; i < 3; i++)
-            workGroupSize[i] = itemsPowerOfTwoDivisors[i];
-        return;
-
-    } else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
-        computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
-        return;
-    } else {
-        for (auto i = 0u; i < workDim; i++)
-            workGroupSize[i] = workItems[i];
-        return;
-    }
-}
-
-void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
-    for (int i = 0; i < 3; i++)
-        workGroupSize[i] = 1;
-
-    uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2];
-
-    UNRECOVERABLE_IF(wsInfo.simdSize == 0);
-
-    //Find biggest power of two which devide each dimension size
-    if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
-        if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
-            computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
-            return;
-        }
-
-        size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
-        for (auto i = 0u; i < workDim; i++) {
-            uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
-            while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
-                requiredWorkItemsCount >>= 1;
-            itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
-        }
-
-        bool canUseNx4 = (wsInfo.imgUsed &&
-                          (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) &&
-                          itemsPowerOfTwoDivisors[1] >= 4);
-
-        //If computed dimension sizes which are powers of two are creating group which is
-        //bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
-        uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
-        if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) {
-            computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
-            return;
-        }
-        //If coputed workgroup is at this point in correct size
-        else if (allItems >= wsInfo.simdSize) {
-            itemsPowerOfTwoDivisors[1] = canUseNx4 ? 4 : itemsPowerOfTwoDivisors[1];
-            for (auto i = 0u; i < workDim; i++)
-                workGroupSize[i] = itemsPowerOfTwoDivisors[i];
-            return;
-        }
-    }
-    //If dimensions are not powers of two but total number of items is less than max work group size
-    if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) {
-        for (auto i = 0u; i < workDim; i++)
-            workGroupSize[i] = workItems[i];
-        return;
-    } else {
-        if (workDim == 1)
-            computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize);
-        else {
-            uint32_t xyzFactors[3][1024];
-            uint32_t xyzFactorsLen[3] = {};
-
-            //check if algorithm should use ratio
-            wsInfo.checkRatio(workItems);
-
-            //find all divisors for all dimensions
-            for (int i = 0; i < 3; i++)
-                xyzFactors[i][xyzFactorsLen[i]++] = 1;
-            for (auto i = 0u; i < workDim; i++) {
-                for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) {
-                    if ((workItems[i] % j) == 0) {
-                        xyzFactors[i][xyzFactorsLen[i]++] = j;
-                    }
-                }
-            }
-            if (wsInfo.useRatio) {
-                choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo);
-                if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) {
-                    wsInfo.useStrictRatio = false;
-                    choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo);
-                }
-            } else
-                choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, workDim);
-        }
-    }
-}
-
 Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
     size_t workGroupSize[3] = {};
     auto kernel = dispatchInfo.getKernel();
@@ -453,25 +68,10 @@ Vec3<size_t> generateWorkgroupSize(const DispatchInfo &dispatchInfo) {
     return (dispatchInfo.getEnqueuedWorkgroupSize().x == 0) ? computeWorkgroupSize(dispatchInfo) : dispatchInfo.getEnqueuedWorkgroupSize();
 }
 
-Vec3<size_t> computeWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
-    return (Vec3<size_t>(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0),
-                         gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0),
-                         gws.z / lws.z + ((gws.z % lws.z) ? 1 : 0)));
-}
-
-Vec3<size_t> generateWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
-    return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3<size_t>(0, 0, 0);
-}
-
 Vec3<size_t> generateWorkgroupsNumber(const DispatchInfo &dispatchInfo) {
     return generateWorkgroupsNumber(dispatchInfo.getGWS(), dispatchInfo.getLocalWorkgroupSize());
 }
 
-Vec3<size_t> canonizeWorkgroup(const Vec3<size_t> &workgroup) {
-    return ((workgroup.x > 0) ? Vec3<size_t>({workgroup.x, std::max(workgroup.y, static_cast<size_t>(1)), std::max(workgroup.z, static_cast<size_t>(1))})
-                              : Vec3<size_t>(0, 0, 0));
-}
-
 void provideLocalWorkGroupSizeHints(Context *context, DispatchInfo dispatchInfo) {
     if (context != nullptr && context->isProvidingPerformanceHints() && dispatchInfo.getDim() <= 3) {
         size_t preferredWorkGroupSize[3];
diff --git a/opencl/source/helpers/dispatch_info_builder.h b/opencl/source/helpers/dispatch_info_builder.h
index e1fd7db2d6..67a458738e 100644
--- a/opencl/source/helpers/dispatch_info_builder.h
+++ b/opencl/source/helpers/dispatch_info_builder.h
@@ -6,6 +6,7 @@
  */
 
 #pragma once
+#include "shared/source/helpers/local_work_size.h"
 
 #include "opencl/source/command_queue/gpgpu_walker.h"
 #include "opencl/source/helpers/dispatch_info.h"
diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
index 6f8fe78814..0365f0cc85 100644
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@@ -7,6 +7,7 @@
 
 #include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/helpers/hw_helper.h"
+#include "shared/source/helpers/local_work_size.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/source/utilities/perf_counter.h"
 #include "shared/source/utilities/tag_allocator.h"
diff --git a/opencl/test/unit_test/command_queue/local_work_size_tests.cpp b/opencl/test/unit_test/command_queue/local_work_size_tests.cpp
index d2e3d56f43..6b2b87419d 100644
--- a/opencl/test/unit_test/command_queue/local_work_size_tests.cpp
+++ b/opencl/test/unit_test/command_queue/local_work_size_tests.cpp
@@ -5,6 +5,7 @@
  *
  */
 
+#include "shared/source/helpers/local_work_size.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/mocks/mock_device.h"
 
diff --git a/opencl/test/unit_test/command_queue/work_group_size_tests.cpp b/opencl/test/unit_test/command_queue/work_group_size_tests.cpp
index c4656e4c21..15b21afb62 100644
--- a/opencl/test/unit_test/command_queue/work_group_size_tests.cpp
+++ b/opencl/test/unit_test/command_queue/work_group_size_tests.cpp
@@ -5,6 +5,7 @@
  *
  */
 
+#include "shared/source/helpers/local_work_size.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 
 #include "opencl/source/command_queue/gpgpu_walker.h"
@@ -58,10 +59,11 @@ struct WorkGroupSizeBase {
             WorkSizeInfo wsInfo(maxWorkGroupSize, 0u, simdSize, 0u, ::defaultHwInfo.get(), 32u, 0u, false, false);
             computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dims);
         } else {
-            if (dims == 1)
+            if (dims == 1) {
                 computeWorkgroupSize1D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
-            else
+            } else {
                 computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
+            }
         }
         auto totalWorkItems = workItems[0] * workItems[1] * workItems[2];
         auto localWorkItems = workGroupSize[0] * workGroupSize[1] * workGroupSize[2];
diff --git a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
index 8f462f6749..b40c91f5b7 100644
--- a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
+++ b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
@@ -5,6 +5,7 @@
  *
  */
 
+#include "shared/source/helpers/local_work_size.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 
diff --git a/shared/source/helpers/CMakeLists.txt b/shared/source/helpers/CMakeLists.txt
index bf1d250499..358afc75df 100644
--- a/shared/source/helpers/CMakeLists.txt
+++ b/shared/source/helpers/CMakeLists.txt
@@ -77,6 +77,8 @@ set(NEO_CORE_HELPERS
     ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_special.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_sse4.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/local_work_size.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/local_work_size.h
     ${CMAKE_CURRENT_SOURCE_DIR}/neo_driver_version.h
     ${CMAKE_CURRENT_SOURCE_DIR}/non_copyable_or_moveable.h
     ${CMAKE_CURRENT_SOURCE_DIR}/options.h
diff --git a/shared/source/helpers/local_work_size.cpp b/shared/source/helpers/local_work_size.cpp
new file mode 100644
index 0000000000..f0c882e364
--- /dev/null
+++ b/shared/source/helpers/local_work_size.cpp
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2018-2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/helpers/local_work_size.h"
+
+#include "shared/source/debug_settings/debug_settings_manager.h"
+#include "shared/source/helpers/array_count.h"
+#include "shared/source/helpers/basic_math.h"
+#include "shared/source/helpers/debug_helpers.h"
+#include "shared/source/helpers/hw_helper.h"
+#include "shared/source/program/kernel_info.h"
+
+#include <cmath>
+#include <cstdint>
+#include <ctime>
+
+namespace NEO {
+
+//threshold used to determine what kind of device is underneath
+//big cores like SKL have 8EU * 7 HW threads per subslice and are considered as highThreadCount devices
+constexpr uint32_t highThreadCountThreshold = 56u;
+
+static const uint32_t optimalHardwareThreadCountGeneric[] = {32, 16, 8, 4, 2, 1};
+
+static const uint32_t primeNumbers[] = {
+    251,
+    241,
+    239, 233,
+    229, 227, 223,
+    211,
+    199, 197, 193, 191,
+    181,
+    179, 173,
+    167, 163,
+    157, 151,
+    149,
+    139, 137, 131,
+    127,
+    113,
+    109, 107, 103, 101,
+    97,
+    89, 83,
+    79, 73, 71,
+    67, 61,
+    59, 53,
+    47, 43, 41,
+    37, 31,
+    29, 23,
+    19, 17, 13, 11,
+    7, 5, 3, 2};
+
+static const size_t MAX_PRIMES = sizeof(primeNumbers) / sizeof(primeNumbers[0]);
+
+// Recursive template function to test prime factors
+template <uint32_t primeIndex>
+static inline uint32_t factor(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
+    auto primeNumber = primeNumbers[primeIndex];
+
+    auto newWorkSize = workSize * primeNumber;
+    if (newWorkSize <= workItems) {
+        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
+            workSize = newWorkSize;
+            newWorkSize = workSize * primeNumber;
+        }
+
+        workSize = factor<primeIndex - 1>(workItems, workSize, maxWorkGroupSize);
+    }
+
+    return workSize;
+}
+
+// Terminator of recursive factoring logic
+template <>
+inline uint32_t factor<0>(size_t workItems, uint32_t workSize, uint32_t maxWorkGroupSize) {
+    uint32_t primeIndex = 0;
+    auto primeNumber = primeNumbers[primeIndex];
+
+    auto newWorkSize = workSize * primeNumber;
+    if (newWorkSize <= workItems) {
+        while (newWorkSize <= maxWorkGroupSize && (workItems % newWorkSize) == 0) {
+            workSize = newWorkSize;
+            newWorkSize = workSize * primeNumber;
+        }
+    }
+
+    return workSize;
+}
+
+void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo, size_t workGroupSize[3], const uint32_t workDim, bool canUseNx4) {
+    uint32_t targetIndex = (canUseNx4 || workGroupInfo.numThreadsPerSubSlice < highThreadCountThreshold) ? 2 : 0;
+    auto arraySize = arrayCount(optimalHardwareThreadCountGeneric);
+    auto simdSize = workGroupInfo.simdSize;
+
+    while (targetIndex < arraySize &&
+           optimalHardwareThreadCountGeneric[targetIndex] > 1 &&
+           workGroupInfo.maxWorkGroupSize < optimalHardwareThreadCountGeneric[targetIndex] * simdSize) {
+        targetIndex++;
+    }
+    uint32_t optimalLocalThreads = optimalHardwareThreadCountGeneric[targetIndex];
+
+    if (workDim == 2) {
+        uint32_t xDim, yDim;
+        xDim = uint32_t(optimalLocalThreads * simdSize) / (canUseNx4 ? 4 : 1);
+        while (xDim > workItems[0])
+            xDim = xDim >> 1;
+        yDim = canUseNx4 ? 4 : (uint32_t(optimalLocalThreads * simdSize) / xDim);
+        workGroupSize[0] = xDim;
+        workGroupSize[1] = yDim;
+    } else {
+        uint32_t xDim, yDim, zDim;
+        xDim = uint32_t(optimalLocalThreads * simdSize);
+        while (xDim > workItems[0])
+            xDim = xDim >> 1;
+        yDim = uint32_t(optimalLocalThreads * simdSize) / xDim;
+        while (yDim > workItems[1])
+            yDim = yDim >> 1;
+        UNRECOVERABLE_IF((xDim * yDim) == 0);
+        zDim = uint32_t(optimalLocalThreads * simdSize) / (xDim * yDim);
+        workGroupSize[0] = xDim;
+        workGroupSize[1] = yDim;
+        workGroupSize[2] = zDim;
+    }
+}
+
+void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo) {
+    float ratioDiff = 0;
+    float localRatio = float(0xffffffff);
+    uint64_t localWkgs = 0xffffffff;
+    uint64_t workGroups;
+    for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) {
+        for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) {
+
+            uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx];
+            uint32_t Ydim = xyzFactors[1][YFactorsIdx];
+
+            if ((Xdim * Ydim) > wsInfo.maxWorkGroupSize) {
+                break;
+            }
+            if ((Xdim * Ydim) < wsInfo.minWorkGroupSize) {
+                continue;
+            }
+
+            workGroups = Math::divideAndRoundUp(workItems[0], Xdim);
+            workGroups *= Math::divideAndRoundUp(workItems[1], Ydim);
+
+            ratioDiff = log((float)Xdim) - log((float)Ydim);
+            ratioDiff = fabs(wsInfo.targetRatio - ratioDiff);
+
+            if (wsInfo.useStrictRatio == true) {
+                if (ratioDiff < localRatio) {
+                    workGroupSize[0] = Xdim;
+                    workGroupSize[1] = Ydim;
+                    localRatio = ratioDiff;
+                    localWkgs = workGroups;
+                }
+            } else {
+                if ((workGroups < localWkgs) ||
+                    ((workGroups == localWkgs) && (ratioDiff < localRatio))) {
+                    workGroupSize[0] = Xdim;
+                    workGroupSize[1] = Ydim;
+                    localRatio = ratioDiff;
+                    localWkgs = workGroups;
+                }
+            }
+        }
+    }
+}
+void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, uint32_t workdim) {
+    uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
+    uint64_t workGroups;
+    for (uint32_t ZFactorsIdx = 0; ZFactorsIdx < xyzFactorsLen[2]; ++ZFactorsIdx) {
+        for (uint32_t XFactorsIdx = 0; XFactorsIdx < xyzFactorsLen[0]; ++XFactorsIdx) {
+            for (uint32_t YFactorsIdx = 0; YFactorsIdx < xyzFactorsLen[1]; ++YFactorsIdx) {
+
+                uint32_t Xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - XFactorsIdx];
+                uint32_t Ydim = xyzFactors[1][YFactorsIdx];
+                uint32_t Zdim = xyzFactors[2][ZFactorsIdx];
+
+                if ((Xdim * Ydim * Zdim) > wsInfo.maxWorkGroupSize) {
+                    break;
+                }
+                if ((Xdim * Ydim * Zdim) < wsInfo.minWorkGroupSize) {
+                    continue;
+                }
+
+                workGroups = Math::divideAndRoundUp(workItems[0], Xdim);
+                workGroups *= Math::divideAndRoundUp(workItems[1], Ydim);
+                workGroups *= Math::divideAndRoundUp(workItems[2], Zdim);
+                uint64_t euThrdsDispatched;
+
+                euThrdsDispatched = Math::divideAndRoundUp(Xdim * Ydim * Zdim, wsInfo.simdSize);
+                euThrdsDispatched *= workGroups;
+
+                if (euThrdsDispatched < localEuThrdsDispatched) {
+                    localEuThrdsDispatched = euThrdsDispatched;
+                    workGroupSize[0] = Xdim;
+                    workGroupSize[1] = Ydim;
+                    workGroupSize[2] = Zdim;
+                }
+            }
+        }
+    }
+}
+
+void setSpecialWorkgroupSize(size_t workgroupSize[3]) {
+    workgroupSize[0] = 1;
+    workgroupSize[1] = 1;
+    workgroupSize[2] = 1;
+}
+
+void computeWorkgroupSize1D(uint32_t maxWorkGroupSize,
+                            size_t workGroupSize[3],
+                            const size_t workItems[3],
+                            size_t simdSize) {
+    auto items = workItems[0];
+
+    // Determine the LSB set to quickly handle factors of 2
+    auto numBits = Math::getMinLsbSet(static_cast<uint32_t>(items));
+
+    // Clamp power of 2 result to maxWorkGroupSize
+    uint32_t workSize = 1u << numBits;
+
+    //Assumes maxWorkGroupSize is a power of two.
+    DEBUG_BREAK_IF((maxWorkGroupSize & (maxWorkGroupSize - 1)) != 0);
+    workSize = std::min(workSize, maxWorkGroupSize);
+
+    // Try all primes as potential factors
+    workSize = factor<MAX_PRIMES - 1>(items, workSize, maxWorkGroupSize);
+
+    workGroupSize[0] = workSize;
+    workGroupSize[1] = 1;
+    workGroupSize[2] = 1;
+}
+
+void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) {
+    uint32_t xFactors[1024];
+    uint32_t yFactors[1024];
+    uint32_t xFactorsLen = 0;
+    uint32_t yFactorsLen = 0;
+    uint64_t waste;
+    uint64_t localWSWaste = 0xffffffffffffffff;
+    uint64_t euThrdsDispatched;
+    uint64_t localEuThrdsDispatched = 0xffffffffffffffff;
+    uint64_t workGroups;
+    uint32_t xDim;
+    uint32_t yDim;
+
+    for (int i = 0; i < 3; i++)
+        workGroupSize[i] = 1;
+
+    for (uint32_t i = 2; i <= maxWorkGroupSize; i++) {
+        if ((workItems[0] % i) == 0) {
+            xFactors[xFactorsLen++] = i;
+        }
+        if (((workItems[1] % i) == 0)) {
+            yFactors[yFactorsLen++] = i;
+        }
+    }
+
+    for (uint32_t xFactorsIdx = 0; xFactorsIdx < xFactorsLen; ++xFactorsIdx) {
+        for (uint32_t yFactorsIdx = 0; yFactorsIdx < yFactorsLen; ++yFactorsIdx) {
+            // Pick a LocalWorkSize that is a multiple as well as appropriate:
+            // 1 <= workGroupSize[ 0 ] <= workItems[ 0 ]
+            // 1 <= workGroupSize[ 1 ] <= workItems[ 1 ]
+            xDim = xFactors[xFactorsLen - 1 - xFactorsIdx];
+            yDim = yFactors[yFactorsIdx];
+
+            if ((xDim * yDim) > maxWorkGroupSize) {
+                // The yDim value is too big, so break out of this loop.
+                // No other entries will work.
+                break;
+            }
+
+            // Find the wasted channels.
+            workGroups = Math::divideAndRoundUp(workItems[0], xDim);
+            workGroups *= Math::divideAndRoundUp(workItems[1], yDim);
+
+            // Compaction Mode!
+            euThrdsDispatched = Math::divideAndRoundUp(xDim * yDim, simdSize);
+            euThrdsDispatched *= workGroups;
+
+            waste = simdSize - ((xDim * yDim - 1) & (simdSize - 1));
+            waste *= workGroups;
+
+            if (((euThrdsDispatched < localEuThrdsDispatched) ||
+                 ((euThrdsDispatched == localEuThrdsDispatched) && (waste < localWSWaste)))) {
+                localWSWaste = waste;
+                localEuThrdsDispatched = euThrdsDispatched;
+                workGroupSize[0] = xDim;
+                workGroupSize[1] = yDim;
+            }
+        }
+    }
+}
+
+void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
+    for (int i = 0; i < 3; i++)
+        workGroupSize[i] = 1;
+    size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
+    for (auto i = 0u; i < workDim; i++) {
+        uint32_t requiredWorkItemsCount = maxWorkGroupSize;
+        while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
+            requiredWorkItemsCount >>= 1;
+        itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
+    }
+    if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
+        while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
+            if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
+                itemsPowerOfTwoDivisors[0] >>= 1;
+            else
+                itemsPowerOfTwoDivisors[1] >>= 1;
+        }
+        for (auto i = 0u; i < 3; i++)
+            workGroupSize[i] = itemsPowerOfTwoDivisors[i];
+        return;
+
+    } else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
+        computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
+        return;
+    } else {
+        for (auto i = 0u; i < workDim; i++)
+            workGroupSize[i] = workItems[i];
+        return;
+    }
+}
+
+void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
+    for (int i = 0; i < 3; i++)
+        workGroupSize[i] = 1;
+
+    uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2];
+
+    UNRECOVERABLE_IF(wsInfo.simdSize == 0);
+
+    //Find biggest power of two which devide each dimension size
+    if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
+        if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
+            computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
+            return;
+        }
+
+        size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
+        for (auto i = 0u; i < workDim; i++) {
+            uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
+            while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
+                requiredWorkItemsCount >>= 1;
+            itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
+        }
+
+        bool canUseNx4 = (wsInfo.imgUsed &&
+                          (itemsPowerOfTwoDivisors[0] >= 4 || (itemsPowerOfTwoDivisors[0] >= 2 && wsInfo.simdSize == 8)) &&
+                          itemsPowerOfTwoDivisors[1] >= 4);
+
+        //If computed dimension sizes which are powers of two are creating group which is
+        //bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
+        uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
+        if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) {
+            computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
+            return;
+        }
+        //If coputed workgroup is at this point in correct size
+        else if (allItems >= wsInfo.simdSize) {
+            itemsPowerOfTwoDivisors[1] = canUseNx4 ? 4 : itemsPowerOfTwoDivisors[1];
+            for (auto i = 0u; i < workDim; i++)
+                workGroupSize[i] = itemsPowerOfTwoDivisors[i];
+            return;
+        }
+    }
+    //If dimensions are not powers of two but total number of items is less than max work group size
+    if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) {
+        for (auto i = 0u; i < workDim; i++)
+            workGroupSize[i] = workItems[i];
+        return;
+    } else {
+        if (workDim == 1)
+            computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize);
+        else {
+            uint32_t xyzFactors[3][1024];
+            uint32_t xyzFactorsLen[3] = {};
+
+            //check if algorithm should use ratio
+            wsInfo.checkRatio(workItems);
+
+            //find all divisors for all dimensions
+            for (int i = 0; i < 3; i++)
+                xyzFactors[i][xyzFactorsLen[i]++] = 1;
+            for (auto i = 0u; i < workDim; i++) {
+                for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) {
+                    if ((workItems[i] % j) == 0) {
+                        xyzFactors[i][xyzFactorsLen[i]++] = j;
+                    }
+                }
+            }
+            if (wsInfo.useRatio) {
+                choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo);
+                if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) {
+                    wsInfo.useStrictRatio = false;
+                    choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo);
+                }
+            } else
+                choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, workDim);
+        }
+    }
+}
+
+Vec3<size_t> computeWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
+    return (Vec3<size_t>(gws.x / lws.x + ((gws.x % lws.x) ? 1 : 0),
+                         gws.y / lws.y + ((gws.y % lws.y) ? 1 : 0),
+                         gws.z / lws.z + ((gws.z % lws.z) ? 1 : 0)));
+}
+
+Vec3<size_t> generateWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) {
+    return (lws.x > 0) ? computeWorkgroupsNumber(gws, lws) : Vec3<size_t>(0, 0, 0);
+}
+
+Vec3<size_t> canonizeWorkgroup(const Vec3<size_t> &workgroup) {
+    return ((workgroup.x > 0) ? Vec3<size_t>({workgroup.x, std::max(workgroup.y, static_cast<size_t>(1)), std::max(workgroup.z, static_cast<size_t>(1))})
+                              : Vec3<size_t>(0, 0, 0));
+}
+
+} // namespace NEO
diff --git a/shared/source/helpers/local_work_size.h b/shared/source/helpers/local_work_size.h
new file mode 100644
index 0000000000..da4d8f230e
--- /dev/null
+++ b/shared/source/helpers/local_work_size.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2018-2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+
+#include "shared/source/helpers/vec.h"
+
+#include <algorithm>
+
+namespace NEO {
+struct WorkSizeInfo;
+
+void computeWorkgroupSize1D(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize);
+
+void computeWorkgroupSizeND(
+    WorkSizeInfo &wsInfo,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    const uint32_t workDim);
+
+void computeWorkgroupSize2D(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize);
+
+void computeWorkgroupSizeSquared(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize,
+    const uint32_t workDim);
+
+Vec3<size_t> computeWorkgroupsNumber(
+    const Vec3<size_t> &gws,
+    const Vec3<size_t> &lws);
+
+Vec3<size_t> generateWorkgroupsNumber(
+    const Vec3<size_t> &gws,
+    const Vec3<size_t> &lws);
+
+inline uint32_t calculateDispatchDim(const Vec3<size_t> &dispatchSize, const Vec3<size_t> &dispatchOffset) {
+    return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
+}
+
+Vec3<size_t> canonizeWorkgroup(
+    const Vec3<size_t> &workgroup);
+
+void setSpecialWorkgroupSize(size_t workgroupSize[3]);
+
+inline uint32_t computeDimensions(const size_t workItems[3]) {
+    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2
+                                                       : 1;
+}
+
+} // namespace NEO