diff --git a/opencl/test/unit_test/command_queue/local_work_size_tests.cpp b/opencl/test/unit_test/command_queue/local_work_size_tests.cpp index 594493fbcb..db0f9636da 100644 --- a/opencl/test/unit_test/command_queue/local_work_size_tests.cpp +++ b/opencl/test/unit_test/command_queue/local_work_size_tests.cpp @@ -947,3 +947,31 @@ TEST_F(LocalWorkSizeTest, givenMaxWorkgroupSizeEqualToSimdSizeWhenLwsIsCalculate EXPECT_EQ(workGroupSize[1], 1u); EXPECT_EQ(workGroupSize[2], 1u); } + +TEST_F(LocalWorkSizeTest, givenGwsWithSmallXAndBigYWhenLwsIsCalculatedThenDescendingOrderIsNotEnforced) { + WorkSizeInfo wsInfo(256u, true, 32u, 0u, rootDeviceEnvironment, 0u, 0u, false, false, false); + + uint32_t workDim = 3; + size_t workGroup[3] = {2, 1024, 1}; + size_t workGroupSize[3]; + + NEO::choosePrefferedWorkgroupSize(wsInfo, workGroupSize, workGroup, workDim); + EXPECT_EQ(workGroupSize[0], 2u); + EXPECT_EQ(workGroupSize[1], 128u); + EXPECT_EQ(workGroupSize[2], 1u); + + // Enforce strict ratio requirement + wsInfo.yTiledSurfaces = true; + NEO::choosePrefferedWorkgroupSize(wsInfo, workGroupSize, workGroup, workDim); + EXPECT_EQ(workGroupSize[0], 2u); + EXPECT_EQ(workGroupSize[1], 128u); + EXPECT_EQ(workGroupSize[2], 1u); + + // Enforce ratio requirement + wsInfo.yTiledSurfaces = false; + wsInfo.slmTotalSize = 128U; + NEO::choosePrefferedWorkgroupSize(wsInfo, workGroupSize, workGroup, workDim); + EXPECT_EQ(workGroupSize[0], 2u); + EXPECT_EQ(workGroupSize[1], 128u); + EXPECT_EQ(workGroupSize[2], 1u); +} diff --git a/shared/source/helpers/local_work_size.cpp b/shared/source/helpers/local_work_size.cpp index fd2efe7caa..434b6c342e 100644 --- a/shared/source/helpers/local_work_size.cpp +++ b/shared/source/helpers/local_work_size.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -124,18 +124,16 @@ void computePowerOfTwoLWS(const size_t workItems[3], WorkSizeInfo &workGroupInfo } } 
-void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo) { - float ratioDiff = 0; - float localRatio = float(0xffffffff); - uint64_t localWkgs = 0xffffffff; - uint64_t workGroups; +void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) { + float localRatio = std::numeric_limits<float>::max(); + uint64_t localNumWorkgroups = std::numeric_limits<uint64_t>::max(); for (uint32_t xFactorsIdx = 0; xFactorsIdx < xyzFactorsLen[0]; ++xFactorsIdx) { for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) { uint32_t xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - xFactorsIdx]; uint32_t ydim = xyzFactors[1][yFactorsIdx]; - if (ydim > xdim) { + if (enforceDescendingOrder && ydim > xdim) { break; } @@ -146,65 +144,57 @@ void choosePreferredWorkGroupSizeWithRatio(uint32_t xyzFactors[3][1024], uint32_ continue; } - workGroups = Math::divideAndRoundUp(workItems[0], xdim); - workGroups *= Math::divideAndRoundUp(workItems[1], ydim); + uint64_t numWorkGroups = Math::divideAndRoundUp(workItems[0], xdim); + numWorkGroups *= Math::divideAndRoundUp(workItems[1], ydim); - ratioDiff = log((float)xdim) - log((float)ydim); + float ratioDiff = log(static_cast<float>(xdim)) - log(static_cast<float>(ydim)); ratioDiff = fabs(wsInfo.targetRatio - ratioDiff); - if (wsInfo.useStrictRatio == true) { - if (ratioDiff < localRatio) { - workGroupSize[0] = xdim; - workGroupSize[1] = ydim; - localRatio = ratioDiff; - localWkgs = workGroups; - } - } else { - if ((workGroups < localWkgs) || - ((workGroups == localWkgs) && (ratioDiff < localRatio))) { - workGroupSize[0] = xdim; - workGroupSize[1] = ydim; - localRatio = ratioDiff; - localWkgs = workGroups; - } + bool setWorkGroupSize = wsInfo.useStrictRatio + ? 
(ratioDiff < localRatio) + : (numWorkGroups < localNumWorkgroups) || ((numWorkGroups == localNumWorkgroups) && (ratioDiff < localRatio)); + if (setWorkGroupSize) { + workGroupSize[0] = xdim; + workGroupSize[1] = ydim; + localRatio = ratioDiff; + localNumWorkgroups = numWorkGroups; } } } } -void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, uint32_t workdim) { - uint64_t localEuThrdsDispatched = 0xffffffffffffffff; - uint64_t workGroups; + +void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) { + uint64_t localEuThrdsDispatched = std::numeric_limits<uint64_t>::max(); for (uint32_t xFactorsIdx = 0; xFactorsIdx < xyzFactorsLen[0]; ++xFactorsIdx) { - for (uint32_t zFactorsIdx = 0; zFactorsIdx < xyzFactorsLen[2]; ++zFactorsIdx) { - for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) { + for (uint32_t yFactorsIdx = 0; yFactorsIdx < xyzFactorsLen[1]; ++yFactorsIdx) { + for (uint32_t zFactorsIdx = 0; zFactorsIdx < xyzFactorsLen[2]; ++zFactorsIdx) { uint32_t xdim = xyzFactors[0][xyzFactorsLen[0] - 1 - xFactorsIdx]; - uint32_t ydim = xyzFactors[1][yFactorsIdx]; - uint32_t zdim = xyzFactors[2][zFactorsIdx]; + uint32_t ydim = xyzFactors[1][xyzFactorsLen[1] - 1 - yFactorsIdx]; + uint32_t zdim = xyzFactors[2][xyzFactorsLen[2] - 1 - zFactorsIdx]; - if (zdim > ydim) { + if (enforceDescendingOrder) { + if (ydim > xdim) { + break; + } else if (zdim > ydim) { + continue; + } + } + + uint32_t numItemsInWorkGroup = xdim * ydim * zdim; + if (numItemsInWorkGroup > wsInfo.maxWorkGroupSize) { continue; } - if (ydim > xdim) { + if (numItemsInWorkGroup < wsInfo.minWorkGroupSize) { break; } - if ((xdim * ydim * zdim) > wsInfo.maxWorkGroupSize) { - break; - } - if ((xdim * ydim * zdim) < 
wsInfo.minWorkGroupSize) { - continue; - } - - workGroups = Math::divideAndRoundUp(workItems[0], xdim); - workGroups *= Math::divideAndRoundUp(workItems[1], ydim); - workGroups *= Math::divideAndRoundUp(workItems[2], zdim); - uint64_t euThrdsDispatched; - - euThrdsDispatched = Math::divideAndRoundUp(xdim * ydim * zdim, wsInfo.simdSize); - euThrdsDispatched *= workGroups; - + uint64_t numWorkGroups = Math::divideAndRoundUp(workItems[0], xdim); + numWorkGroups *= Math::divideAndRoundUp(workItems[1], ydim); + numWorkGroups *= Math::divideAndRoundUp(workItems[2], zdim); + uint64_t numThreadsPerWorkGroup = Math::divideAndRoundUp(numItemsInWorkGroup, wsInfo.simdSize); + uint64_t euThrdsDispatched = numThreadsPerWorkGroup * numWorkGroups; if (euThrdsDispatched < localEuThrdsDispatched) { localEuThrdsDispatched = euThrdsDispatched; workGroupSize[0] = xdim; @@ -216,10 +206,7 @@ void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint } } -void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize) { +void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) { auto items = workItems[0]; // Determine the LSB set to quickly handle factors of 2 @@ -240,6 +227,42 @@ void computeWorkgroupSize1D(uint32_t maxWorkGroupSize, workGroupSize[2] = 1; } +void choosePreferredWorkgroupSize(uint32_t xyzFactors[3][1024], uint32_t xyzFactorsLen[3], size_t workGroupSize[3], const size_t workItems[3], WorkSizeInfo &wsInfo, bool enforceDescendingOrder) { + // check if algorithm should use ratio + wsInfo.checkRatio(workItems); + + if (wsInfo.useRatio) { + choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder); + if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= wsInfo.simdSize) { + wsInfo.useStrictRatio = false; + 
choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder); + } + } else { + choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, enforceDescendingOrder); + } +} + +void choosePrefferedWorkgroupSize(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) { + // find all divisors for all dimensions + uint32_t xyzFactors[3][1024]; + uint32_t xyzFactorsLen[3] = {}; + for (int i = 0; i < 3; i++) + xyzFactors[i][xyzFactorsLen[i]++] = 1; + for (auto i = 0u; i < workDim; i++) { + for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) { + if ((workItems[i] % j) == 0) { + xyzFactors[i][xyzFactorsLen[i]++] = j; + } + } + } + + choosePreferredWorkgroupSize(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, true); + size_t wgs = workGroupSize[0] * workGroupSize[1] * workGroupSize[2]; + if (wgs * 2 <= wsInfo.simdSize) { + choosePreferredWorkgroupSize(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, false); + } +} + void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize) { uint32_t xFactors[1024]; uint32_t yFactors[1024]; @@ -336,15 +359,12 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const for (int i = 0; i < 3; i++) workGroupSize[i] = 1; - uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2]; - UNRECOVERABLE_IF(wsInfo.simdSize == 0); // Find biggest power of two which devide each dimension size if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) { if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) { - computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim); - return; + return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim); } size_t 
itemsPowerOfTwoDivisors[3] = {1, 1, 1}; @@ -363,8 +383,7 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const // bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2]; if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) { - computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4); - return; + return computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4); } // If coputed workgroup is at this point in correct size else if (allItems >= wsInfo.simdSize) { @@ -374,41 +393,20 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const return; } } + + uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2]; // If dimensions are not powers of two but total number of items is less than max work group size if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) { for (auto i = 0u; i < workDim; i++) workGroupSize[i] = workItems[i]; return; - } else { - if (workDim == 1) - computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize); - else { - uint32_t xyzFactors[3][1024]; - uint32_t xyzFactorsLen[3] = {}; - - // check if algorithm should use ratio - wsInfo.checkRatio(workItems); - - // find all divisors for all dimensions - for (int i = 0; i < 3; i++) - xyzFactors[i][xyzFactorsLen[i]++] = 1; - for (auto i = 0u; i < workDim; i++) { - for (auto j = 2u; j < wsInfo.maxWorkGroupSize; ++j) { - if ((workItems[i] % j) == 0) { - xyzFactors[i][xyzFactorsLen[i]++] = j; - } - } - } - if (wsInfo.useRatio) { - choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo); - if (wsInfo.useStrictRatio && workGroupSize[0] * workGroupSize[1] * 2 <= 
wsInfo.simdSize) { - wsInfo.useStrictRatio = false; - choosePreferredWorkGroupSizeWithRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo); - } - } else - choosePreferredWorkGroupSizeWithOutRatio(xyzFactors, xyzFactorsLen, workGroupSize, workItems, wsInfo, workDim); - } } + + if (workDim == 1) { + return computeWorkgroupSize1D(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize); + } + + choosePrefferedWorkgroupSize(wsInfo, workGroupSize, workItems, workDim); } Vec3<size_t> computeWorkgroupsNumber(const Vec3<size_t> &gws, const Vec3<size_t> &lws) { diff --git a/shared/source/helpers/local_work_size.h b/shared/source/helpers/local_work_size.h index 8dcc0a45cd..50ea45dcc5 100644 --- a/shared/source/helpers/local_work_size.h +++ b/shared/source/helpers/local_work_size.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -39,6 +39,8 @@ void computeWorkgroupSizeSquared( size_t simdSize, const uint32_t workDim); +void choosePrefferedWorkgroupSize(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim); + Vec3<size_t> computeWorkgroupsNumber( const Vec3<size_t> &gws, const Vec3<size_t> &lws);