diff --git a/runtime/command_queue/local_work_size.cpp b/runtime/command_queue/local_work_size.cpp index 96de36f205..035d4b99a7 100644 --- a/runtime/command_queue/local_work_size.cpp +++ b/runtime/command_queue/local_work_size.cpp @@ -294,6 +294,37 @@ void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3], } } +void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) { + for (int i = 0; i < 3; i++) + workGroupSize[i] = 1; + size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; + for (auto i = 0u; i < workDim; i++) { + uint32_t requiredWorkItemsCount = maxWorkGroupSize; + while (requiredWorkItemsCount > 1 && !(Math::isDivisableByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount))) + requiredWorkItemsCount = requiredWorkItemsCount >> 1; + itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount; + } + if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) { + while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) { + if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1]) + itemsPowerOfTwoDivisors[0] = itemsPowerOfTwoDivisors[0] >> 1; + else + itemsPowerOfTwoDivisors[1] = itemsPowerOfTwoDivisors[1] >> 1; + } + for (auto i = 0u; i < 3; i++) + workGroupSize[i] = itemsPowerOfTwoDivisors[i]; + return; + + } else if (workItems[0] * workItems[1] > maxWorkGroupSize) { + computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize); + return; + } else { + for (auto i = 0u; i < workDim; i++) + workGroupSize[i] = workItems[i]; + return; + } +} + void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) { for (int i = 0; i < 3; i++) workGroupSize[i] = 1; @@ -303,8 +334,11 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const return; } else { //Find biggest power of two which devide each dimension size - if (wsInfo.slmTotalSize == 0 && wsInfo.hasBarriers == 0) { + if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) { + return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim); + } + size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; for (auto i = 0u; i < workDim; i++) { uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]); @@ -370,37 +404,6 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const } } -void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) { - for (int i = 0; i < 3; i++) - workGroupSize[i] = 1; - size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1}; - for (auto i = 0u; i < workDim; i++) { - uint32_t requiredWorkItemsCount = maxWorkGroupSize; - while (requiredWorkItemsCount > 1 && !(Math::isDivisableByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount))) - requiredWorkItemsCount = requiredWorkItemsCount >> 1; - itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount; - } - if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) { - while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) { - if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1]) - itemsPowerOfTwoDivisors[0] = itemsPowerOfTwoDivisors[0] >> 1; - else - itemsPowerOfTwoDivisors[1] = itemsPowerOfTwoDivisors[1] >> 1; - } - for (auto i = 0u; i < 3; i++) - workGroupSize[i] = itemsPowerOfTwoDivisors[i]; - return; - - } else if (workItems[0] * workItems[1] > maxWorkGroupSize) { - computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize); - return; - } else { - for (auto i = 0u; i < workDim; i++) - workGroupSize[i] = workItems[i]; - return; - } -} - Vec3 computeWorkgroupSize(const DispatchInfo &dispatchInfo) { size_t workGroupSize[3] = {}; if (dispatchInfo.getKernel() != nullptr) { @@ -452,20 +455,12 @@ Vec3 canonizeWorkgroup(Vec3 workgroup) { void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo) { if (context != nullptr && context->isProvidingPerformanceHints() && dispatchInfo.getDim() <= 3) { size_t preferredWorkGroupSize[3]; - size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z}; - if (DebugManager.flags.EnableComputeWorkSizeND.get()) { - WorkSizeInfo wsInfo(dispatchInfo); - computeWorkgroupSizeND(wsInfo, preferredWorkGroupSize, workItems, dispatchInfo.getDim()); - } else { - auto simd = dispatchInfo.getKernel()->getKernelInfo().getMaxSimdSize(); - if (dispatchInfo.getDim() == 1) - computeWorkgroupSize1D(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd); - else if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && dispatchInfo.getDim() == 2) { - computeWorkgroupSizeSquared(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd, dispatchInfo.getDim()); - } else { - computeWorkgroupSize2D(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd); - } - } + + auto lws = computeWorkgroupSize(dispatchInfo); + preferredWorkGroupSize[0] = lws.x; + preferredWorkGroupSize[1] = lws.y; + preferredWorkGroupSize[2] = lws.z; + if (dispatchInfo.getEnqueuedWorkgroupSize().x == 0) { context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, NULL_LOCAL_WORKGROUP_SIZE, dispatchInfo.getKernel()->getKernelInfo().name.c_str(), preferredWorkGroupSize[0], preferredWorkGroupSize[1], preferredWorkGroupSize[2]); diff --git a/runtime/os_interface/DebugVariables.def b/runtime/os_interface/DebugVariables.def index 542da92697..a1415cb9e5 100644 --- a/runtime/os_interface/DebugVariables.def +++ b/runtime/os_interface/DebugVariables.def @@ -68,7 +68,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableAsyncEventsHandler, true, "Enables async even DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for memory object") DECLARE_DEBUG_VARIABLE(int32_t, Enable64kbpages, -1, "-1: default behaviour, 0 Disables, 1 Enables support for 64KB pages for driver allocated fine grain svm buffers") DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables diffrent algorithm to compute local work size") -DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible") +DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, true, "Enables algorithm to compute the most squared work group as possible") DECLARE_DEBUG_VARIABLE(int32_t, OverrideEnableKmdNotify, -1, "-1: dont override, 0: disable, 1: enable") DECLARE_DEBUG_VARIABLE(int32_t, OverrideKmdNotifyDelayMs, -1, "-1: dont override, 0: infinite timeout, >0: timeout in ms") DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls") diff --git a/unit_tests/command_queue/local_work_size_tests.cpp b/unit_tests/command_queue/local_work_size_tests.cpp index 0ed1103be6..f3f13dcb4b 100644 --- a/unit_tests/command_queue/local_work_size_tests.cpp +++ b/unit_tests/command_queue/local_work_size_tests.cpp @@ -22,6 +22,7 @@ #include "runtime/command_queue/dispatch_walker.h" #include "runtime/helpers/options.h" +#include "unit_tests/helpers/debug_manager_state_restore.h" #include "gtest/gtest.h" using namespace OCLRT; @@ -95,8 +96,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca size_t workGroupSize[3]; OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim); - EXPECT_EQ(workGroupSize[0], 128u); - EXPECT_EQ(workGroupSize[1], 2u); + EXPECT_EQ(workGroupSize[0], 16u); + EXPECT_EQ(workGroupSize[1], 16u); EXPECT_EQ(workGroupSize[2], 1u); workGroup[0] = 48; @@ -115,6 +116,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca } TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual32WhenComputeCalledThenLocalGroupComputed) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.EnableComputeWorkSizeSquared.set(false); WorkSizeInfo wsInfo(256, 0u, 32, 0u, platformDevices[0]->pPlatform->eRenderCoreFamily, 32u, 0u, false, false); uint32_t workDim = 2; size_t workGroup[3] = {384, 96, 1}; @@ -232,8 +235,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual256WhenComputeCalledThenLo size_t workGroupSize[3]; OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim); - EXPECT_EQ(workGroupSize[0], 128u); - EXPECT_EQ(workGroupSize[1], 2u); + EXPECT_EQ(workGroupSize[0], 16u); + EXPECT_EQ(workGroupSize[1], 16u); EXPECT_EQ(workGroupSize[2], 1u); } @@ -267,8 +270,8 @@ TEST(localWorkSizeTest, givenKernelWithTwoDimensionalGlobalSizesWhenLwsIsCompute workGroup[0] = 1024; workGroup[1] = 1024; OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim); - EXPECT_EQ(workGroupSize[0], 256u); - EXPECT_EQ(workGroupSize[1], 1u); + EXPECT_EQ(workGroupSize[0], 16u); + EXPECT_EQ(workGroupSize[1], 16u); EXPECT_EQ(workGroupSize[2], 1u); } @@ -521,6 +524,8 @@ TEST(localWorkSizeTest, given2DimWorkWhenComputeSquaredCalledThenLocalGroupCompu } TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8WhenGwsIs1024ThenLwsIsComputedAsMaxOptimalMultipliedBySimd) { + DebugManagerStateRestore dbgRestore; + DebugManager.flags.EnableComputeWorkSizeSquared.set(false); WorkSizeInfo wsInfo(1024, 0u, 8, 0u, platformDevices[0]->pPlatform->eRenderCoreFamily, 32u, 0u, false, false); uint32_t workDim = 2; @@ -535,4 +540,7 @@ TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8When TEST(localWorkSizeTest, givenDebugVariableEnableComputeWorkSizeNDWhenCheckValueExpectTrue) { EXPECT_TRUE(DebugManager.flags.EnableComputeWorkSizeND.get()); +} +TEST(localWorkSizeTest, givenDefaultDebugVariablesWhenEnableComputeWorkSizeSquaredIsCheckdThenTrueIsReturned) { + EXPECT_TRUE(DebugManager.flags.EnableComputeWorkSizeSquared.get()); } \ No newline at end of file