From 1dad22a12a0c1d4547ba23b5d19dd6c6d145f295 Mon Sep 17 00:00:00 2001
From: Kamil Kopryk <kamil.kopryk@intel.com>
Date: Thu, 7 May 2020 13:51:31 +0200
Subject: [PATCH] Add isSpecialWorkgroupSizeRequired helper

Change-Id: Ic8d4471f48ed5f25eefa802444d0ea62ac0112da
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
Related-To: NEO-4648
---
 Jenkinsfile                                   |  2 +-
 opencl/source/command_queue/gpgpu_walker.h    |  2 ++
 .../source/command_queue/local_work_size.cpp  | 23 +++++++++++++++----
 .../command_queue/work_group_size_tests.cpp   |  9 ++++++++
 shared/source/helpers/hw_helper.h             |  4 ++++
 shared/source/helpers/hw_helper_base.inl      |  5 ++++
 6 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 606bbfd2ed..06a9ec677e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,5 +1,5 @@
 #!groovy
 dependenciesRevision='c8de469c107af1ced325c9857239bb187d139d1d-1411'
 strategy='EQUAL'
-allowedCD=247
+allowedCD=249
 allowedF=20
diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h
index a9e0c78235..e4b297a6ab 100644
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@@ -87,6 +87,8 @@ Vec3<size_t> canonizeWorkgroup(
 
 void provideLocalWorkGroupSizeHints(Context *context, DispatchInfo dispatchInfo);
 
+void setSpecialWorkgroupSize(size_t workgroupSize[3]);
+
 inline cl_uint computeDimensions(const size_t workItems[3]) {
     return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
 }
diff --git a/opencl/source/command_queue/local_work_size.cpp b/opencl/source/command_queue/local_work_size.cpp
index f75508d6a5..58d8235ba3 100644
--- a/opencl/source/command_queue/local_work_size.cpp
+++ b/opencl/source/command_queue/local_work_size.cpp
@@ -9,7 +9,9 @@
 #include "shared/source/helpers/array_count.h"
 #include "shared/source/helpers/basic_math.h"
 #include "shared/source/helpers/debug_helpers.h"
+#include "shared/source/helpers/hw_helper.h"
 
+#include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/context/context.h"
 #include "opencl/source/helpers/dispatch_info.h"
 #include "opencl/source/kernel/kernel.h"
@@ -207,6 +209,12 @@ void choosePreferredWorkGroupSizeWithOutRatio(uint32_t xyzFactors[3][1024], uint
     }
 }
 
+void setSpecialWorkgroupSize(size_t workgroupSize[3]) {
+    for (size_t dimension = 0; dimension < 3; ++dimension) {
+        workgroupSize[dimension] = 1; // every dimension of the special size is 1
+    }
+}
+
 void computeWorkgroupSize1D(uint32_t maxWorkGroupSize,
                             size_t workGroupSize[3],
                             const size_t workItems[3],
@@ -403,14 +411,21 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const
 
 Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
     size_t workGroupSize[3] = {};
-    if (dispatchInfo.getKernel() != nullptr) {
-        if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
+    auto kernel = dispatchInfo.getKernel();
+
+    if (kernel != nullptr) {
+        const auto &hwInfo = kernel->getDevice().getHardwareInfo();
+        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+
+        if (kernel->isBuiltIn && hwHelper.isSpecialWorkgroupSizeRequired(hwInfo)) {
+            setSpecialWorkgroupSize(workGroupSize);
+        } else if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
             WorkSizeInfo wsInfo(dispatchInfo);
             size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
             computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
         } else {
-            auto maxWorkGroupSize = dispatchInfo.getKernel()->maxKernelWorkGroupSize;
-            auto simd = dispatchInfo.getKernel()->getKernelInfo().getMaxSimdSize();
+            auto maxWorkGroupSize = kernel->maxKernelWorkGroupSize;
+            auto simd = kernel->getKernelInfo().getMaxSimdSize();
             size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
             if (dispatchInfo.getDim() == 1) {
                 computeWorkgroupSize1D(maxWorkGroupSize, workGroupSize, workItems, simd);
diff --git a/opencl/test/unit_test/command_queue/work_group_size_tests.cpp b/opencl/test/unit_test/command_queue/work_group_size_tests.cpp
index 1d69325cac..107f884167 100644
--- a/opencl/test/unit_test/command_queue/work_group_size_tests.cpp
+++ b/opencl/test/unit_test/command_queue/work_group_size_tests.cpp
@@ -348,3 +348,12 @@ INSTANTIATE_TEST_CASE_P(wgs,
                         ::testing::Combine(
                             ::testing::ValuesIn(simdSizes),
                             ::testing::ValuesIn(regionCases)));
+
+TEST(WorkgroupSizeTest, WhenSetSpecialWorkgroupSizeIsCalledThenWorkgroupSizeIsSetTo1x1x1) {
+    size_t workgroupSize[3] = {};
+    setSpecialWorkgroupSize(workgroupSize);
+
+    for (const auto dimensionSize : workgroupSize) {
+        EXPECT_EQ(1u, dimensionSize);
+    }
+}
diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h
index 59d00540dd..a453177c03 100644
--- a/shared/source/helpers/hw_helper.h
+++ b/shared/source/helpers/hw_helper.h
@@ -89,6 +89,8 @@ class HwHelper {
     virtual uint64_t getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const = 0;
     virtual uint32_t getBindlessSurfaceExtendedMessageDescriptorValue(uint32_t surfStateOffset) const = 0;
 
+    virtual bool isSpecialWorkgroupSizeRequired(const HardwareInfo &hwInfo) const = 0;
+
     static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo);
     static uint32_t getEnginesCount(const HardwareInfo &hwInfo);
     static uint32_t getCopyEnginesCount(const HardwareInfo &hwInfo);
@@ -233,6 +235,8 @@ class HwHelperHw : public HwHelper {
 
     uint64_t getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const override;
 
+    bool isSpecialWorkgroupSizeRequired(const HardwareInfo &hwInfo) const override;
+
   protected:
     static const AuxTranslationMode defaultAuxTranslationMode;
     HwHelperHw() = default;
diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl
index 001b2a0c14..61312b5d79 100644
--- a/shared/source/helpers/hw_helper_base.inl
+++ b/shared/source/helpers/hw_helper_base.inl
@@ -340,6 +340,11 @@ inline bool HwHelperHw<GfxFamily>::isFusedEuDispatchEnabled(const HardwareInfo &
     return false;
 }
 
+template <typename GfxFamily>
+inline bool HwHelperHw<GfxFamily>::isSpecialWorkgroupSizeRequired(const HardwareInfo &hwInfo) const {
+    return false; // generic implementation: hwInfo is not consulted, always reports false
+}
+
 template <typename GfxFamily>
 size_t MemorySynchronizationCommands<GfxFamily>::getSizeForFullCacheFlush() {
     return sizeof(typename GfxFamily::PIPE_CONTROL);