From 60c7587c2b9a3f7b3b7ad13173bc7e5bfd211956 Mon Sep 17 00:00:00 2001
From: Mateusz Hoppe <mateusz.hoppe@intel.com>
Date: Thu, 23 Jan 2020 15:52:49 +0100
Subject: [PATCH] Simplify HardwareCommandsHelper

Related-To: NEO-4175

Change-Id: I39b08353514ea0bf384b6b592f24952d0ed631e6
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
---
 core/helpers/hw_helper.h                      |  3 +-
 core/helpers/hw_helper_base.inl               |  2 +-
 .../command_queue/gpgpu_walker_bdw_plus.inl   |  6 +-
 .../hardware_interface_bdw_plus.inl           |  5 +-
 runtime/device_queue/device_queue_hw_base.inl |  7 ++-
 .../device_queue/device_queue_hw_bdw_plus.inl |  8 ++-
 runtime/gen12lp/hw_helper_gen12lp.cpp         |  2 +-
 runtime/helpers/hardware_commands_helper.h    | 27 +--------
 runtime/helpers/hardware_commands_helper.inl  | 15 ++---
 .../helpers/hardware_commands_helper_base.inl | 22 +------
 runtime/kernel/kernel.cpp                     | 33 +++++++++--
 runtime/kernel/kernel.h                       |  8 ++-
 .../enqueue_execution_model_kernel_tests.cpp  | 12 +++-
 unit_tests/gen12lp/CMakeLists.txt             |  3 +-
 unit_tests/gen12lp/gen12lp_tests_wrapper.cpp  |  3 +-
 unit_tests/gen12lp/tgllp/CMakeLists.txt       |  3 +-
 .../kernel_tests_tgllp.cpp}                   | 10 ++--
 .../hardware_commands_helper_tests.cpp        | 55 +++++++++++-------
 .../helpers/hardware_commands_helper_tests.h  | 10 +++-
 unit_tests/helpers/hw_helper_tests.cpp        |  3 +-
 unit_tests/kernel/kernel_tests.cpp            | 57 +++++++++++++++++++
 21 files changed, 185 insertions(+), 109 deletions(-)
 rename unit_tests/gen12lp/{hardware_commands_helper_tests_gen12lp.inl => tgllp/kernel_tests_tgllp.cpp} (70%)
diff --git a/core/helpers/hw_helper.h b/core/helpers/hw_helper.h
index 074b689cd4..86cdd42f3b 100644
--- a/core/helpers/hw_helper.h
+++ b/core/helpers/hw_helper.h
@@ -72,6 +72,7 @@ class HwHelper {
                                                    uint32_t threadsPerEu) = 0;
     virtual uint32_t alignSlmSize(uint32_t slmSize) = 0;
     virtual bool isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) = 0;
+    virtual bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const = 0;
 
     static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo);
     static uint32_t getEnginesCount(const HardwareInfo &hwInfo);
@@ -186,7 +187,7 @@ class HwHelperHw : public HwHelper {
 
     static bool isBlitAuxTranslationRequired(const HardwareInfo &hwInfo, const MultiDispatchInfo &multiDispatchInfo);
 
-    static bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo);
+    bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const override;
 
     static bool isForceDefaultRCSEngineWARequired(const HardwareInfo &hwInfo);
 
diff --git a/core/helpers/hw_helper_base.inl b/core/helpers/hw_helper_base.inl
index eeb70dc1f2..a2ca78d4a1 100644
--- a/core/helpers/hw_helper_base.inl
+++ b/core/helpers/hw_helper_base.inl
@@ -263,7 +263,7 @@ uint32_t HwHelperHw<GfxFamily>::getBarriersCountFromHasBarriers(uint32_t hasBarr
 }
 
 template <typename GfxFamily>
-bool HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) {
+bool HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const {
     return false;
 }
 
diff --git a/runtime/command_queue/gpgpu_walker_bdw_plus.inl b/runtime/command_queue/gpgpu_walker_bdw_plus.inl
index b0c5603e5c..c1b30843fd 100644
--- a/runtime/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/runtime/command_queue/gpgpu_walker_bdw_plus.inl
@@ -126,12 +126,15 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
     auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
     *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
     bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
+    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(scheduler);
+
     HardwareCommandsHelper<GfxFamily>::sendIndirectState(
         commandStream,
         *dsh,
         *ioh,
         *ssh,
         scheduler,
+        scheduler.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         simd,
         localWorkSizes,
         offsetInterfaceDescriptorTable,
@@ -139,8 +142,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
         preemptionMode,
         pGpGpuWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     // Implement enabling special WA DisableLSQCROPERFforOCL if needed
     GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
diff --git a/runtime/command_queue/hardware_interface_bdw_plus.inl b/runtime/command_queue/hardware_interface_bdw_plus.inl
index 96a77eac1a..0ccb9e7deb 100644
--- a/runtime/command_queue/hardware_interface_bdw_plus.inl
+++ b/runtime/command_queue/hardware_interface_bdw_plus.inl
@@ -115,6 +115,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
     }
 
     auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
 
     HardwareCommandsHelper<GfxFamily>::sendIndirectState(
         commandStream,
@@ -122,6 +123,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
         ioh,
         ssh,
         kernel,
+        kernel.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         simd,
         localWorkSizes,
         offsetInterfaceDescriptorTable,
@@ -129,8 +131,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
         preemptionMode,
         walkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
                                                            numWorkGroups, localWorkSizes, simd, dim,
diff --git a/runtime/device_queue/device_queue_hw_base.inl b/runtime/device_queue/device_queue_hw_base.inl
index 77e66a3e80..9c2e831ba5 100644
--- a/runtime/device_queue/device_queue_hw_base.inl
+++ b/runtime/device_queue/device_queue_hw_base.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -237,7 +237,10 @@ uint64_t DeviceQueueHw<GfxFamily>::getBlockKernelStartPointer(const Device &devi
 
     auto blockKernelStartPointer = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;
 
-    if (blockAllocation && isCcsUsed && HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(device.getHardwareInfo())) {
+    auto &hardwareInfo = device.getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+    if (blockAllocation && isCcsUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
         blockKernelStartPointer += blockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
     }
     return blockKernelStartPointer;
diff --git a/runtime/device_queue/device_queue_hw_bdw_plus.inl b/runtime/device_queue/device_queue_hw_bdw_plus.inl
index 122b440782..fafc1ae8d7 100644
--- a/runtime/device_queue/device_queue_hw_bdw_plus.inl
+++ b/runtime/device_queue/device_queue_hw_bdw_plus.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -182,7 +182,11 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap
 
         totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
 
-        auto btOffset = HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo);
+        // patchInfo.bindingTableState may be null (no binding table); pass offset 0 in that case.
+        auto btOffset = HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(
+            surfaceStateHeap, bindingTableCount,
+            pBlockInfo->heapInfo.pSsh, pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize,
+            bindingTableCount, (pBlockInfo->patchInfo.bindingTableState != nullptr) ? pBlockInfo->patchInfo.bindingTableState->Offset : 0);
 
         parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset));
 
diff --git a/runtime/gen12lp/hw_helper_gen12lp.cpp b/runtime/gen12lp/hw_helper_gen12lp.cpp
index 22a9f0778a..e9bae0a305 100644
--- a/runtime/gen12lp/hw_helper_gen12lp.cpp
+++ b/runtime/gen12lp/hw_helper_gen12lp.cpp
@@ -16,7 +16,7 @@ namespace NEO {
 typedef TGLLPFamily Family;
 
 template <>
-bool HwHelperHw<Family>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) {
+bool HwHelperHw<Family>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const {
     return (hwInfo.platform.usRevId < REVISION_B);
 }
 
diff --git a/runtime/helpers/hardware_commands_helper.h b/runtime/helpers/hardware_commands_helper.h
index 8bd9e6f47b..d2f2b898fb 100644
--- a/runtime/helpers/hardware_commands_helper.h
+++ b/runtime/helpers/hardware_commands_helper.h
@@ -86,26 +86,13 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
                                                    const void *srcKernelSsh, size_t srcKernelSshSize,
                                                    size_t numberOfBindingTableStates, size_t offsetOfBindingTable);
 
-    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo) {
-        return pushBindingTableAndSurfaceStates(dstHeap, (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0,
-                                                srcKernelInfo.heapInfo.pSsh,
-                                                srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize,
-                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0,
-                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Offset : 0);
-    }
-
-    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
-        return pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0,
-                                                srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
-                                                srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
-    }
-
     static size_t sendIndirectState(
         LinearStream &commandStream,
         IndirectHeap &dsh,
         IndirectHeap &ioh,
         IndirectHeap &ssh,
         Kernel &kernel,
+        uint64_t kernelStartOffset,
         uint32_t simd,
         const size_t localWorkSize[3],
         const uint64_t offsetInterfaceDescriptorTable,
@@ -113,8 +100,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
         PreemptionMode preemptionMode,
         WALKER_TYPE<GfxFamily> *walkerCmd,
         INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
-        bool localIdsGenerationByRuntime,
-        bool isCcsUsed);
+        bool localIdsGenerationByRuntime);
 
     static void programPerThreadData(
         size_t &sizePerThreadData,
@@ -136,15 +122,6 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
 
     inline static bool resetBindingTablePrefetch(Kernel &kernel);
 
-    static void setKernelStartOffset(
-        uint64_t &kernelStartOffset,
-        bool kernelAllocation,
-        const KernelInfo &kernelInfo,
-        const bool &localIdsGenerationByRuntime,
-        const bool &kernelUsesLocalIds,
-        Kernel &kernel,
-        bool isCssUsed);
-
     static size_t getSizeRequiredCS(const Kernel *kernel);
     static size_t getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress);
     static bool isPipeControlWArequired(const HardwareInfo &hwInfo);
diff --git a/runtime/helpers/hardware_commands_helper.inl b/runtime/helpers/hardware_commands_helper.inl
index 1f12841cb0..1eeb588520 100644
--- a/runtime/helpers/hardware_commands_helper.inl
+++ b/runtime/helpers/hardware_commands_helper.inl
@@ -277,6 +277,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
     IndirectHeap &ioh,
     IndirectHeap &ssh,
     Kernel &kernel,
+    uint64_t kernelStartOffset,
     uint32_t simd,
     const size_t localWorkSize[3],
     const uint64_t offsetInterfaceDescriptorTable,
@@ -284,26 +285,20 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
     PreemptionMode preemptionMode,
     WALKER_TYPE<GfxFamily> *walkerCmd,
     INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
-    bool localIdsGenerationByRuntime,
-    bool isCcsUsed) {
+    bool localIdsGenerationByRuntime) {
 
     using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
 
     DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
-    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
     auto inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
 
     // Copy the kernel over to the ISH
-    uint64_t kernelStartOffset = 0llu;
     const auto &kernelInfo = kernel.getKernelInfo();
-    auto kernelAllocation = kernelInfo.getGraphicsAllocation();
-    DEBUG_BREAK_IF(!kernelAllocation);
-    setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime,
-                         kernelUsesLocalIds, kernel, isCcsUsed);
-
     const auto &patchInfo = kernelInfo.patchInfo;
 
-    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
+    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
+                                                                   kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
+                                                                   kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
 
     // Copy our sampler state if it exists
     size_t samplerStateOffset = 0;
diff --git a/runtime/helpers/hardware_commands_helper_base.inl b/runtime/helpers/hardware_commands_helper_base.inl
index 778715aab9..cab9e9477c 100644
--- a/runtime/helpers/hardware_commands_helper_base.inl
+++ b/runtime/helpers/hardware_commands_helper_base.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,26 +89,6 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
     }
 }
 
-template <typename GfxFamily>
-void HardwareCommandsHelper<GfxFamily>::setKernelStartOffset(
-    uint64_t &kernelStartOffset,
-    bool kernelAllocation,
-    const KernelInfo &kernelInfo,
-    const bool &localIdsGenerationByRuntime,
-    const bool &kernelUsesLocalIds,
-    Kernel &kernel,
-    bool isCssUsed) {
-
-    if (kernelAllocation) {
-        kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
-    }
-    kernelStartOffset += kernel.getStartOffset();
-
-    if (isCssUsed && HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(kernel.getDevice().getHardwareInfo())) {
-        kernelStartOffset += kernelInfo.patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
-    }
-}
-
 template <typename GfxFamily>
 void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
     size_t &sizePerThreadData,
diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp
index 42032294e8..d3a16e3a63 100644
--- a/runtime/kernel/kernel.cpp
+++ b/runtime/kernel/kernel.cpp
@@ -776,11 +776,7 @@ void Kernel::setStartOffset(uint32_t offset) {
     this->startOffset = offset;
 }
 
-const void *Kernel::getSurfaceStateHeap() const {
-    return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
-}
-
-void *Kernel::getSurfaceStateHeap() {
+void *Kernel::getSurfaceStateHeap() const {
     return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
 }
 
@@ -2405,4 +2401,31 @@ bool Kernel::checkIfIsParentKernelAndBlocksUsesPrintf() {
     return isParentKernel && getProgram()->getBlockKernelManager()->getIfBlockUsesPrintf();
 }
 
+// Computes the address at which this kernel starts executing: the kernel
+// allocation's GPU address to patch (0 when there is no allocation), plus the
+// per-thread-data-load skip when local ids are used but not generated by the
+// runtime, plus startOffset, plus the SetFFIDGP skip when isCssUsed is set and
+// the HwHelper reports that the workaround is required for this hardware.
+uint64_t Kernel::getKernelStartOffset(const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed) const {
+    auto *isaAllocation = kernelInfo.getGraphicsAllocation();
+    const auto *threadPayload = kernelInfo.patchInfo.threadPayload;
+    uint64_t startAddress = 0;
+
+    if (isaAllocation != nullptr) {
+        startAddress = isaAllocation->getGpuAddressToPatch();
+        // Local ids are used but not produced by the runtime: skip the per-thread data load.
+        if (!localIdsGenerationByRuntime && kernelUsesLocalIds) {
+            DEBUG_BREAK_IF(threadPayload->OffsetToSkipPerThreadDataLoad != 128);
+            startAddress += threadPayload->OffsetToSkipPerThreadDataLoad;
+        }
+    }
+    startAddress += getStartOffset();
+
+    auto &hwInfo = getDevice().getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    const bool skipSetFfidGp = isCssUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hwInfo);
+
+    return skipSetFfidGp ? startAddress + threadPayload->OffsetToSkipSetFFIDGP : startAddress;
+}
+
 } // namespace NEO
diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h
index f0ac60c269..b28e065ff1 100644
--- a/runtime/kernel/kernel.h
+++ b/runtime/kernel/kernel.h
@@ -147,8 +147,7 @@ class Kernel : public BaseObject<_cl_kernel> {
                            size_t *paramValueSizeRet) const;
 
     const void *getKernelHeap() const;
-    const void *getSurfaceStateHeap() const;
-    void *getSurfaceStateHeap();
+    void *getSurfaceStateHeap() const;
     const void *getDynamicStateHeap() const;
 
     size_t getKernelHeapSize() const;
@@ -404,6 +403,11 @@ class Kernel : public BaseObject<_cl_kernel> {
                                    size_t *localWorkSize);
     uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const;
 
+    // Start address for this kernel: the kernel allocation's GPU address to patch plus
+    // startOffset, plus the per-thread-data-load skip (local ids used but not runtime-generated)
+    // and the SetFFIDGP skip (isCssUsed and the HwHelper workaround query is true) when they apply.
+    uint64_t getKernelStartOffset(const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed) const;
+
   protected:
     struct ObjectCounts {
         uint32_t imageCount;
diff --git a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
index cfe74e6de4..9d309abacf 100644
--- a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -62,7 +62,11 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
 
         auto graphicsAllocation = pKernel->getKernelInfo().getGraphicsAllocation();
         auto kernelIsaAddress = graphicsAllocation->getGpuAddressToPatch();
-        if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && HwHelperHw<FamilyType>::isOffsetToSkipSetFFIDGPWARequired(pKernel->getDevice().getHardwareInfo())) {
+
+        auto &hardwareInfo = pKernel->getDevice().getHardwareInfo();
+        auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+        if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
             kernelIsaAddress += pKernel->getKernelInfo().patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
         }
 
@@ -104,7 +108,11 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
 
             uint64_t blockKernelAddress = ((uint64_t)idData[blockFirstIndex + i].getKernelStartPointerHigh() << 32) | (uint64_t)idData[blockFirstIndex + i].getKernelStartPointer();
             uint64_t expectedBlockKernelAddress = pBlockInfo->getGraphicsAllocation()->getGpuAddressToPatch();
-            if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && HwHelperHw<FamilyType>::isOffsetToSkipSetFFIDGPWARequired(pKernel->getDevice().getHardwareInfo())) {
+
+            auto &hardwareInfo = pKernel->getDevice().getHardwareInfo();
+            auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+            if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
                 expectedBlockKernelAddress += pBlockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
             }
 
diff --git a/unit_tests/gen12lp/CMakeLists.txt b/unit_tests/gen12lp/CMakeLists.txt
index de6b65da4d..23de2f53ae 100644
--- a/unit_tests/gen12lp/CMakeLists.txt
+++ b/unit_tests/gen12lp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2019 Intel Corporation
+# Copyright (C) 2018-2020 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -15,7 +15,6 @@ if(TESTS_GEN12LP)
     ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_tests_gen12lp.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_media_kernel_gen12lp.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/gen12lp_tests_wrapper.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/hardware_commands_helper_tests_gen12lp.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/hw_helper_tests_gen12lp.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/image_tests_gen12lp.inl
     ${CMAKE_CURRENT_SOURCE_DIR}/kernel_tests_gen12lp.inl
diff --git a/unit_tests/gen12lp/gen12lp_tests_wrapper.cpp b/unit_tests/gen12lp/gen12lp_tests_wrapper.cpp
index b1fb6f0bb5..886aaa8af2 100644
--- a/unit_tests/gen12lp/gen12lp_tests_wrapper.cpp
+++ b/unit_tests/gen12lp/gen12lp_tests_wrapper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -12,7 +12,6 @@
 #include "unit_tests/gen12lp/command_stream_receiver_simulated_common_hw_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/compute_mode_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/enqueue_media_kernel_gen12lp.inl"
-#include "unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/hw_helper_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/image_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/kernel_tests_gen12lp.inl"
diff --git a/unit_tests/gen12lp/tgllp/CMakeLists.txt b/unit_tests/gen12lp/tgllp/CMakeLists.txt
index e20cf7eb0c..9f314842ef 100644
--- a/unit_tests/gen12lp/tgllp/CMakeLists.txt
+++ b/unit_tests/gen12lp/tgllp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2019 Intel Corporation
+# Copyright (C) 2019-2020 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -7,6 +7,7 @@
 if(TESTS_TGLLP)
   set(IGDRCL_SRCS_tests_gen12lp_tgllp
     ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
+    ${CMAKE_CURRENT_SOURCE_DIR}/kernel_tests_tgllp.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_hw_helper_tgllp.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_hw_info_config_tgllp.cpp
   )
diff --git a/unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl b/unit_tests/gen12lp/tgllp/kernel_tests_tgllp.cpp
similarity index 70%
rename from unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl
rename to unit_tests/gen12lp/tgllp/kernel_tests_tgllp.cpp
index 5ce76d2356..a27b8a893a 100644
--- a/unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl
+++ b/unit_tests/gen12lp/tgllp/kernel_tests_tgllp.cpp
@@ -12,9 +12,9 @@
 
 using namespace NEO;
 
-using HardwareCommandsGen12LpTests = ::testing::Test;
+using KernelTgllpTests = ::testing::Test;
 
-TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroundActiveWhenSettingKernelStartOffsetThenAdditionalOffsetIsSet) {
+TGLLPTEST_F(KernelTgllpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroundActiveWhenSettingKernelStartOffsetThenAdditionalOffsetIsSet) {
     const uint64_t defaultKernelStartOffset = 0;
     const uint64_t additionalOffsetDueToFfid = 0x1234;
     SPatchThreadPayload threadPayload{};
@@ -30,9 +30,7 @@ TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroun
         mockKernelWithInternals.kernelInfo.patchInfo.threadPayload = &threadPayload;
 
         for (auto isCcsUsed : ::testing::Bool()) {
-            uint64_t kernelStartOffset = defaultKernelStartOffset;
-            HardwareCommandsHelper<FamilyType>::setKernelStartOffset(kernelStartOffset, false, mockKernelWithInternals.kernelInfo, false,
-                                                                     false, *mockKernelWithInternals.mockKernel, isCcsUsed);
+            uint64_t kernelStartOffset = mockKernelWithInternals.mockKernel->getKernelStartOffset(false, false, isCcsUsed);
 
             if (stepping < REVISION_B && isCcsUsed) {
                 EXPECT_EQ(defaultKernelStartOffset + additionalOffsetDueToFfid, kernelStartOffset);
@@ -41,4 +39,4 @@ TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroun
             }
         }
     }
-}
+}
diff --git a/unit_tests/helpers/hardware_commands_helper_tests.cpp b/unit_tests/helpers/hardware_commands_helper_tests.cpp
index 59f759f5dc..55fb6eb360 100644
--- a/unit_tests/helpers/hardware_commands_helper_tests.cpp
+++ b/unit_tests/helpers/hardware_commands_helper_tests.cpp
@@ -325,6 +325,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
         sizeof(INTERFACE_DESCRIPTOR_DATA));
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
 
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
@@ -332,6 +333,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
         ioh,
         ssh,
         *kernel,
+        kernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         kernel->getKernelInfo().getMaxSimdSize(),
         localWorkSizes,
         IDToffset,
@@ -339,8 +341,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     // It's okay these are EXPECT_GE as they're only going to be used for
     // estimation purposes to avoid OOM.
@@ -378,12 +379,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
     const size_t localWorkSizes[3]{localWorkSize, 1, 1};
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
         dsh,
         ioh,
         ssh,
         *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
         localWorkSizes,
         0,
@@ -391,8 +395,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
     if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@@ -423,12 +426,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
     const size_t localWorkSizes[3]{localWorkSize, 1, 1};
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
         dsh,
         ioh,
         ssh,
         *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
         localWorkSizes,
         0,
@@ -436,8 +442,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
     EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
@@ -462,12 +467,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
     const size_t localWorkSizes[3]{localWorkSize, 1, 1};
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
         dsh,
         ioh,
         ssh,
         *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
         localWorkSizes,
         0,
@@ -475,8 +483,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
     if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@@ -536,12 +543,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
     MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(mockKernel);
+
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
         dsh,
         ioh,
         ssh,
         mockKernel,
+        mockKernel.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         modifiedKernelInfo.getMaxSimdSize(),
         localWorkSizes,
         IDToffset,
@@ -549,8 +559,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
     numThreads = Math::divideAndRoundUp(numThreads, modifiedKernelInfo.getMaxSimdSize());
@@ -618,12 +627,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer)
     const_cast<KernelInfo &>(kernelInfo).requiresSshForBuffers = true;
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
+
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
         dsh,
         ioh,
         ssh,
         *kernel,
+        kernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         kernel->getKernelInfo().getMaxSimdSize(),
         localWorkSizes,
         0,
@@ -631,8 +643,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer)
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     EXPECT_EQ(0x00000000u, *(&bindingTableStatesPointers[0]));
     EXPECT_EQ(0x00000040u, *(&bindingTableStatesPointers[1]));
@@ -780,12 +791,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersF
         // push surfaces states and binding table to given ssh heap
         uint32_t interfaceDescriptorIndex = 0;
         auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+        auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*pKernel);
+
         HardwareCommandsHelper<FamilyType>::sendIndirectState(
             commandStream,
             dsh,
             ioh,
             ssh,
             *pKernel,
+            pKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
             pKernel->getKernelInfo().getMaxSimdSize(),
             localWorkSizes,
             0,
@@ -793,8 +807,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersF
             pDevice->getPreemptionMode(),
             pWalkerCmd,
             nullptr,
-            true,
-            isCcsUsed);
+            true);
 
         bti = reinterpret_cast<typename FamilyType::BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
         for (uint32_t i = 0; i < numSurfaces; ++i) {
@@ -859,7 +872,7 @@ HWTEST_F(HardwareCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequi
     EXPECT_EQ(0u, numSurfaceStates);
 
     // set binding table states
-    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
     EXPECT_EQ(0u, dstBindingTablePointer);
 
     auto usedAfter = ssh.getUsed();
@@ -904,10 +917,10 @@ HWTEST_F(HardwareCommandsTest, setBindingTableStatesForNoSurfaces) {
     auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(0u, numSurfaceStates);
 
-    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo);
+    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
     EXPECT_EQ(0u, dstBindingTablePointer);
 
-    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
     EXPECT_EQ(0u, dstBindingTablePointer);
 
     SPatchBindingTableState bindingTableState;
@@ -918,7 +931,7 @@ HWTEST_F(HardwareCommandsTest, setBindingTableStatesForNoSurfaces) {
     bindingTableState.SurfaceStateOffset = 0;
     pKernelInfo->patchInfo.bindingTableState = &bindingTableState;
 
-    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
     EXPECT_EQ(0u, dstBindingTablePointer);
 
     pKernelInfo->patchInfo.bindingTableState = nullptr;
@@ -1060,12 +1073,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
     mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
     HardwareCommandsHelper<FamilyType>::sendIndirectState(
         commandStream,
         dsh,
         ioh,
         ssh,
         *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
         8,
         localWorkSizes,
         interfaceDescriptorTableOffset,
@@ -1073,8 +1089,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true,
-        isCcsUsed);
+        true);
 
     bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0;
     EXPECT_TRUE(isMemorySame);
diff --git a/unit_tests/helpers/hardware_commands_helper_tests.h b/unit_tests/helpers/hardware_commands_helper_tests.h
index 30d5d5fa3a..fd258a34bc 100644
--- a/unit_tests/helpers/hardware_commands_helper_tests.h
+++ b/unit_tests/helpers/hardware_commands_helper_tests.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #include "runtime/built_ins/built_ins.h"
+#include "runtime/helpers/hardware_commands_helper.h"
 #include "runtime/kernel/kernel.h"
 #include "test.h"
 #include "unit_tests/fixtures/built_in_fixture.h"
@@ -39,4 +40,11 @@ struct HardwareCommandsTest : DeviceFixture,
     std::unique_ptr<MockKernelWithInternals> mockKernelWithInternal;
     Kernel::SimpleKernelArgInfo kernelArgInfo = {};
     std::vector<Kernel::SimpleKernelArgInfo> kernelArguments;
+    // Test-side adapter: unpacks srcKernel (bindingTableState->Count, or 0 when patchInfo.bindingTableState is null; surface state heap and its size; number of binding table states; binding table offset) and forwards the values to HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates, returning its result.
+    template <typename GfxFamily>
+    size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
+        return HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0,
+                                                                                   srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
+                                                                                   srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
+    }
 };
diff --git a/unit_tests/helpers/hw_helper_tests.cpp b/unit_tests/helpers/hw_helper_tests.cpp
index e88a0aacb9..006d7e13a3 100644
--- a/unit_tests/helpers/hw_helper_tests.cpp
+++ b/unit_tests/helpers/hw_helper_tests.cpp
@@ -788,7 +788,8 @@ HWTEST_F(HwHelperTest, givenDefaultHwHelperHwWhenIsOffsetToSkipSetFFIDGPWARequir
     if (hardwareInfo.platform.eRenderCoreFamily == IGFX_GEN12LP_CORE) {
         GTEST_SKIP();
     }
-    EXPECT_FALSE(HwHelperHw<FamilyType>::isOffsetToSkipSetFFIDGPWARequired(hardwareInfo));
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+    EXPECT_FALSE(hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo));
 }
 
 HWTEST_F(HwHelperTest, givenDefaultHwHelperHwWhenIsForceDefaultRCSEngineWARequiredCalledThenFalseIsReturned) {
diff --git a/unit_tests/kernel/kernel_tests.cpp b/unit_tests/kernel/kernel_tests.cpp
index f20588fa95..5a99b484f5 100644
--- a/unit_tests/kernel/kernel_tests.cpp
+++ b/unit_tests/kernel/kernel_tests.cpp
@@ -766,6 +766,7 @@ TEST_F(KernelPrivateSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenPriv
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
     EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
 
     program.setConstantSurface(nullptr);
     delete pKernel;
@@ -1014,6 +1015,7 @@ TEST_F(KernelGlobalSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenGloba
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
     EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
 
     program.setGlobalSurface(nullptr);
     delete pKernel;
@@ -1188,6 +1190,7 @@ TEST_F(KernelConstantSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenCon
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
     EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
 
     program.setConstantSurface(nullptr);
     delete pKernel;
@@ -2936,6 +2939,60 @@ TEST(KernelTest, GivenDifferentValuesWhenSetKernelExecutionTypeIsCalledThenCorre
     EXPECT_EQ(KernelExecutionType::Default, kernel.executionType);
 }
 
+TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsAdded) {
+    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    // Scenario: getKernelStartOffset(false, true, false) must include OffsetToSkipPerThreadDataLoad in the returned offset.
+    MockKernelWithInternals mockKernel(*device);
+    SPatchThreadPayload threadPayload = {};
+    // Point kernelInfo at a local thread payload whose OffsetToSkipPerThreadDataLoad is 128.
+    threadPayload.OffsetToSkipPerThreadDataLoad = 128u;
+    mockKernel.kernelInfo.patchInfo.threadPayload = &threadPayload;
+    // Create the kernel allocation so its GPU address can serve as the base of the expected offset.
+    mockKernel.kernelInfo.createKernelAllocation(device->getRootDeviceIndex(), device->getMemoryManager());
+    auto allocationOffset = mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    // Arguments presumably are (localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed), inferred from the test name and other call sites; confirm against kernel.h.
+    mockKernel.mockKernel->setStartOffset(128);
+    auto offset = mockKernel.mockKernel->getKernelStartOffset(false, true, false);
+    EXPECT_EQ(allocationOffset + 256u, offset); // 256 = 128 set via setStartOffset + 128 OffsetToSkipPerThreadDataLoad, per the test name
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation()); // release the allocation created above
+}
+
+TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeTrueAndLocalIdsUsedWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsNotAdded) {
+    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    // Scenario: getKernelStartOffset(true, true, false) must NOT include OffsetToSkipPerThreadDataLoad in the returned offset.
+    MockKernelWithInternals mockKernel(*device);
+    SPatchThreadPayload threadPayload = {};
+    // Point kernelInfo at a local thread payload whose OffsetToSkipPerThreadDataLoad is 128.
+    threadPayload.OffsetToSkipPerThreadDataLoad = 128u;
+    mockKernel.kernelInfo.patchInfo.threadPayload = &threadPayload;
+    // Create the kernel allocation so its GPU address can serve as the base of the expected offset.
+    mockKernel.kernelInfo.createKernelAllocation(device->getRootDeviceIndex(), device->getMemoryManager());
+    auto allocationOffset = mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    // Arguments presumably are (localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed), inferred from the test name and other call sites; confirm against kernel.h.
+    mockKernel.mockKernel->setStartOffset(128);
+    auto offset = mockKernel.mockKernel->getKernelStartOffset(true, true, false);
+    EXPECT_EQ(allocationOffset + 128u, offset); // only the 128 set via setStartOffset is expected on top of the allocation address
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation()); // release the allocation created above
+}
+
+TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseAndLocalIdsNotUsedWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsNotAdded) {
+    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    // Scenario: getKernelStartOffset(false, false, false) must NOT include OffsetToSkipPerThreadDataLoad in the returned offset.
+    MockKernelWithInternals mockKernel(*device);
+    SPatchThreadPayload threadPayload = {};
+    // Point kernelInfo at a local thread payload whose OffsetToSkipPerThreadDataLoad is 128.
+    threadPayload.OffsetToSkipPerThreadDataLoad = 128u;
+    mockKernel.kernelInfo.patchInfo.threadPayload = &threadPayload;
+    // Create the kernel allocation so its GPU address can serve as the base of the expected offset.
+    mockKernel.kernelInfo.createKernelAllocation(device->getRootDeviceIndex(), device->getMemoryManager());
+    auto allocationOffset = mockKernel.kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+    // Arguments presumably are (localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed), inferred from the test name and other call sites; confirm against kernel.h.
+    mockKernel.mockKernel->setStartOffset(128);
+    auto offset = mockKernel.mockKernel->getKernelStartOffset(false, false, false);
+    EXPECT_EQ(allocationOffset + 128u, offset); // only the 128 set via setStartOffset is expected on top of the allocation address
+    device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation()); // release the allocation created above
+}
+
 namespace NEO {
 
 template <typename GfxFamily>