Simplify HardwareCommandsHelper

Related-To: NEO-4175
Change-Id: I39b08353514ea0bf384b6b592f24952d0ed631e6
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
2025-12-21 01:04:57 +08:00 · 2020-01-23 15:52:49 +01:00
parent fd8c5ba67f
commit 60c7587c2b
21 changed files with 185 additions and 109 deletions
--- a/core/helpers/hw_helper.h
+++ b/core/helpers/hw_helper.h
@@ -72,6 +72,7 @@ class HwHelper {
                                                   uint32_t threadsPerEu) = 0;
    virtual uint32_t alignSlmSize(uint32_t slmSize) = 0;
    virtual bool isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) = 0;
+    virtual bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const = 0;

    static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo);
    static uint32_t getEnginesCount(const HardwareInfo &hwInfo);
@@ -186,7 +187,7 @@ class HwHelperHw : public HwHelper {

    static bool isBlitAuxTranslationRequired(const HardwareInfo &hwInfo, const MultiDispatchInfo &multiDispatchInfo);

-    static bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo);
+    bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const override;

    static bool isForceDefaultRCSEngineWARequired(const HardwareInfo &hwInfo);

--- a/core/helpers/hw_helper_base.inl
+++ b/core/helpers/hw_helper_base.inl
@@ -263,7 +263,7 @@ uint32_t HwHelperHw<GfxFamily>::getBarriersCountFromHasBarriers(uint32_t hasBarr
 }

 template <typename GfxFamily>
-bool HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) {
+bool HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const { // generic implementation: reports the workaround as not required, hwInfo is not inspected
    return false;
 }

--- a/runtime/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/runtime/command_queue/gpgpu_walker_bdw_plus.inl
@@ -126,12 +126,15 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
    bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
+    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(scheduler);
+
    HardwareCommandsHelper<GfxFamily>::sendIndirectState(
        commandStream,
        *dsh,
        *ioh,
        *ssh,
        scheduler,
+        scheduler.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        simd,
        localWorkSizes,
        offsetInterfaceDescriptorTable,
@@ -139,8 +142,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
        preemptionMode,
        pGpGpuWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    // Implement enabling special WA DisableLSQCROPERFforOCL if needed
    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
--- a/runtime/command_queue/hardware_interface_bdw_plus.inl
+++ b/runtime/command_queue/hardware_interface_bdw_plus.inl
@@ -115,6 +115,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
    }

    auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);

    HardwareCommandsHelper<GfxFamily>::sendIndirectState(
        commandStream,
@@ -122,6 +123,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
        ioh,
        ssh,
        kernel,
+        kernel.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        simd,
        localWorkSizes,
        offsetInterfaceDescriptorTable,
@@ -129,8 +131,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
        preemptionMode,
        walkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
                                                           numWorkGroups, localWorkSizes, simd, dim,
--- a/runtime/device_queue/device_queue_hw_base.inl
+++ b/runtime/device_queue/device_queue_hw_base.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -237,7 +237,10 @@ uint64_t DeviceQueueHw<GfxFamily>::getBlockKernelStartPointer(const Device &devi

    auto blockKernelStartPointer = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;

-    if (blockAllocation && isCcsUsed && HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(device.getHardwareInfo())) {
+    auto &hardwareInfo = device.getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+    if (blockAllocation && isCcsUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
        blockKernelStartPointer += blockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
    }
    return blockKernelStartPointer;
--- a/runtime/device_queue/device_queue_hw_bdw_plus.inl
+++ b/runtime/device_queue/device_queue_hw_bdw_plus.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -182,7 +182,11 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap

        totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

-        auto btOffset = HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo);
+        auto btOffset = HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, bindingTableCount,
+                                                                                            pBlockInfo->heapInfo.pSsh,
+                                                                                            pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize,
+                                                                                            bindingTableCount,
+                                                                                            pBlockInfo->patchInfo.bindingTableState->Offset);

        parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset));

--- a/runtime/gen12lp/hw_helper_gen12lp.cpp
+++ b/runtime/gen12lp/hw_helper_gen12lp.cpp
@@ -16,7 +16,7 @@ namespace NEO {
 typedef TGLLPFamily Family;

 template <>
-bool HwHelperHw<Family>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) {
+bool HwHelperHw<Family>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const { // TGLLPFamily specialization: required only when the platform revision id is below REVISION_B
    return (hwInfo.platform.usRevId < REVISION_B);
 }

--- a/runtime/helpers/hardware_commands_helper.h
+++ b/runtime/helpers/hardware_commands_helper.h
@@ -86,26 +86,13 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
                                                   const void *srcKernelSsh, size_t srcKernelSshSize,
                                                   size_t numberOfBindingTableStates, size_t offsetOfBindingTable);

-    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo) {
-        return pushBindingTableAndSurfaceStates(dstHeap, (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0,
-                                                srcKernelInfo.heapInfo.pSsh,
-                                                srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize,
-                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0,
-                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Offset : 0);
-    }
-
-    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
-        return pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0,
-                                                srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
-                                                srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
-    }
-
    static size_t sendIndirectState(
        LinearStream &commandStream,
        IndirectHeap &dsh,
        IndirectHeap &ioh,
        IndirectHeap &ssh,
        Kernel &kernel,
+        uint64_t kernelStartOffset,
        uint32_t simd,
        const size_t localWorkSize[3],
        const uint64_t offsetInterfaceDescriptorTable,
@@ -113,8 +100,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
        PreemptionMode preemptionMode,
        WALKER_TYPE<GfxFamily> *walkerCmd,
        INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
-        bool localIdsGenerationByRuntime,
-        bool isCcsUsed);
+        bool localIdsGenerationByRuntime);

    static void programPerThreadData(
        size_t &sizePerThreadData,
@@ -136,15 +122,6 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {

    inline static bool resetBindingTablePrefetch(Kernel &kernel);

-    static void setKernelStartOffset(
-        uint64_t &kernelStartOffset,
-        bool kernelAllocation,
-        const KernelInfo &kernelInfo,
-        const bool &localIdsGenerationByRuntime,
-        const bool &kernelUsesLocalIds,
-        Kernel &kernel,
-        bool isCssUsed);
-
    static size_t getSizeRequiredCS(const Kernel *kernel);
    static size_t getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress);
    static bool isPipeControlWArequired(const HardwareInfo &hwInfo);
--- a/runtime/helpers/hardware_commands_helper.inl
+++ b/runtime/helpers/hardware_commands_helper.inl
@@ -277,6 +277,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    IndirectHeap &ioh,
    IndirectHeap &ssh,
    Kernel &kernel,
+    uint64_t kernelStartOffset,
    uint32_t simd,
    const size_t localWorkSize[3],
    const uint64_t offsetInterfaceDescriptorTable,
@@ -284,26 +285,20 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    PreemptionMode preemptionMode,
    WALKER_TYPE<GfxFamily> *walkerCmd,
    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
-    bool localIdsGenerationByRuntime,
-    bool isCcsUsed) {
+    bool localIdsGenerationByRuntime) {

    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;

    DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
-    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
    auto inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);

    // Copy the kernel over to the ISH
-    uint64_t kernelStartOffset = 0llu;
    const auto &kernelInfo = kernel.getKernelInfo();
-    auto kernelAllocation = kernelInfo.getGraphicsAllocation();
-    DEBUG_BREAK_IF(!kernelAllocation);
-    setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime,
-                         kernelUsesLocalIds, kernel, isCcsUsed);
-
    const auto &patchInfo = kernelInfo.patchInfo;

-    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
+    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
+                                                                   kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
+                                                                   kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());

    // Copy our sampler state if it exists
    size_t samplerStateOffset = 0;
--- a/runtime/helpers/hardware_commands_helper_base.inl
+++ b/runtime/helpers/hardware_commands_helper_base.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -89,26 +89,6 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
    }
 }

-template <typename GfxFamily>
-void HardwareCommandsHelper<GfxFamily>::setKernelStartOffset(
-    uint64_t &kernelStartOffset,
-    bool kernelAllocation,
-    const KernelInfo &kernelInfo,
-    const bool &localIdsGenerationByRuntime,
-    const bool &kernelUsesLocalIds,
-    Kernel &kernel,
-    bool isCssUsed) {
-
-    if (kernelAllocation) {
-        kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
-    }
-    kernelStartOffset += kernel.getStartOffset();
-
-    if (isCssUsed && HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(kernel.getDevice().getHardwareInfo())) {
-        kernelStartOffset += kernelInfo.patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
-    }
-}
-
 template <typename GfxFamily>
 void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
    size_t &sizePerThreadData,
--- a/runtime/kernel/kernel.cpp
+++ b/runtime/kernel/kernel.cpp
@@ -776,11 +776,7 @@ void Kernel::setStartOffset(uint32_t offset) {
    this->startOffset = offset;
 }

-const void *Kernel::getSurfaceStateHeap() const {
-    return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
-}
-
-void *Kernel::getSurfaceStateHeap() {
+void *Kernel::getSurfaceStateHeap() const { // returns pSshLocal when kernelInfo.usesSsh, else nullptr. NOTE(review): the merged overload hands out a mutable pointer from a const method - confirm this loosening is intended
    return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
 }

@@ -2405,4 +2401,31 @@ bool Kernel::checkIfIsParentKernelAndBlocksUsesPrintf() {
    return isParentKernel && getProgram()->getBlockKernelManager()->getIfBlockUsesPrintf();
 }

+uint64_t Kernel::getKernelStartOffset( // builds the kernel start offset: ISA allocation address (if any) + getStartOffset() + optional skip offsets
+    const bool localIdsGenerationByRuntime,
+    const bool kernelUsesLocalIds,
+    const bool isCssUsed) const { // NOTE(review): "isCssUsed" is presumably a typo for "isCcsUsed" (callers pass isCcsUsed) - confirm before renaming
+
+    uint64_t kernelStartOffset = 0; // remains the base when the kernel has no graphics allocation
+
+    if (kernelInfo.getGraphicsAllocation()) {
+        kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
+        if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) { // local ids are used by the kernel but not generated by the runtime
+            DEBUG_BREAK_IF(kernelInfo.patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad != 128); // flags any offset other than the expected 128
+            kernelStartOffset += kernelInfo.patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad;
+        }
+    }
+
+    kernelStartOffset += getStartOffset(); // added whether or not an allocation exists
+
+    auto &hardwareInfo = getDevice().getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); // runtime lookup of the helper by render core family
+
+    if (isCssUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) { // NOTE(review): threadPayload is dereferenced without a null check here and above - confirm it is always set
+        kernelStartOffset += kernelInfo.patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
+    }
+
+    return kernelStartOffset;
+}
+
 } // namespace NEO
--- a/runtime/kernel/kernel.h
+++ b/runtime/kernel/kernel.h
@@ -147,8 +147,7 @@ class Kernel : public BaseObject<_cl_kernel> {
                           size_t *paramValueSizeRet) const;

    const void *getKernelHeap() const;
-    const void *getSurfaceStateHeap() const;
-    void *getSurfaceStateHeap();
+    void *getSurfaceStateHeap() const;
    const void *getDynamicStateHeap() const;

    size_t getKernelHeapSize() const;
@@ -404,6 +403,11 @@ class Kernel : public BaseObject<_cl_kernel> {
                                   size_t *localWorkSize);
    uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const;

+    uint64_t getKernelStartOffset( // value that callers pass to HardwareCommandsHelper::sendIndirectState as kernelStartOffset
+        const bool localIdsGenerationByRuntime,
+        const bool kernelUsesLocalIds,
+        const bool isCssUsed) const; // NOTE(review): presumably meant "isCcsUsed" - confirm
+
  protected:
    struct ObjectCounts {
        uint32_t imageCount;
--- a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -62,7 +62,11 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu

        auto graphicsAllocation = pKernel->getKernelInfo().getGraphicsAllocation();
        auto kernelIsaAddress = graphicsAllocation->getGpuAddressToPatch();
-        if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && HwHelperHw<FamilyType>::isOffsetToSkipSetFFIDGPWARequired(pKernel->getDevice().getHardwareInfo())) {
+
+        auto &hardwareInfo = pKernel->getDevice().getHardwareInfo();
+        auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+        if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
            kernelIsaAddress += pKernel->getKernelInfo().patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
        }

@@ -104,7 +108,11 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu

            uint64_t blockKernelAddress = ((uint64_t)idData[blockFirstIndex + i].getKernelStartPointerHigh() << 32) | (uint64_t)idData[blockFirstIndex + i].getKernelStartPointer();
            uint64_t expectedBlockKernelAddress = pBlockInfo->getGraphicsAllocation()->getGpuAddressToPatch();
-            if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && HwHelperHw<FamilyType>::isOffsetToSkipSetFFIDGPWARequired(pKernel->getDevice().getHardwareInfo())) {
+
+            auto &hardwareInfo = pKernel->getDevice().getHardwareInfo();
+            auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+            if (EngineHelpers::isCcs(pCmdQ->getGpgpuEngine().osContext->getEngineType()) && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
                expectedBlockKernelAddress += pBlockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
            }

--- a/unit_tests/gen12lp/CMakeLists.txt
+++ b/unit_tests/gen12lp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2019 Intel Corporation
+# Copyright (C) 2018-2020 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -15,7 +15,6 @@ if(TESTS_GEN12LP)
    ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_tests_gen12lp.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_media_kernel_gen12lp.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/gen12lp_tests_wrapper.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/hardware_commands_helper_tests_gen12lp.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/hw_helper_tests_gen12lp.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/image_tests_gen12lp.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/kernel_tests_gen12lp.inl
--- a/unit_tests/gen12lp/gen12lp_tests_wrapper.cpp
+++ b/unit_tests/gen12lp/gen12lp_tests_wrapper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -12,7 +12,6 @@
 #include "unit_tests/gen12lp/command_stream_receiver_simulated_common_hw_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/compute_mode_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/enqueue_media_kernel_gen12lp.inl"
-#include "unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/hw_helper_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/image_tests_gen12lp.inl"
 #include "unit_tests/gen12lp/kernel_tests_gen12lp.inl"
--- a/unit_tests/gen12lp/tgllp/CMakeLists.txt
+++ b/unit_tests/gen12lp/tgllp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2019 Intel Corporation
+# Copyright (C) 2019-2020 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -7,6 +7,7 @@
 if(TESTS_TGLLP)
  set(IGDRCL_SRCS_tests_gen12lp_tgllp
    ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
+    ${CMAKE_CURRENT_SOURCE_DIR}/kernel_tests_tgllp.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/test_hw_helper_tgllp.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/test_hw_info_config_tgllp.cpp
  )
--- a/unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl
+++ b/unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl
@@ -12,9 +12,9 @@

 using namespace NEO;

-using HardwareCommandsGen12LpTests = ::testing::Test;
+using KernelTgllpTests = ::testing::Test;

-TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroundActiveWhenSettingKernelStartOffsetThenAdditionalOffsetIsSet) {
+TGLLPTEST_F(KernelTgllpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroundActiveWhenSettingKernelStartOffsetThenAdditionalOffsetIsSet) {
    const uint64_t defaultKernelStartOffset = 0;
    const uint64_t additionalOffsetDueToFfid = 0x1234;
    SPatchThreadPayload threadPayload{};
@@ -30,9 +30,7 @@ TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroun
        mockKernelWithInternals.kernelInfo.patchInfo.threadPayload = &threadPayload;

        for (auto isCcsUsed : ::testing::Bool()) {
-            uint64_t kernelStartOffset = defaultKernelStartOffset;
-            HardwareCommandsHelper<FamilyType>::setKernelStartOffset(kernelStartOffset, false, mockKernelWithInternals.kernelInfo, false,
-                                                                     false, *mockKernelWithInternals.mockKernel, isCcsUsed);
+            uint64_t kernelStartOffset = mockKernelWithInternals.mockKernel->getKernelStartOffset(false, false, isCcsUsed);

            if (stepping < REVISION_B && isCcsUsed) {
                EXPECT_EQ(defaultKernelStartOffset + additionalOffsetDueToFfid, kernelStartOffset);
@@ -41,4 +39,4 @@ TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroun
            }
        }
    }
-}
+}
--- a/unit_tests/helpers/hardware_commands_helper_tests.cpp
+++ b/unit_tests/helpers/hardware_commands_helper_tests.cpp
@@ -325,6 +325,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
        sizeof(INTERFACE_DESCRIPTOR_DATA));
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
@@ -332,6 +333,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
        ioh,
        ssh,
        *kernel,
+        kernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        IDToffset,
@@ -339,8 +341,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    // It's okay these are EXPECT_GE as they're only going to be used for
    // estimation purposes to avoid OOM.
@@ -378,12 +379,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
@@ -391,8 +395,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@@ -423,12 +426,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
@@ -436,8 +442,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
@@ -462,12 +467,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
@@ -475,8 +483,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@@ -536,12 +543,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
    MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(mockKernel);
+
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        mockKernel,
+        mockKernel.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        modifiedKernelInfo.getMaxSimdSize(),
        localWorkSizes,
        IDToffset,
@@ -549,8 +559,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
    numThreads = Math::divideAndRoundUp(numThreads, modifiedKernelInfo.getMaxSimdSize());
@@ -618,12 +627,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer)
    const_cast<KernelInfo &>(kernelInfo).requiresSshForBuffers = true;
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
+
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *kernel,
+        kernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
@@ -631,8 +643,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer)
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    EXPECT_EQ(0x00000000u, *(&bindingTableStatesPointers[0]));
    EXPECT_EQ(0x00000040u, *(&bindingTableStatesPointers[1]));
@@ -780,12 +791,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersF
        // push surfaces states and binding table to given ssh heap
        uint32_t interfaceDescriptorIndex = 0;
        auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+        auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*pKernel);
+
        HardwareCommandsHelper<FamilyType>::sendIndirectState(
            commandStream,
            dsh,
            ioh,
            ssh,
            *pKernel,
+            pKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
            pKernel->getKernelInfo().getMaxSimdSize(),
            localWorkSizes,
            0,
@@ -793,8 +807,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersF
            pDevice->getPreemptionMode(),
            pWalkerCmd,
            nullptr,
-            true,
-            isCcsUsed);
+            true);

        bti = reinterpret_cast<typename FamilyType::BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
        for (uint32_t i = 0; i < numSurfaces; ++i) {
@@ -859,7 +872,7 @@ HWTEST_F(HardwareCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequi
    EXPECT_EQ(0u, numSurfaceStates);

    // set binding table states
-    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    auto usedAfter = ssh.getUsed();
@@ -904,10 +917,10 @@ HWTEST_F(HardwareCommandsTest, setBindingTableStatesForNoSurfaces) {
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

-    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo);
+    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

-    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    SPatchBindingTableState bindingTableState;
@@ -918,7 +931,7 @@ HWTEST_F(HardwareCommandsTest, setBindingTableStatesForNoSurfaces) {
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;

-    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    pKernelInfo->patchInfo.bindingTableState = nullptr;
@@ -1060,12 +1073,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
+    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
+
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *mockKernelWithInternal->mockKernel,
+        mockKernelWithInternal->mockKernel->getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
        8,
        localWorkSizes,
        interfaceDescriptorTableOffset,
@@ -1073,8 +1089,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
-        true,
-        isCcsUsed);
+        true);

    bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0;
    EXPECT_TRUE(isMemorySame);
--- a/unit_tests/helpers/hardware_commands_helper_tests.h
+++ b/unit_tests/helpers/hardware_commands_helper_tests.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

 #include "runtime/built_ins/built_ins.h"
+#include "runtime/helpers/hardware_commands_helper.h"
 #include "runtime/kernel/kernel.h"
 #include "test.h"
 #include "unit_tests/fixtures/built_in_fixture.h"
@@ -39,4 +40,11 @@ struct HardwareCommandsTest : DeviceFixture,
    std::unique_ptr<MockKernelWithInternals> mockKernelWithInternal;
    Kernel::SimpleKernelArgInfo kernelArgInfo = {};
    std::vector<Kernel::SimpleKernelArgInfo> kernelArguments;
+
+    // Test shim: unpacks the kernel's binding-table/surface-state data for HardwareCommandsHelper; a null bindingTableState is passed as count 0.
+    template <typename GfxFamily>
+    size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
+        const auto *tableState = srcKernel.getKernelInfo().patchInfo.bindingTableState;
+        return HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, tableState ? tableState->Count : 0, srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(), srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
+    }
 };
--- a/unit_tests/helpers/hw_helper_tests.cpp
+++ b/unit_tests/helpers/hw_helper_tests.cpp
@@ -788,7 +788,8 @@ HWTEST_F(HwHelperTest, givenDefaultHwHelperHwWhenIsOffsetToSkipSetFFIDGPWARequir
    if (hardwareInfo.platform.eRenderCoreFamily == IGFX_GEN12LP_CORE) {
        GTEST_SKIP();
    }
-    EXPECT_FALSE(HwHelperHw<FamilyType>::isOffsetToSkipSetFFIDGPWARequired(hardwareInfo));
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+    EXPECT_FALSE(hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo));
 }

 HWTEST_F(HwHelperTest, givenDefaultHwHelperHwWhenIsForceDefaultRCSEngineWARequiredCalledThenFalseIsReturned) {
--- a/unit_tests/kernel/kernel_tests.cpp
+++ b/unit_tests/kernel/kernel_tests.cpp
@@ -766,6 +766,7 @@ TEST_F(KernelPrivateSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenPriv
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());

    program.setConstantSurface(nullptr);
    delete pKernel;
@@ -1014,6 +1015,7 @@ TEST_F(KernelGlobalSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenGloba
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());

    program.setGlobalSurface(nullptr);
    delete pKernel;
@@ -1188,6 +1190,7 @@ TEST_F(KernelConstantSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenCon
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());

    program.setConstantSurface(nullptr);
    delete pKernel;
@@ -2936,6 +2939,60 @@ TEST(KernelTest, GivenDifferentValuesWhenSetKernelExecutionTypeIsCalledThenCorre
    EXPECT_EQ(KernelExecutionType::Default, kernel.executionType);
 }

+TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsAdded) {
+    // Expected: allocation address + start offset (128) + OffsetToSkipPerThreadDataLoad (128).
+    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*clDevice);
+
+    SPatchThreadPayload payload = {};
+    payload.OffsetToSkipPerThreadDataLoad = 128u;
+    kernelWithInternals.kernelInfo.patchInfo.threadPayload = &payload;
+
+    kernelWithInternals.kernelInfo.createKernelAllocation(clDevice->getRootDeviceIndex(), clDevice->getMemoryManager());
+    auto kernelAllocation = kernelWithInternals.kernelInfo.getGraphicsAllocation();
+    const auto baseAddress = kernelAllocation->getGpuAddressToPatch();
+
+    kernelWithInternals.mockKernel->setStartOffset(128);
+    EXPECT_EQ(baseAddress + 256u, kernelWithInternals.mockKernel->getKernelStartOffset(false, true, false));
+    clDevice->getMemoryManager()->freeGraphicsMemory(kernelAllocation);
+}
+
+TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeTrueAndLocalIdsUsedWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsNotAdded) {
+    // Expected: allocation address + start offset (128) only; the 128-byte skip offset must not be applied.
+    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*clDevice);
+
+    SPatchThreadPayload payload = {};
+    payload.OffsetToSkipPerThreadDataLoad = 128u;
+    kernelWithInternals.kernelInfo.patchInfo.threadPayload = &payload;
+
+    kernelWithInternals.kernelInfo.createKernelAllocation(clDevice->getRootDeviceIndex(), clDevice->getMemoryManager());
+    auto kernelAllocation = kernelWithInternals.kernelInfo.getGraphicsAllocation();
+    const auto baseAddress = kernelAllocation->getGpuAddressToPatch();
+
+    kernelWithInternals.mockKernel->setStartOffset(128);
+    EXPECT_EQ(baseAddress + 128u, kernelWithInternals.mockKernel->getKernelStartOffset(true, true, false));
+    clDevice->getMemoryManager()->freeGraphicsMemory(kernelAllocation);
+}
+
+TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseAndLocalIdsNotUsedWhenGettingStartOffsetThenOffsetToSkipPerThreadDataLoadIsNotAdded) {
+    // Expected: allocation address + start offset (128) only, because the kernel is queried as not using local ids.
+    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    MockKernelWithInternals kernelWithInternals(*clDevice);
+
+    SPatchThreadPayload payload = {};
+    payload.OffsetToSkipPerThreadDataLoad = 128u;
+    kernelWithInternals.kernelInfo.patchInfo.threadPayload = &payload;
+
+    kernelWithInternals.kernelInfo.createKernelAllocation(clDevice->getRootDeviceIndex(), clDevice->getMemoryManager());
+    auto kernelAllocation = kernelWithInternals.kernelInfo.getGraphicsAllocation();
+    const auto baseAddress = kernelAllocation->getGpuAddressToPatch();
+
+    kernelWithInternals.mockKernel->setStartOffset(128);
+    EXPECT_EQ(baseAddress + 128u, kernelWithInternals.mockKernel->getKernelStartOffset(false, false, false));
+    clDevice->getMemoryManager()->freeGraphicsMemory(kernelAllocation);
+}
+
 namespace NEO {

 template <typename GfxFamily>