From bb62343aba9eb03dc29967a00fe3272817d8c1f4 Mon Sep 17 00:00:00 2001
From: "Zdanowicz, Zbigniew" <zbigniew.zdanowicz@intel.com>
Date: Wed, 3 Oct 2018 15:13:54 +0200
Subject: [PATCH] Add new parameters to thread data dispatching

Change-Id: I86710b0cc764156f4c2db9d24ccd1c96b32d7660
---
 runtime/command_queue/CMakeLists.txt          |   2 +
 runtime/command_queue/gpgpu_walker.h          |   4 +-
 runtime/command_queue/gpgpu_walker_base.inl   |  15 ++-
 runtime/command_queue/hardware_interface.inl  |  11 +-
 runtime/helpers/kernel_commands.h             |   6 +-
 runtime/helpers/kernel_commands.inl           |  21 +++-
 runtime/os_interface/DebugVariables_base.inl  |   6 +-
 runtime/program/process_gen_binary.cpp        |   3 +-
 .../get_size_required_buffer_tests.cpp        |   4 +-
 .../command_queue/work_group_size_tests.cpp   |   2 +-
 unit_tests/helpers/kernel_commands_tests.cpp  | 108 ++++++++++++++++--
 unit_tests/program/kernel_data.cpp            |   1 +
 unit_tests/test_files/igdrcl.config           |   1 +
 13 files changed, 160 insertions(+), 24 deletions(-)
diff --git a/runtime/command_queue/CMakeLists.txt b/runtime/command_queue/CMakeLists.txt
index 71424d67fc..88f1079132 100644
--- a/runtime/command_queue/CMakeLists.txt
+++ b/runtime/command_queue/CMakeLists.txt
@@ -34,6 +34,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
   ${CMAKE_CURRENT_SOURCE_DIR}/flush.h
   ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
   ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
+  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_base.inl
   ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h
   ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl
   ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl
@@ -46,3 +47,4 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
 )
 target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_COMMAND_QUEUE})
 set_property(GLOBAL PROPERTY RUNTIME_SRCS_COMMAND_QUEUE ${RUNTIME_SRCS_COMMAND_QUEUE})
+add_subdirectories()
diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h
index 67e49e5e3b..85772d4b9b 100644
--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -134,7 +134,9 @@ class GpgpuWalkerHelper {
         const size_t localWorkSizesIn[3],
         uint32_t simd,
         uint32_t workDim,
-        bool localIdsGeneration);
+        bool localIdsGenerationByRuntime,
+        bool kernelUsesLocalIds,
+        bool inlineDataProgrammingRequired);
 
     static void dispatchProfilingCommandsStart(
         HwTimeStamps &hwTimeStamps,
diff --git a/runtime/command_queue/gpgpu_walker_base.inl b/runtime/command_queue/gpgpu_walker_base.inl
index 0b93bd33e9..e4161f14f5 100644
--- a/runtime/command_queue/gpgpu_walker_base.inl
+++ b/runtime/command_queue/gpgpu_walker_base.inl
@@ -19,7 +19,9 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
     const size_t localWorkSizesIn[3],
     uint32_t simd,
     uint32_t workDim,
-    bool localIdsGeneration) {
+    bool localIdsGenerationByRuntime,
+    bool kernelUsesLocalIds,
+    bool inlineDataProgrammingRequired) {
     auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
 
     auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
@@ -127,7 +129,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
     auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
     *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
 
-    bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
+    bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
+    bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
+    bool kernelUsesLocalIds = KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(scheduler);
     KernelCommandsHelper<GfxFamily>::sendIndirectState(
         *commandStream,
         *dsh,
@@ -141,14 +145,17 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
         preemptionMode,
         pGpGpuWalkerCmd,
         nullptr,
-        localIdsGeneration);
+        localIdsGenerationByRuntime,
+        kernelUsesLocalIds,
+        inlineDataProgrammingRequired);
 
     // Implement enabling special WA DisableLSQCROPERFforOCL if needed
     GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
 
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
-    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
+    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes,
+                                                           simd, 1, localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired);
 
     // Implement disabling special WA DisableLSQCROPERFforOCL if needed
     GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl
index 5da5dd478a..5d08184d7b 100644
--- a/runtime/command_queue/hardware_interface.inl
+++ b/runtime/command_queue/hardware_interface.inl
@@ -198,7 +198,9 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
 
         auto idd = obtainInterfaceDescriptorData(walkerCmd);
 
-        bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes);
+        bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes);
+        bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
+        bool kernelUsesLocalIds = KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
         KernelCommandsHelper<GfxFamily>::sendIndirectState(
             *commandStream,
             *dsh,
@@ -212,13 +214,16 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
             preemptionMode,
             walkerCmd,
             idd,
-            localIdsGeneration);
+            localIdsGenerationByRuntime,
+            kernelUsesLocalIds,
+            inlineDataProgrammingRequired);
 
         size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
         size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
         size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
         GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
-                                                               numWorkGroups, localWorkSizes, simd, dim, localIdsGeneration);
+                                                               numWorkGroups, localWorkSizes, simd, dim,
+                                                               localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired);
 
         dispatchWorkarounds(commandStream, commandQueue, kernel, false);
         currentDispatchIndex++;
diff --git a/runtime/helpers/kernel_commands.h b/runtime/helpers/kernel_commands.h
index 8b24629352..2238cc97cc 100644
--- a/runtime/helpers/kernel_commands.h
+++ b/runtime/helpers/kernel_commands.h
@@ -95,7 +95,9 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
         PreemptionMode preemptionMode,
         WALKER_TYPE<GfxFamily> *walkerCmd,
         INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
-        bool localIdsGeneration);
+        bool localIdsGenerationByRuntime,
+        bool kernelUsesLocalIds,
+        bool inlineDataProgrammingRequired);
 
     static size_t getSizeRequiredCS();
     static bool isPipeControlWArequired();
@@ -160,5 +162,7 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
     static bool doBindingTablePrefetch();
 
     static bool isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws);
+    static bool inlineDataProgrammingRequired(const Kernel &kernel);
+    static bool kernelUsesLocalIds(const Kernel &kernel);
 };
 } // namespace OCLRT
diff --git a/runtime/helpers/kernel_commands.inl b/runtime/helpers/kernel_commands.inl
index be3dabb30b..b15761eb40 100644
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -296,7 +296,9 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
     PreemptionMode preemptionMode,
     WALKER_TYPE<GfxFamily> *walkerCmd,
     INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
-    bool localIdsGeneration) {
+    bool localIdsGenerationByRuntime,
+    bool kernelUsesLocalIds,
+    bool inlineDataProgrammingRequired) {
     using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
 
     DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32);
@@ -460,4 +462,21 @@ template <typename GfxFamily>
 bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) {
     return true;
 }
+
+// Returns true only when EnablePassInlineData is set, the thread payload sets PassInlineData and cross-thread data does not exceed sizeof(GRF).
+template <typename GfxFamily>
+bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
+    if (!DebugManager.flags.EnablePassInlineData.get()) {
+        return false;
+    }
+    return kernel.getKernelInfo().patchInfo.threadPayload->PassInlineData && kernel.getCrossThreadDataSize() <= sizeof(GRF);
+}
+
+// Reports whether the kernel's thread payload marks a local id as present in any of the X, Y or Z dimensions.
+template <typename GfxFamily>
+bool KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) {
+    const auto &threadPayload = *kernel.getKernelInfo().patchInfo.threadPayload;
+    return threadPayload.LocalIDXPresent || threadPayload.LocalIDYPresent || threadPayload.LocalIDZPresent;
+}
+
 } // namespace OCLRT
diff --git a/runtime/os_interface/DebugVariables_base.inl b/runtime/os_interface/DebugVariables_base.inl
index 6b777f450f..e0a9f59e1f 100644
--- a/runtime/os_interface/DebugVariables_base.inl
+++ b/runtime/os_interface/DebugVariables_base.inl
@@ -12,9 +12,9 @@ DECLARE_DEBUG_VARIABLE(std::string, ProductFamilyOverride, std::string("unk"), "
 DECLARE_DEBUG_VARIABLE(std::string, ForceCompilerUsePlatform, std::string("unk"), "Specify product for use in compiler interface")
 DECLARE_DEBUG_VARIABLE(std::string, AUBDumpCaptureFileName, std::string("unk"), "Name of file to save AUB capture into")
 DECLARE_DEBUG_VARIABLE(std::string, AUBDumpFilterKernelName, std::string("unk"), "Name of kernel to AUB capture")
+DECLARE_DEBUG_VARIABLE(std::string, AUBDumpToggleFileName, std::string("unk"), "Name of file to save AUB in toggle mode")
 DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterNamedKernelStartIdx, 0, "Start index of named kernel to AUB capture")
 DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterNamedKernelEndIdx, -1, "End index of named kernel to AUB capture")
-DECLARE_DEBUG_VARIABLE(std::string, AUBDumpToggleFileName, std::string("unk"), "Name of file to save AUB in toggle mode")
 DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpSubCaptureMode, 0, "AUB dump subcapture mode (off, toggle, filter)")
 DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterKernelStartIdx, 0, "Start index of kernel to AUB capture")
 DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterKernelEndIdx, -1, "End index of kernel to AUB capture")
@@ -81,6 +81,8 @@ DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for me
 DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables diffrent algorithm to compute local work size")
 DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible")
 DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls")
+DECLARE_DEBUG_VARIABLE(bool, AddClGlSharing, false, "Add cl-gl extension")
+DECLARE_DEBUG_VARIABLE(bool, EnablePassInlineData, false, "Enable passing of inline data")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableStatelessToStatefulBufferOffsetOpt, -1, "-1: dont override, 0: disable, 1: enable, Enables buffer-offset improvement of the stateless to stateful optimization")
 DECLARE_DEBUG_VARIABLE(int32_t, CreateMultipleDevices, 0, "0: default - disable, 1+: Driver will create multiple (N) devices during initialization.")
 DECLARE_DEBUG_VARIABLE(int32_t, Enable64kbpages, -1, "-1: default behaviour, 0 Disables, 1 Enables support for 64KB pages for driver allocated fine grain svm buffers")
@@ -92,7 +94,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideEnableQuickKmdSleepForSporadicWaits, -1,
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideDelayQuickKmdSleepForSporadicWaitsMicroseconds, -1, "-1: dont override, >0: timeout in microseconds")
 DECLARE_DEBUG_VARIABLE(int32_t, CsrDispatchMode, 0, "Chooses DispatchMode for Csr")
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideDefaultFP64Settings, -1, "-1: dont override, 0: disable, 1: enable.")
-DECLARE_DEBUG_VARIABLE(bool, AddClGlSharing, false, "Add cl-gl extension")
+
 /*DRIVER TOGGLES*/
 DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")
 DECLARE_DEBUG_VARIABLE(int32_t, ForcePreemptionMode, -1, "Keep this variable in sync with PreemptionMode enum. -1 - devices default mode, 1 - disable, 2 - midBatch, 3 " "- threadGroup, 4 - midThread")
diff --git a/runtime/program/process_gen_binary.cpp b/runtime/program/process_gen_binary.cpp
index 484429acf6..a27a29cde0 100644
--- a/runtime/program/process_gen_binary.cpp
+++ b/runtime/program/process_gen_binary.cpp
@@ -489,7 +489,8 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
                     "\n  .GetLocalIDPresent", kernelInfo.patchInfo.threadPayload->GetLocalIDPresent,
                     "\n  .GetGroupIDPresent", kernelInfo.patchInfo.threadPayload->GetGroupIDPresent,
                     "\n  .GetGlobalOffsetPresent", kernelInfo.patchInfo.threadPayload->GetGlobalOffsetPresent,
-                    "\n  .OffsetToSkipPerThreadDataLoad", kernelInfo.patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad);
+                    "\n  .OffsetToSkipPerThreadDataLoad", kernelInfo.patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad,
+                    "\n  .PassInlineData", kernelInfo.patchInfo.threadPayload->PassInlineData);
             break;
 
         case PATCH_TOKEN_EXECUTION_ENVIRONMENT:
diff --git a/unit_tests/command_queue/get_size_required_buffer_tests.cpp b/unit_tests/command_queue/get_size_required_buffer_tests.cpp
index e98cd9462f..74d9562f54 100644
--- a/unit_tests/command_queue/get_size_required_buffer_tests.cpp
+++ b/unit_tests/command_queue/get_size_required_buffer_tests.cpp
@@ -392,7 +392,7 @@ HWTEST_F(GetSizeRequiredBufferTest, enqueueKernelHelloWorld) {
     auto iohAfter = pIOH->getUsed();
     auto sshAfter = pSSH->getUsed();
 
-    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, nullptr);
+    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel);
     auto expectedSizeDSH = KernelCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
     auto expectedSizeIOH = KernelCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]);
     auto expectedSizeSSH = KernelCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
@@ -431,7 +431,7 @@ HWTEST_F(GetSizeRequiredBufferTest, enqueueKernelSimpleArg) {
     auto iohAfter = pIOH->getUsed();
     auto sshAfter = pSSH->getUsed();
 
-    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, nullptr);
+    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel);
     auto expectedSizeDSH = KernelCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
     auto expectedSizeIOH = KernelCommandsHelper<FamilyType>::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]);
     auto expectedSizeSSH = KernelCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
diff --git a/unit_tests/command_queue/work_group_size_tests.cpp b/unit_tests/command_queue/work_group_size_tests.cpp
index 883ff7e5b6..9d9e42f101 100644
--- a/unit_tests/command_queue/work_group_size_tests.cpp
+++ b/unit_tests/command_queue/work_group_size_tests.cpp
@@ -85,7 +85,7 @@ struct WorkGroupSizeBase {
             (workItems[1] + workGroupSize[1] - 1) / workGroupSize[1],
             (workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]};
         GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum,
-                                                                workGroupSize, simdSize, dims, true);
+                                                                workGroupSize, simdSize, dims, true, false, false);
 
         //And check if it is programmed correctly
         auto numWorkItems = computeWalkerWorkItems<FamilyType>(pCmd);
diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp
index 1110be5b65..3594c55cb0 100644
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@@ -323,7 +323,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage)
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
 
     // It's okay these are EXPECT_GE as they're only going to be used for
     // estimation purposes to avoid OOM.
@@ -375,7 +377,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
 
     auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
     if (KernelCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@@ -420,7 +424,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
 
     auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
     EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
@@ -459,7 +465,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
 
     auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
     if (KernelCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@@ -531,7 +539,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
+
     size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
     numThreads = (numThreads + modifiedKernelInfo.getMaxSimdSize() - 1) / modifiedKernelInfo.getMaxSimdSize();
     size_t expectedIohSize = ((modifiedKernelInfo.getMaxSimdSize() == 32) ? 32 : 16) * 3 * numThreads * sizeof(uint16_t);
@@ -609,7 +620,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) {
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
 
     EXPECT_EQ(0x00000000u, *(&bindingTableStatesPointers[0]));
     EXPECT_EQ(0x00000040u, *(&bindingTableStatesPointers[1]));
@@ -769,7 +782,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersFor
             pDevice->getPreemptionMode(),
             pWalkerCmd,
             nullptr,
-            true);
+            true,
+            true,
+            false);
 
         bti = reinterpret_cast<typename FamilyType::BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
         for (uint32_t i = 0; i < numSurfaces; ++i) {
@@ -1009,7 +1024,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir
         pDevice->getPreemptionMode(),
         pWalkerCmd,
         nullptr,
-        true);
+        true,
+        true,
+        false);
 
     bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0;
     EXPECT_TRUE(isMemorySame);
@@ -1166,3 +1183,78 @@ INSTANTIATE_TEST_CASE_P(ParentKernelCommandsFromBinaryTest,
                         ::testing::Combine(
                             ::testing::Values(binaryFile),
                             ::testing::ValuesIn(KernelNames)));
+
+HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAndCrossThreadSizeLesserEqualThanGrfThenReturnTrue) {
+    DebugManagerStateRestore restore;
+    DebugManager.flags.EnablePassInlineData.set(true);
+
+    // Value-initialized so setCrossThreadData is never handed indeterminate bytes.
+    uint32_t crossThreadData[8] = {};
+    MockKernelWithInternals mockKernelWithInternal(*pDevice);
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 1;
+    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
+
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
+}
+
+HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineAndCrossThreadSizeLesserEqualThanGrfThenReturnFalse) {
+    DebugManagerStateRestore restore;
+    DebugManager.flags.EnablePassInlineData.set(true);
+
+    // Value-initialized so setCrossThreadData is never handed indeterminate bytes.
+    uint32_t crossThreadData[8] = {};
+    MockKernelWithInternals mockKernelWithInternal(*pDevice);
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 0;
+    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
+
+    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
+}
+
+HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAndCrossThreadSizeGreaterThanGrfThenReturnFalse) {
+    DebugManagerStateRestore restore;
+    DebugManager.flags.EnablePassInlineData.set(true);
+
+    // Value-initialized so setCrossThreadData is never handed indeterminate bytes.
+    uint32_t crossThreadData[16] = {};
+    MockKernelWithInternals mockKernelWithInternal(*pDevice);
+    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 1;
+    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
+
+    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
+}
+
+HWTEST_F(KernelCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) {
+    MockKernelWithInternals kernelWithInternals(*pDevice);
+    auto threadPayload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
+    threadPayload->LocalIDXPresent = 1; // only the X flag is set
+    threadPayload->LocalIDYPresent = 0;
+    threadPayload->LocalIDZPresent = 0;
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel));
+}
+
+HWTEST_F(KernelCommandsTest, whenLocalIdxInYDimPresentThenExpectLocalIdsInUseIsTrue) {
+    MockKernelWithInternals kernelWithInternals(*pDevice);
+    auto threadPayload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
+    threadPayload->LocalIDXPresent = 0;
+    threadPayload->LocalIDYPresent = 1; // only the Y flag is set
+    threadPayload->LocalIDZPresent = 0;
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel));
+}
+
+HWTEST_F(KernelCommandsTest, whenLocalIdxInZDimPresentThenExpectLocalIdsInUseIsTrue) {
+    MockKernelWithInternals kernelWithInternals(*pDevice);
+    auto threadPayload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
+    threadPayload->LocalIDXPresent = 0;
+    threadPayload->LocalIDYPresent = 0;
+    threadPayload->LocalIDZPresent = 1; // only the Z flag is set
+    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel));
+}
+
+HWTEST_F(KernelCommandsTest, whenLocalIdxAreNotPresentThenExpectLocalIdsInUseIsFalse) {
+    MockKernelWithInternals kernelWithInternals(*pDevice);
+    auto threadPayload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
+    threadPayload->LocalIDXPresent = 0; // no local id flag is set in any dimension
+    threadPayload->LocalIDYPresent = 0;
+    threadPayload->LocalIDZPresent = 0;
+    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel));
+}
diff --git a/unit_tests/program/kernel_data.cpp b/unit_tests/program/kernel_data.cpp
index d6ee4c2751..985575a66b 100644
--- a/unit_tests/program/kernel_data.cpp
+++ b/unit_tests/program/kernel_data.cpp
@@ -239,6 +239,7 @@ TEST_F(KernelDataTest, ThreadPayload) {
     threadPayload.LocalIDYPresent = true;
     threadPayload.LocalIDZPresent = true;
     threadPayload.OffsetToSkipPerThreadDataLoad = true;
+    threadPayload.PassInlineData = true;
 
     pPatchList = &threadPayload;
     patchListSize = threadPayload.Size;
diff --git a/unit_tests/test_files/igdrcl.config b/unit_tests/test_files/igdrcl.config
index 44f6e6e54f..4c24a7b0ab 100644
--- a/unit_tests/test_files/igdrcl.config
+++ b/unit_tests/test_files/igdrcl.config
@@ -86,3 +86,4 @@ EnableTimestampPacket = false
 ReturnRawGpuTimestamps = 0
 DoNotRegisterTrimCallback = false
 AddClGlSharing = 0
+EnablePassInlineData = false