From b6b92ae808a2590af4cccf30709dcbc652652ab5 Mon Sep 17 00:00:00 2001
From: "Zdanowicz, Zbigniew" <zbigniew.zdanowicz@intel.com>
Date: Fri, 30 Mar 2018 17:57:51 +0200
Subject: [PATCH] Create GpgpuWalkerHelper class

Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b
---
 runtime/command_queue/CMakeLists.txt          |   5 +-
 runtime/command_queue/command_queue_hw.inl    |   4 +-
 .../command_queue/dispatch_walker_helper.h    |  56 ---
 .../command_queue/dispatch_walker_helper.inl  |  99 -----
 runtime/command_queue/enqueue_common.h        |   6 +-
 runtime/command_queue/enqueue_kernel.h        |   6 +-
 runtime/command_queue/enqueue_marker.h        |   4 +-
 .../enqueue_migrate_mem_objects.h             |   4 +-
 runtime/command_queue/gpgpu_walker.h          | 371 ++++++++++++++++++
 .../{dispatch_walker.h => gpgpu_walker.inl}   | 332 +++++-----------
 .../command_stream_receiver_hw.inl            |   2 +-
 runtime/command_stream/preemption.inl         |   2 +-
 runtime/device_queue/device_queue_hw.inl      |  13 +-
 runtime/enable_gens.cmake                     |   1 +
 runtime/gen8/command_queue.cpp                |  43 +-
 runtime/gen8/gpgpu_walker.cpp                 |  71 ++++
 runtime/gen9/command_queue.cpp                |  43 +-
 runtime/gen9/gpgpu_walker.cpp                 |  71 ++++
 runtime/helpers/dispatch_info_builder.h       |   8 +-
 .../command_queue/dispatch_walker_tests.cpp   |  63 +--
 .../get_size_required_buffer_tests.cpp        |   4 +-
 .../get_size_required_image_tests.cpp         |   2 +-
 .../command_queue/local_work_size_tests.cpp   |   4 +-
 .../command_queue/work_group_size_tests.cpp   |   4 +-
 .../command_stream_receiver_hw_tests.cpp      |   2 +-
 unit_tests/context/driver_diagnostics_tests.h |   2 +-
 .../device_queue/device_queue_hw_tests.cpp    |   8 +-
 .../enqueue_execution_model_kernel_tests.cpp  |   4 +-
 .../parent_kernel_dispatch_tests.cpp          | 208 +++++-----
 .../scheduler_dispatch_tests.cpp              |   6 +-
 unit_tests/gen8/scheduler_dispatch_tests.cpp  |   4 +-
 unit_tests/gen9/test_device_queue_hw.cpp      |   4 +-
 unit_tests/libult/mock_gfx_family.cpp         |  96 +++++
 unit_tests/libult/mock_gfx_family.h           | 303 ++++++++++++++
 34 files changed, 1209 insertions(+), 646 deletions(-)
 delete mode 100644 runtime/command_queue/dispatch_walker_helper.h
 delete mode 100644 runtime/command_queue/dispatch_walker_helper.inl
 create mode 100644 runtime/command_queue/gpgpu_walker.h
 rename runtime/command_queue/{dispatch_walker.h => gpgpu_walker.inl} (70%)
 create mode 100644 runtime/gen8/gpgpu_walker.cpp
 create mode 100644 runtime/gen9/gpgpu_walker.cpp

diff --git a/runtime/command_queue/CMakeLists.txt b/runtime/command_queue/CMakeLists.txt
index bd395ee888..4d1d481d4d 100644
--- a/runtime/command_queue/CMakeLists.txt
+++ b/runtime/command_queue/CMakeLists.txt
@@ -25,9 +25,6 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
   ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h
   ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.inl
   ${CMAKE_CURRENT_SOURCE_DIR}/cpu_data_transfer_handler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.inl
   ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_barrier.h
   ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_common.h
   ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_copy_buffer.h
@@ -49,6 +46,8 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
   ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_image.h
   ${CMAKE_CURRENT_SOURCE_DIR}/finish.h
   ${CMAKE_CURRENT_SOURCE_DIR}/flush.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
   ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
   ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
diff --git a/runtime/command_queue/command_queue_hw.inl b/runtime/command_queue/command_queue_hw.inl
index 0a54a00811..5ded683e3d 100644
--- a/runtime/command_queue/command_queue_hw.inl
+++ b/runtime/command_queue/command_queue_hw.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,7 +20,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/enqueue_barrier.h"
 #include "runtime/command_queue/enqueue_copy_buffer.h"
 #include "runtime/command_queue/enqueue_copy_buffer_rect.h"
diff --git a/runtime/command_queue/dispatch_walker_helper.h b/runtime/command_queue/dispatch_walker_helper.h
deleted file mode 100644
index 3be20da5b9..0000000000
--- a/runtime/command_queue/dispatch_walker_helper.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017, Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#pragma once
-
-namespace OCLRT {
-
-constexpr int NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;
-
-constexpr int L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
-constexpr int L3SQC_REG4 = 0xB118;
-
-constexpr int GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF;
-constexpr int GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;
-
-constexpr int CS_GPR_R0 = 0x2600;
-constexpr int CS_GPR_R1 = 0x2608;
-
-constexpr int ALU_OPCODE_LOAD = 0x080;
-constexpr int ALU_OPCODE_STORE = 0x180;
-constexpr int ALU_OPCODE_OR = 0x103;
-constexpr int ALU_OPCODE_AND = 0x102;
-
-constexpr int ALU_REGISTER_R_0 = 0x0;
-constexpr int ALU_REGISTER_R_1 = 0x1;
-constexpr int ALU_REGISTER_R_SRCA = 0x20;
-constexpr int ALU_REGISTER_R_SRCB = 0x21;
-constexpr int ALU_REGISTER_R_ACCU = 0x31;
-
-constexpr unsigned int GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;
-
-template <typename GfxFamily>
-void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode);
-
-template <typename GfxFamily>
-size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
-} // namespace OCLRT
diff --git a/runtime/command_queue/dispatch_walker_helper.inl b/runtime/command_queue/dispatch_walker_helper.inl
deleted file mode 100644
index 6f663208a0..0000000000
--- a/runtime/command_queue/dispatch_walker_helper.inl
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2017, Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "runtime/command_queue/dispatch_walker_helper.h"
-
-namespace OCLRT {
-
-// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
-template <typename GfxFamily>
-void addAluReadModifyWriteRegister(
-    OCLRT::LinearStream *pCommandStream,
-    uint32_t aluRegister,
-    uint32_t operation,
-    uint32_t mask) {
-    // Load "Register" value into CS_GPR_R0
-    typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
-    typedef typename GfxFamily::MI_MATH MI_MATH;
-    typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
-    auto pCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
-    *pCmd = MI_LOAD_REGISTER_REG::sInit();
-    pCmd->setSourceRegisterAddress(aluRegister);
-    pCmd->setDestinationRegisterAddress(CS_GPR_R0);
-
-    // Load "Mask" into CS_GPR_R1
-    typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
-    auto pCmd2 = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
-    *pCmd2 = MI_LOAD_REGISTER_IMM::sInit();
-    pCmd2->setRegisterOffset(CS_GPR_R1);
-    pCmd2->setDataDword(mask);
-
-    // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
-    auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
-    // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
-    pCmd3++;
-    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
-
-    // Setup first operand of MI_MATH - load CS_GPR_R0 into register A
-    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
-    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
-    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
-    pAluParam++;
-
-    // Setup second operand of MI_MATH - load CS_GPR_R1 into register B
-    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
-    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
-    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
-    pAluParam++;
-
-    // Setup third operand of MI_MATH - "Operation" on registers A and B
-    pAluParam->DW0.BitField.ALUOpcode = operation;
-    pAluParam->DW0.BitField.Operand1 = 0;
-    pAluParam->DW0.BitField.Operand2 = 0;
-    pAluParam++;
-
-    // Setup fourth operand of MI_MATH - store result into CS_GPR_R0
-    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
-    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
-    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
-
-    // LOAD value of CS_GPR_R0 into "Register"
-    auto pCmd4 = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
-    *pCmd4 = MI_LOAD_REGISTER_REG::sInit();
-    pCmd4->setSourceRegisterAddress(CS_GPR_R0);
-    pCmd4->setDestinationRegisterAddress(aluRegister);
-
-    // Add PIPE_CONTROL to flush caches
-    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
-    auto pCmd5 = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
-    *pCmd5 = PIPE_CONTROL::sInit();
-    pCmd5->setCommandStreamerStallEnable(true);
-    pCmd5->setDcFlushEnable(true);
-    pCmd5->setTextureCacheInvalidationEnable(true);
-    pCmd5->setPipeControlFlushEnable(true);
-    pCmd5->setStateCacheInvalidationEnable(true);
-}
-} // namespace OCLRT
diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h
index c271e08a2b..6529099fac 100644
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -24,7 +24,7 @@
 #include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
 #include "hw_cmds.h"
 #include "runtime/command_queue/command_queue_hw.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_stream/command_stream_receiver.h"
 #include "runtime/event/event_builder.h"
 #include "runtime/gtpin/gtpin_notify.h"
@@ -243,7 +243,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
             }
         }
 
-        dispatchWalker<GfxFamily>(
+        GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
             *this,
             multiDispatchInfo,
             numEventsInWaitList,
@@ -293,7 +293,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                               this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
                               devQueueHw->getDebugQueue());
 
-            dispatchScheduler<GfxFamily>(
+            GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
                 *this,
                 *devQueueHw,
                 preemption,
diff --git a/runtime/command_queue/enqueue_kernel.h b/runtime/command_queue/enqueue_kernel.h
index 0dc3f0e88e..52c09264f8 100644
--- a/runtime/command_queue/enqueue_kernel.h
+++ b/runtime/command_queue/enqueue_kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,7 @@
 #include "hw_cmds.h"
 #include "runtime/command_queue/command_queue_hw.h"
 #include "runtime/command_stream/command_stream_receiver.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/helpers/kernel_commands.h"
 #include "runtime/helpers/task_information.h"
 #include "runtime/mem_obj/buffer.h"
@@ -69,7 +69,7 @@ struct EnqueueOperation<GfxFamily, CL_COMMAND_NDRANGE_KERNEL> {
             //user registers
             size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
         }
-        size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
+        size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
 
         return size;
     }
diff --git a/runtime/command_queue/enqueue_marker.h b/runtime/command_queue/enqueue_marker.h
index de28fda17c..29eef91778 100644
--- a/runtime/command_queue/enqueue_marker.h
+++ b/runtime/command_queue/enqueue_marker.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,7 @@
 #include "hw_cmds.h"
 #include "runtime/command_queue/command_queue_hw.h"
 #include "runtime/command_stream/command_stream_receiver.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/device/device.h"
 #include "runtime/event/event.h"
 #include "runtime/memory_manager/surface.h"
diff --git a/runtime/command_queue/enqueue_migrate_mem_objects.h b/runtime/command_queue/enqueue_migrate_mem_objects.h
index ad3971f1f7..28accd667e 100644
--- a/runtime/command_queue/enqueue_migrate_mem_objects.h
+++ b/runtime/command_queue/enqueue_migrate_mem_objects.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,7 @@
 #include "hw_cmds.h"
 #include "runtime/command_queue/command_queue_hw.h"
 #include "runtime/command_stream/command_stream_receiver.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/device/device.h"
 #include "runtime/event/event.h"
 #include "runtime/memory_manager/surface.h"
diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h
new file mode 100644
index 0000000000..37f8e0ceee
--- /dev/null
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/built_ins/built_ins.h"
+#include "runtime/context/context.h"
+#include "runtime/command_queue/command_queue.h"
+#include "runtime/command_stream/linear_stream.h"
+#include "runtime/command_stream/preemption.h"
+#include "runtime/device_queue/device_queue_hw.h"
+#include "runtime/event/hw_timestamps.h"
+#include "runtime/event/perf_counter.h"
+#include "runtime/helpers/dispatch_info.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/task_information.h"
+#include "runtime/indirect_heap/indirect_heap.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/program/kernel_info.h"
+#include "runtime/utilities/vec.h"
+
+namespace OCLRT {
+
+constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4; // number of MI_MATH_ALU_INST_INLINE operands in the read-modify-write MI_MATH sequence (see addAluReadModifyWriteRegister)
+// NOTE(review): presumably the L3SQC_REG4 register offset and its LQSC read-only perf-disable bit, used by the WADisableLSQCROPERFforOCL workaround - confirm against the hardware spec.
+constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
+constexpr int32_t L3SQC_REG4 = 0xB118;
+// Cookie values. 0xFFFFFFFF does not fit in int32_t, so the stored value is the converted result (-1 on two's-complement targets).
+constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF;
+constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;
+// Register addresses used as MI_LOAD_REGISTER_REG / MI_LOAD_REGISTER_IMM source and destination by addAluReadModifyWriteRegister.
+constexpr int32_t CS_GPR_R0 = 0x2600;
+constexpr int32_t CS_GPR_R1 = 0x2608;
+// Values written to the ALUOpcode field of MI_MATH_ALU_INST_INLINE.
+constexpr int32_t ALU_OPCODE_LOAD = 0x080;
+constexpr int32_t ALU_OPCODE_STORE = 0x180;
+constexpr int32_t ALU_OPCODE_OR = 0x103;
+constexpr int32_t ALU_OPCODE_AND = 0x102;
+// Values written to the Operand1 / Operand2 fields of MI_MATH_ALU_INST_INLINE.
+constexpr int32_t ALU_REGISTER_R_0 = 0x0;
+constexpr int32_t ALU_REGISTER_R_1 = 0x1;
+constexpr int32_t ALU_REGISTER_R_SRCA = 0x20;
+constexpr int32_t ALU_REGISTER_R_SRCB = 0x21;
+constexpr int32_t ALU_REGISTER_R_ACCU = 0x31;
+// NOTE(review): the name suggests the low dword of a thread-time register offset; its use is not visible here - confirm.
+constexpr uint32_t GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;
+
+void computeWorkgroupSize1D( // declaration only; workGroupSize is the only non-const array, presumably the output - confirm in the definition
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize);
+// ND variant: takes a WorkSizeInfo by value and an explicit workDim instead of maxWorkGroupSize / simdSize.
+void computeWorkgroupSizeND(
+    WorkSizeInfo wsInfo,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    const uint32_t workDim);
+// Same parameter list as computeWorkgroupSize1D.
+void computeWorkgroupSize2D(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize);
+// Same parameters as the 1D / 2D variants plus workDim.
+void computeWorkgroupSizeSquared(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize,
+    const uint32_t workDim);
+// The next two declarations take a whole DispatchInfo and return a Vec3<size_t> by value.
+Vec3<size_t> computeWorkgroupSize(
+    const DispatchInfo &dispatchInfo);
+
+Vec3<size_t> generateWorkgroupSize(
+    const DispatchInfo &dispatchInfo);
+// computeWorkgroupsNumber and generateWorkgroupsNumber share the (gws, lws) signature. NOTE(review): how they differ is not visible here.
+Vec3<size_t> computeWorkgroupsNumber(
+    const Vec3<size_t> gws,
+    const Vec3<size_t> lws);
+
+Vec3<size_t> generateWorkgroupsNumber(
+    const Vec3<size_t> gws,
+    const Vec3<size_t> lws);
+// Overload taking the DispatchInfo instead of explicit gws / lws.
+Vec3<size_t> generateWorkgroupsNumber(
+    const DispatchInfo &dispatchInfo);
+
+inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) { // larger simplified dimension of size and offset, never below 1
+    return std::max(std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()), 1U);
+}
+
+Vec3<size_t> canonizeWorkgroup( // declaration only; takes and returns a Vec3<size_t> by value
+    Vec3<size_t> workgroup);
+// Declaration only; returns nothing and takes dispatchInfo by value. NOTE(review): presumably reports hints through context - confirm in the definition.
+void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
+
+inline cl_uint computeDimensions(const size_t workItems[3]) { // 3 if axis 2 has more than one work item, else 2 if axis 1 does, else 1
+    return (workItems[2] <= 1) ? ((workItems[1] <= 1) ? 1 : 2) : 3;
+}
+
+template <typename SizeAndAllocCalcT, typename... CalcArgsT>
+IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
+    size_t heapSize = calc(std::forward<CalcArgsT>(args)...); // the caller-supplied callable decides the size
+    size_t pageAlignment = MemoryConstants::pageSize;
+    return new IndirectHeap(alignedMalloc(heapSize, pageAlignment), heapSize); // heap object is created with new and handed back as a raw pointer
+}
+
+template <typename GfxFamily>
+class GpgpuWalkerHelper { // static-only helper templated on the GFX family; member definitions are not in this header
+  public:
+    static void addAluReadModifyWriteRegister( // Register = Register <operation> mask, emitted as commands into pCommandStream
+        LinearStream *pCommandStream,
+        uint32_t aluRegister,
+        uint32_t operation,
+        uint32_t mask);
+    // NOTE(review): body not visible here - presumably emits the LSQC RO perf workaround for the given kernel; confirm in the definition.
+    static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream,
+                                               const Kernel &kernel,
+                                               bool disablePerfMode);
+    // Command-stream size for the workaround above; EnqueueOperation's size calculators add it per kernel.
+    static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
+    // NOTE(review): presumably programs thread / work-group fields of *pCmd from the sizes below; the meaning of the returned size_t is not visible here.
+    static size_t setGpgpuWalkerThreadData(
+        typename GfxFamily::GPGPU_WALKER *pCmd,
+        const size_t globalOffsets[3],
+        const size_t startWorkGroups[3],
+        const size_t numWorkGroups[3],
+        const size_t localWorkSizesIn[3],
+        uint32_t simd);
+    // Profiling start / end pair; both take the same HwTimeStamps and command stream.
+    static void dispatchProfilingCommandsStart(
+        HwTimeStamps &hwTimeStamps,
+        OCLRT::LinearStream *commandStream);
+
+    static void dispatchProfilingCommandsEnd(
+        HwTimeStamps &hwTimeStamps,
+        OCLRT::LinearStream *commandStream);
+    // The perf-counter entry points below share (commandQueue, hwPerfCounter, commandStream); 'start' presumably selects begin vs. end capture - confirm.
+    static void dispatchPerfCountersNoopidRegisterCommands(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        bool start);
+
+    static void dispatchPerfCountersReadFreqRegisterCommands(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        bool start);
+
+    static void dispatchPerfCountersGeneralPurposeCounterCommands(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        bool start);
+
+    static void dispatchPerfCountersUserCounterCommands(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        bool start);
+    // The remaining perf-counter entry points take no 'start' flag.
+    static void dispatchPerfCountersOABufferStateCommands(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream);
+
+    static void dispatchPerfCountersCommandsStart(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream);
+
+    static void dispatchPerfCountersCommandsEnd(
+        CommandQueue &commandQueue,
+        OCLRT::HwPerfCounter &hwPerfCounter,
+        OCLRT::LinearStream *commandStream);
+    // Multi-dispatch overload; commandType defaults to 0.
+    static void dispatchWalker(
+        CommandQueue &commandQueue,
+        const MultiDispatchInfo &multiDispatchInfo,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        KernelOperation **blockedCommandsData,
+        HwTimeStamps *hwTimeStamps,
+        OCLRT::HwPerfCounter *hwPerfCounter,
+        PreemptionMode preemptionMode,
+        bool blockQueue,
+        unsigned int commandType = 0);
+    // Single-kernel overload: explicit workDim, offsets and sizes instead of a MultiDispatchInfo, and no commandType.
+    static void dispatchWalker(
+        CommandQueue &commandQueue,
+        const Kernel &kernel,
+        cl_uint workDim,
+        const size_t globalOffsets[3],
+        const size_t workItems[3],
+        const size_t *localWorkSizesIn,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        KernelOperation **blockedCommandsData,
+        HwTimeStamps *hwTimeStamps,
+        HwPerfCounter *hwPerfCounter,
+        PreemptionMode preemptionMode,
+        bool blockQueue);
+    // Takes the device queue of the same GfxFamily and the scheduler kernel; called from CommandQueueHw::enqueueHandler.
+    static void dispatchScheduler(
+        CommandQueue &commandQueue,
+        DeviceQueueHw<GfxFamily> &devQueueHw,
+        PreemptionMode preemptionMode,
+        SchedulerKernel &scheduler);
+};
+
+template <typename GfxFamily, uint32_t eventType>
+struct EnqueueOperation { // primary template: command-stream size calculators for event types without a dedicated specialization
+    static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
+    static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
+    static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
+    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { // fixed overhead once, then per-dispatch walker and workaround sizes
+        size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
+                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1); // a second PIPE_CONTROL is reserved when the pipe-control WA is required
+        if (reserveProfilingCmdsSpace) {
+            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); // NOTE(review): 4 MI_STORE_REGISTER_MEM here vs. 2 in getSizeRequiredCS - confirm this is intended
+        }
+        if (reservePerfCounters) {
+            // start cmds
+            // PIPE_CONTROL x2: flush CS & TimeStamp BEGIN
+            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
+            // MI_STORE_REGISTER_MEM x2: NOOPID & Frequency
+            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // general purpose registers: one MI_STORE_REGISTER_MEM each
+            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // report perf count
+            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+            // user registers: one MI_STORE_REGISTER_MEM each, count supplied by the command queue
+            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+
+            // end cmds
+            // PIPE_CONTROL x2: flush CS & TimeStamp END
+            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
+            // OA buffer (status head, tail): MI_STORE_REGISTER_MEM x3
+            size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // report perf count
+            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+            // general purpose registers: one MI_STORE_REGISTER_MEM each
+            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // MI_STORE_REGISTER_MEM x2: NOOPID & Frequency
+            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // user registers: one MI_STORE_REGISTER_MEM each, count supplied by the command queue
+            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        }
+        Device &device = commandQueue.getDevice();
+        for (auto &dispatchInfo : multiDispatchInfo) { // per dispatch: one GPGPU_WALKER plus the LSQC and preemption workaround sizes
+            auto &kernel = *dispatchInfo.getKernel();
+            size += sizeof(typename GfxFamily::GPGPU_WALKER);
+            size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(&kernel);
+            size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
+        }
+        return size;
+    }
+    // Size for a single kernel: one GPGPU_WALKER plus the same fixed overhead, the preemption WA size and the LSQC WA size.
+    static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
+        size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
+                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1); // a second PIPE_CONTROL is reserved when the pipe-control WA is required
+        size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
+        if (reserveProfilingCmdsSpace) {
+            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        }
+        if (reservePerfCounters) {
+            // start cmds
+            // PIPE_CONTROL x2: flush CS & TimeStamp BEGIN
+            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
+            // MI_STORE_REGISTER_MEM x2: NOOPID & Frequency
+            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // general purpose registers: one MI_STORE_REGISTER_MEM each
+            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // report perf count
+            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+            // user registers: one MI_STORE_REGISTER_MEM each, count supplied by the command queue
+            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+
+            // end cmds
+            // PIPE_CONTROL x2: flush CS & TimeStamp END
+            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
+            // OA buffer (status head, tail): MI_STORE_REGISTER_MEM x3
+            size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // report perf count
+            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+            // general purpose registers: one MI_STORE_REGISTER_MEM each
+            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // MI_STORE_REGISTER_MEM x2: NOOPID & Frequency
+            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+            // user registers: one MI_STORE_REGISTER_MEM each, count supplied by the command queue
+            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        }
+        size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
+
+        return size;
+    }
+};
+
+template <typename GfxFamily, uint32_t eventType>
+LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
+    // Request the queue's command stream, passing the size computed for this single kernel.
+    return commandQueue.getCS(EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel));
+}
+
+template <typename GfxFamily, uint32_t eventType>
+LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
+    size_t totalSizeCS = 0; // sum of the per-kernel sizes of every dispatch, plus the scheduler kernel when the first kernel is a parent kernel
+    for (auto &info : multiDispatchInfo) {
+        totalSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, info.getKernel());
+    }
+    Kernel *firstKernel = (multiDispatchInfo.size() == 0) ? nullptr : multiDispatchInfo.begin()->getKernel(); // empty list: nothing to inspect
+    if (firstKernel != nullptr && firstKernel->isParentKernel) {
+        SchedulerKernel &schedulerKernel = BuiltIns::getInstance().getSchedulerKernel(firstKernel->getContext());
+        totalSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &schedulerKernel);
+    }
+    return commandQueue.getCS(totalSizeCS);
+}
+
+template <typename GfxFamily, IndirectHeap::Type heapType>
+IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
+    // Returns the heap of type heapType for these dispatches; expectedSize comes from KernelCommandsHelper's total-size query for that heap type.
+    size_t expectedSize = 0;
+    IndirectHeap *ih = nullptr;
+    // clang-format off
+    switch (heapType) {
+    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
+    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
+    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
+    }
+    // clang-format on
+    // An empty MultiDispatchInfo has no first kernel: begin() must not be dereferenced (same guard as getCommandStream).
+    Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
+    if (parentKernel && parentKernel->isParentKernel) {
+        if (heapType == IndirectHeap::SURFACE_STATE) {
+            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*parentKernel));
+        } else { // DYNAMIC_STATE or INDIRECT_OBJECT: both are served by the default device queue's DYNAMIC_STATE heap, expectedSize is not used
+            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
+            DEBUG_BREAK_IF(pDevQueue == nullptr);
+            ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
+        }
+    }
+
+    if (ih == nullptr) {
+        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
+    }
+    return *ih;
+}
+
+} // namespace OCLRT
diff --git a/runtime/command_queue/dispatch_walker.h b/runtime/command_queue/gpgpu_walker.inl
similarity index 70%
rename from runtime/command_queue/dispatch_walker.h
rename to runtime/command_queue/gpgpu_walker.inl
index 40bbb13bd0..c178bb2ac9 100644
--- a/runtime/command_queue/dispatch_walker.h
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -21,24 +21,17 @@
  */
 
 #pragma once
-#include "runtime/context/context.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/local_id_gen.h"
-#include "runtime/command_queue/command_queue.h"
-#include "runtime/command_queue/dispatch_walker_helper.h"
 #include "runtime/command_stream/command_stream_receiver.h"
-#include "runtime/command_stream/preemption.h"
 #include "runtime/device/device_info.h"
-#include "runtime/device_queue/device_queue_hw.h"
 #include "runtime/event/perf_counter.h"
 #include "runtime/event/user_event.h"
 #include "runtime/indirect_heap/indirect_heap.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/debug_helpers.h"
 #include "runtime/helpers/kernel_commands.h"
-#include "runtime/helpers/task_information.h"
 #include "runtime/helpers/validators.h"
-#include "runtime/helpers/dispatch_info.h"
-#include "runtime/kernel/kernel.h"
 #include "runtime/mem_obj/mem_obj.h"
 #include "runtime/memory_manager/graphics_allocation.h"
 #include <algorithm>
@@ -46,57 +39,81 @@
 
 namespace OCLRT {
 
-void computeWorkgroupSize1D(
-    uint32_t maxWorkGroupSize,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    size_t simdSize);
+// Emits commands that read-modify-write a register: aluRegister = aluRegister <operation> mask
+template <typename GfxFamily>
+void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
+    OCLRT::LinearStream *pCommandStream,
+    uint32_t aluRegister,
+    uint32_t operation,
+    uint32_t mask) {
+    // Step 1: copy the current value of aluRegister into CS_GPR_R0
+    typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
+    typedef typename GfxFamily::MI_MATH MI_MATH;
+    typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
+    auto pCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
+    *pCmd = MI_LOAD_REGISTER_REG::sInit();
+    pCmd->setSourceRegisterAddress(aluRegister);
+    pCmd->setDestinationRegisterAddress(CS_GPR_R0);
 
-void computeWorkgroupSizeND(
-    WorkSizeInfo wsInfo,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    const uint32_t workDim);
+    // Step 2: load the immediate mask into CS_GPR_R1
+    typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
+    auto pCmd2 = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
+    *pCmd2 = MI_LOAD_REGISTER_IMM::sInit();
+    pCmd2->setRegisterOffset(CS_GPR_R1);
+    pCmd2->setDataDword(mask);
 
-void computeWorkgroupSize2D(
-    uint32_t maxWorkGroupSize,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    size_t simdSize);
+    // Step 3: reserve one MI_MATH header followed by NUM_ALU_INST_FOR_READ_MODIFY_WRITE inline ALU instructions
+    auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    // DwordLength = total DWORDs - 2: (1 MI_MATH + 4 ALU instructions) - 2 = 3, assuming NUM_ALU_INST_FOR_READ_MODIFY_WRITE is 4 - TODO confirm
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
+    pCmd3++; // advance one uint32_t so the ALU instructions are written right after the MI_MATH header DWORD
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
 
-void computeWorkgroupSizeSquared(
-    uint32_t maxWorkGroupSize,
-    size_t workGroupSize[3],
-    const size_t workItems[3],
-    size_t simdSize,
-    const uint32_t workDim);
+    // ALU instruction 1 of 4: LOAD register R_0 into source A
+    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
+    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
+    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
+    pAluParam++;
 
-Vec3<size_t> computeWorkgroupSize(
-    const DispatchInfo &dispatchInfo);
+    // ALU instruction 2 of 4: LOAD register R_1 (the mask) into source B
+    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
+    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
+    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
+    pAluParam++;
 
-Vec3<size_t> generateWorkgroupSize(
-    const DispatchInfo &dispatchInfo);
+    // ALU instruction 3 of 4: apply the caller-supplied opcode to sources A and B
+    pAluParam->DW0.BitField.ALUOpcode = operation;
+    pAluParam->DW0.BitField.Operand1 = 0; // both operand fields are left at zero for this instruction
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
 
-Vec3<size_t> computeWorkgroupsNumber(
-    const Vec3<size_t> gws,
-    const Vec3<size_t> lws);
+    // ALU instruction 4 of 4: STORE the accumulator into register R_0
+    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
+    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
+    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
 
-Vec3<size_t> generateWorkgroupsNumber(
-    const Vec3<size_t> gws,
-    const Vec3<size_t> lws);
+    // Step 4: copy the result from CS_GPR_R0 back into aluRegister
+    auto pCmd4 = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
+    *pCmd4 = MI_LOAD_REGISTER_REG::sInit();
+    pCmd4->setSourceRegisterAddress(CS_GPR_R0);
+    pCmd4->setDestinationRegisterAddress(aluRegister);
 
-Vec3<size_t> generateWorkgroupsNumber(
-    const DispatchInfo &dispatchInfo);
-
-Vec3<size_t> canonizeWorkgroup(
-    Vec3<size_t> workgroup);
-
-inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
-    return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
+    // Step 5: PIPE_CONTROL with CS stall, DC flush, pipe-control flush, and texture/state cache invalidation
+    typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
+    auto pCmd5 = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
+    *pCmd5 = PIPE_CONTROL::sInit();
+    pCmd5->setCommandStreamerStallEnable(true);
+    pCmd5->setDcFlushEnable(true);
+    pCmd5->setTextureCacheInvalidationEnable(true);
+    pCmd5->setPipeControlFlushEnable(true);
+    pCmd5->setStateCacheInvalidationEnable(true);
 }
 
 template <typename GfxFamily>
-inline size_t setGpgpuWalkerThreadData(
+inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
     typename GfxFamily::GPGPU_WALKER *pCmd,
     const size_t globalOffsets[3],
     const size_t startWorkGroups[3],
@@ -132,21 +149,8 @@ inline size_t setGpgpuWalkerThreadData(
     return localWorkSize;
 }
 
-inline cl_uint computeDimensions(const size_t workItems[3]) {
-    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
-}
-
-void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
-
-template <typename SizeAndAllocCalcT, typename... CalcArgsT>
-IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
-    size_t alignment = MemoryConstants::pageSize;
-    size_t size = calc(std::forward<CalcArgsT>(args)...);
-    return new IndirectHeap(alignedMalloc(size, alignment), size);
-}
-
 template <typename GfxFamily>
-void dispatchProfilingCommandsStart(
+void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
     HwTimeStamps &hwTimeStamps,
     OCLRT::LinearStream *commandStream) {
     using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
@@ -173,7 +177,7 @@ void dispatchProfilingCommandsStart(
 }
 
 template <typename GfxFamily>
-void dispatchProfilingCommandsEnd(
+void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
     HwTimeStamps &hwTimeStamps,
     OCLRT::LinearStream *commandStream) {
 
@@ -196,7 +200,7 @@ void dispatchProfilingCommandsEnd(
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersNoopidRegisterCommands(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream,
@@ -214,7 +218,7 @@ void dispatchPerfCountersNoopidRegisterCommands(
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersReadFreqRegisterCommands(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream,
@@ -232,7 +236,7 @@ void dispatchPerfCountersReadFreqRegisterCommands(
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersGeneralPurposeCounterCommands(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream,
@@ -256,7 +260,7 @@ void dispatchPerfCountersGeneralPurposeCounterCommands(
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersUserCounterCommands(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream,
@@ -297,7 +301,7 @@ void dispatchPerfCountersUserCounterCommands(
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersOABufferStateCommands(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream) {
@@ -328,7 +332,7 @@ void dispatchPerfCountersOABufferStateCommands(
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersCommandsStart(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream) {
@@ -347,12 +351,12 @@ void dispatchPerfCountersCommandsStart(
     pPipeControlCmd->setCommandStreamerStallEnable(true);
 
     //Store value of NOOPID register
-    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true);
 
     //Read Core Frequency
-    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true);
 
-    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
 
     auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
     *pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
@@ -369,13 +373,13 @@ void dispatchPerfCountersCommandsStart(
     pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
     pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
 
-    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
 
     commandQueue.sendPerfCountersConfig();
 }
 
 template <typename GfxFamily>
-void dispatchPerfCountersCommandsEnd(
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
     CommandQueue &commandQueue,
     OCLRT::HwPerfCounter &hwPerfCounter,
     OCLRT::LinearStream *commandStream) {
@@ -394,7 +398,7 @@ void dispatchPerfCountersCommandsEnd(
     *pPipeControlCmd = PIPE_CONTROL::sInit();
     pPipeControlCmd->setCommandStreamerStallEnable(true);
 
-    dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream);
 
     //Timestamp: Global End
     pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
@@ -411,21 +415,21 @@ void dispatchPerfCountersCommandsEnd(
     address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa));
     pReportPerfCount->setMemoryAddress(address);
 
-    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
 
     //Store value of NOOPID register
-    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false);
 
     //Read Core Frequency
-    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false);
 
-    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
 
     perfCounters->setCpuTimestamp();
 }
 
 template <typename GfxFamily>
-void dispatchWalker(
+void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
     CommandQueue &commandQueue,
     const MultiDispatchInfo &multiDispatchInfo,
     cl_uint numEventsInWaitList,
@@ -435,7 +439,7 @@ void dispatchWalker(
     OCLRT::HwPerfCounter *hwPerfCounter,
     PreemptionMode preemptionMode,
     bool blockQueue,
-    unsigned int commandType = 0) {
+    unsigned int commandType) {
 
     OCLRT::LinearStream *commandStream = nullptr;
     OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
@@ -586,17 +590,17 @@ void dispatchWalker(
         if (&dispatchInfo == &*multiDispatchInfo.begin()) {
             // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
             if (hwTimeStamps != nullptr) {
-                dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
+                GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
             }
             if (hwPerfCounter != nullptr) {
-                dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
+                GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
             }
         }
 
         PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
 
         // Implement enabling special WA DisableLSQCROPERFforOCL if needed
-        applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
+        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, true);
 
         // Program the walker.  Invokes execution so all state should already be programmed
         typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
@@ -606,7 +610,7 @@ void dispatchWalker(
         size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
         size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
         size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
-        auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
+        auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
 
         pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
         DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
@@ -627,22 +631,22 @@ void dispatchWalker(
         pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
 
         // Implement disabling special WA DisableLSQCROPERFforOCL if needed
-        applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
+        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, false);
 
         PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
     }
 
     // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
     if (hwTimeStamps != nullptr) {
-        dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
+        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
     }
     if (hwPerfCounter != nullptr) {
-        dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
+        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
     }
 }
 
 template <typename GfxFamily>
-void dispatchWalker(
+void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
     CommandQueue &commandQueue,
     const Kernel &kernel,
     cl_uint workDim,
@@ -658,12 +662,12 @@ void dispatchWalker(
     bool blockQueue) {
 
     DispatchInfo dispatchInfo(const_cast<Kernel *>(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets);
-    dispatchWalker<GfxFamily>(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
-                              blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue);
+    GpgpuWalkerHelper<GfxFamily>::dispatchWalker(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
+                                                 blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue);
 }
 
 template <typename GfxFamily>
-void dispatchScheduler(
+void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
     CommandQueue &commandQueue,
     DeviceQueueHw<GfxFamily> &devQueueHw,
     PreemptionMode preemptionMode,
@@ -752,7 +756,7 @@ void dispatchScheduler(
         preemptionMode);
 
     // Implement enabling special WA DisableLSQCROPERFforOCL if needed
-    applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
 
     // Program the walker.  Invokes execution so all state should already be programmed
     auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
@@ -760,7 +764,7 @@ void dispatchScheduler(
 
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
-    auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
+    auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
 
     pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
     DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
@@ -781,7 +785,7 @@ void dispatchScheduler(
     pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
 
     // Implement disabling special WA DisableLSQCROPERFforOCL if needed
-    applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
 
     // Do not put BB_START only when returning in first Scheduler run
     if (devQueueHw.getSchedulerReturnInstance() != 1) {
@@ -797,141 +801,13 @@ void dispatchScheduler(
     }
 }
 
-template <typename GfxFamily, unsigned int eventType>
-struct EnqueueOperation {
-    static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
-    static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
-    static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
-    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
-        size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
-                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
-        if (reserveProfilingCmdsSpace) {
-            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        }
-        if (reservePerfCounters) {
-            //start cmds
-            //P_C: flush CS & TimeStamp BEGIN
-            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
-            //SRM NOOPID & Frequency
-            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //gp registers
-            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //report perf count
-            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
-            //user registers
-            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-
-            //end cmds
-            //P_C: flush CS & TimeStamp END;
-            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
-            //OA buffer (status head, tail)
-            size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //report perf count
-            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
-            //gp registers
-            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //SRM NOOPID & Frequency
-            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //user registers
-            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        }
-        Device &device = commandQueue.getDevice();
-        for (auto &dispatchInfo : multiDispatchInfo) {
-            auto &kernel = *dispatchInfo.getKernel();
-            size += sizeof(typename GfxFamily::GPGPU_WALKER);
-            size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
-            size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
-        }
-        return size;
-    }
-
-    static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
-        size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
-                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
-        size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
-        if (reserveProfilingCmdsSpace) {
-            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        }
-        if (reservePerfCounters) {
-            //start cmds
-            //P_C: flush CS & TimeStamp BEGIN
-            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
-            //SRM NOOPID & Frequency
-            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //gp registers
-            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //report perf count
-            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
-            //user registers
-            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-
-            //end cmds
-            //P_C: flush CS & TimeStamp END;
-            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
-            //OA buffer (status head, tail)
-            size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //report perf count
-            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
-            //gp registers
-            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //SRM NOOPID & Frequency
-            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-            //user registers
-            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        }
-        size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
-
-        return size;
-    }
-};
-
-template <typename GfxFamily, unsigned int eventType>
-LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
-    auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
-    return commandQueue.getCS(expectedSizeCS);
+template <typename GfxFamily> // Generic definition: empty body, nothing is written to pCommandStream.
+void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { // NOTE(review): presumably specialized per gen where the WA applies - confirm
 }
 
-template <typename GfxFamily, unsigned int eventType>
-LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
-    size_t expectedSizeCS = 0;
-    Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
-    for (auto &dispatchInfo : multiDispatchInfo) {
-        expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
-    }
-    if (parentKernel && parentKernel->isParentKernel) {
-        SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
-        expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
-    }
-    return commandQueue.getCS(expectedSizeCS);
+template <typename GfxFamily> // Generic definition: reports zero extra size for the DisableLSQCROPERFforOCL WA.
+size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
+    return (size_t)0; // pKernel is not examined; matches the empty generic applyWADisableLSQCROPERFforOCL
 }
 
-template <typename GfxFamily, IndirectHeap::Type heapType>
-IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
-    size_t expectedSize = 0;
-    IndirectHeap *ih = nullptr;
-
-    // clang-format off
-    switch(heapType) {
-    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
-    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
-    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
-    }
-    // clang-format on
-
-    if (multiDispatchInfo.begin()->getKernel()->isParentKernel) {
-        if (heapType == IndirectHeap::SURFACE_STATE) {
-            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
-        } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
-        {
-            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
-            DEBUG_BREAK_IF(pDevQueue == nullptr);
-            ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
-        }
-    }
-
-    if (ih == nullptr)
-        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
-
-    return *ih;
-}
 } // namespace OCLRT
diff --git a/runtime/command_stream/command_stream_receiver_hw.inl b/runtime/command_stream/command_stream_receiver_hw.inl
index 2c20ef9930..bbf857ca93 100644
--- a/runtime/command_stream/command_stream_receiver_hw.inl
+++ b/runtime/command_stream/command_stream_receiver_hw.inl
@@ -32,7 +32,7 @@
 #include "runtime/memory_manager/memory_manager.h"
 #include "runtime/os_interface/debug_settings_manager.h"
 #include "runtime/command_stream/preemption.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "command_stream_receiver_hw.h"
 
 namespace OCLRT {
diff --git a/runtime/command_stream/preemption.inl b/runtime/command_stream/preemption.inl
index fe5c978b1b..8ed136d2fe 100644
--- a/runtime/command_stream/preemption.inl
+++ b/runtime/command_stream/preemption.inl
@@ -24,7 +24,7 @@
 #include "runtime/built_ins/sip.h"
 #include "runtime/command_stream/preemption.h"
 #include "runtime/device/device.h"
-#include "runtime/command_queue/dispatch_walker_helper.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/memory_manager/graphics_allocation.h"
 
 namespace OCLRT {
diff --git a/runtime/device_queue/device_queue_hw.inl b/runtime/device_queue/device_queue_hw.inl
index 5a0d953af9..2535a8e3d6 100644
--- a/runtime/device_queue/device_queue_hw.inl
+++ b/runtime/device_queue/device_queue_hw.inl
@@ -22,8 +22,7 @@
 
 #pragma once
 #include "runtime/device_queue/device_queue_hw.h"
-#include "runtime/command_queue/dispatch_walker.h"
-#include "runtime/command_queue/dispatch_walker_helper.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/helpers/kernel_commands.h"
 #include "runtime/helpers/preamble.h"
 #include "runtime/helpers/string.h"
@@ -217,7 +216,7 @@ void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKer
     offset = slbCS.getUsed();
 
     igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
-    applyWADisableLSQCROPERFforOCL<GfxFamily>(&slbCS, *parentKernel, true);
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);
 
     using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
     using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
@@ -388,10 +387,10 @@ size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &sc
 
 template <typename GfxFamily>
 void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
-    OCLRT::dispatchScheduler<GfxFamily>(cmdQ,
-                                        *this,
-                                        preemptionMode,
-                                        scheduler);
+    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(cmdQ,
+                                                    *this,
+                                                    preemptionMode,
+                                                    scheduler);
     return;
 }
 
diff --git a/runtime/enable_gens.cmake b/runtime/enable_gens.cmake
index 4f0873ab97..b487c38020 100644
--- a/runtime/enable_gens.cmake
+++ b/runtime/enable_gens.cmake
@@ -37,6 +37,7 @@ set(RUNTIME_SRCS_GENX_BASE
   device_enqueue.h
   device_queue.cpp
   command_stream_receiver_hw.cpp
+  gpgpu_walker.cpp
   hw_cmds.h
   hw_cmds_generated.h
   hw_helper.cpp
diff --git a/runtime/gen8/command_queue.cpp b/runtime/gen8/command_queue.cpp
index 2e1ed96d25..d66f48ae84 100644
--- a/runtime/gen8/command_queue.cpp
+++ b/runtime/gen8/command_queue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -23,8 +23,6 @@
 #include "runtime/memory_manager/svm_memory_manager.h"
 #include "runtime/command_queue/command_queue_hw.h"
 #include "runtime/command_queue/command_queue_hw.inl"
-#include "runtime/command_queue/dispatch_walker_helper.h"
-#include "runtime/command_queue/dispatch_walker_helper.inl"
 
 namespace OCLRT {
 
@@ -37,43 +35,4 @@ void populateFactoryTable<CommandQueueHw<Family>>() {
     commandQueueFactory[gfxCore] = CommandQueueHw<Family>::create;
 }
 
-template <>
-void applyWADisableLSQCROPERFforOCL<Family>(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
-    if (disablePerfMode) {
-        if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
-            // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
-            addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
-        }
-    } else {
-        if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
-            // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
-            typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
-            auto pCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
-            *pCmd = PIPE_CONTROL::sInit();
-            pCmd->setCommandStreamerStallEnable(true);
-            // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
-            addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
-        }
-    }
-}
-
-template <>
-size_t getSizeForWADisableLSQCROPERFforOCL<Family>(const Kernel *pKernel) {
-    typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
-    typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
-    typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
-    typedef typename Family::MI_MATH MI_MATH;
-    typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
-    size_t n = 0;
-    if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
-        n += sizeof(PIPE_CONTROL) +
-             (2 * sizeof(MI_LOAD_REGISTER_REG) +
-              sizeof(MI_LOAD_REGISTER_IMM) +
-              sizeof(PIPE_CONTROL) +
-              sizeof(MI_MATH) +
-              NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) *
-                 2; // For 2 WADisableLSQCROPERFforOCL WAs
-    }
-    return n;
-}
 } // namespace OCLRT
diff --git a/runtime/gen8/gpgpu_walker.cpp b/runtime/gen8/gpgpu_walker.cpp
new file mode 100644
index 0000000000..c8ee90c25d
--- /dev/null
+++ b/runtime/gen8/gpgpu_walker.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/gen8/hw_info.h"
+#include "runtime/command_queue/gpgpu_walker.h"
+#include "runtime/command_queue/gpgpu_walker.inl"
+
+namespace OCLRT {
+
+// BDW WADisableLSQCROPERFforOCL: sets (disablePerfMode) or clears the LQSC_RO_PERF_DIS bit in L3SQC_REG4.
+template <>
+void GpgpuWalkerHelper<BDWFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
+    // Nothing is programmed unless the kernel uses fences for read-write images
+    if (!kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
+        return;
+    }
+    if (disablePerfMode) {
+        // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
+        GpgpuWalkerHelper<BDWFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
+        return;
+    }
+    // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
+    auto pCmd = reinterpret_cast<BDWFamily::PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(BDWFamily::PIPE_CONTROL)));
+    *pCmd = BDWFamily::PIPE_CONTROL::sInit();
+    pCmd->setCommandStreamerStallEnable(true);
+    // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
+    GpgpuWalkerHelper<BDWFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
+}
+
+// Byte size accounted for the BDW WADisableLSQCROPERFforOCL commands; 0 without a kernel or when it does not use fences for read-write images.
+template <>
+size_t GpgpuWalkerHelper<BDWFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
+    using MI_LOAD_REGISTER_REG = BDWFamily::MI_LOAD_REGISTER_REG;
+    using MI_LOAD_REGISTER_IMM = BDWFamily::MI_LOAD_REGISTER_IMM;
+    using PIPE_CONTROL = BDWFamily::PIPE_CONTROL;
+    using MI_MATH = BDWFamily::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = BDWFamily::MI_MATH_ALU_INST_INLINE;
+    if ((pKernel == nullptr) || !pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
+        return 0;
+    }
+    const size_t singleWaSize = 2 * sizeof(MI_LOAD_REGISTER_REG) +
+                                sizeof(MI_LOAD_REGISTER_IMM) +
+                                sizeof(PIPE_CONTROL) +
+                                sizeof(MI_MATH) +
+                                NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);
+    // Extra PIPE_CONTROL plus 2 WADisableLSQCROPERFforOCL WAs
+    return sizeof(PIPE_CONTROL) + singleWaSize * 2;
+}
+
+template class GpgpuWalkerHelper<BDWFamily>;
+
+} // namespace OCLRT
diff --git a/runtime/gen9/command_queue.cpp b/runtime/gen9/command_queue.cpp
index e5fab4e824..42ab5dd0db 100644
--- a/runtime/gen9/command_queue.cpp
+++ b/runtime/gen9/command_queue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -23,8 +23,6 @@
 #include "runtime/memory_manager/svm_memory_manager.h"
 #include "runtime/command_queue/command_queue_hw.h"
 #include "runtime/command_queue/command_queue_hw.inl"
-#include "runtime/command_queue/dispatch_walker_helper.h"
-#include "runtime/command_queue/dispatch_walker_helper.inl"
 
 namespace OCLRT {
 
@@ -37,43 +35,4 @@ void populateFactoryTable<CommandQueueHw<Family>>() {
     commandQueueFactory[gfxCore] = CommandQueueHw<Family>::create;
 }
 
-template <>
-void applyWADisableLSQCROPERFforOCL<Family>(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
-    if (disablePerfMode) {
-        if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
-            // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
-            addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
-        }
-    } else {
-        if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
-            // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
-            typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
-            auto pCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
-            *pCmd = PIPE_CONTROL::sInit();
-            pCmd->setCommandStreamerStallEnable(true);
-            // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
-            addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
-        }
-    }
-}
-
-template <>
-size_t getSizeForWADisableLSQCROPERFforOCL<Family>(const Kernel *pKernel) {
-    typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
-    typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
-    typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
-    typedef typename Family::MI_MATH MI_MATH;
-    typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
-    size_t n = 0;
-    if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
-        n += sizeof(PIPE_CONTROL) +
-             (2 * sizeof(MI_LOAD_REGISTER_REG) +
-              sizeof(MI_LOAD_REGISTER_IMM) +
-              sizeof(PIPE_CONTROL) +
-              sizeof(MI_MATH) +
-              NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) *
-                 2; // For 2 WADisableLSQCROPERFforOCL WAs
-    }
-    return n;
-}
 } // namespace OCLRT
diff --git a/runtime/gen9/gpgpu_walker.cpp b/runtime/gen9/gpgpu_walker.cpp
new file mode 100644
index 0000000000..668ee963ae
--- /dev/null
+++ b/runtime/gen9/gpgpu_walker.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/gen9/hw_cmds_base.h"
+#include "runtime/command_queue/gpgpu_walker.h"
+#include "runtime/command_queue/gpgpu_walker.inl"
+
+namespace OCLRT {
+
+// SKL WADisableLSQCROPERFforOCL: sets (disablePerfMode) or clears the LQSC_RO_PERF_DIS bit in L3SQC_REG4.
+template <>
+void GpgpuWalkerHelper<SKLFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
+    // Nothing is programmed unless the kernel uses fences for read-write images
+    if (!kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
+        return;
+    }
+    if (disablePerfMode) {
+        // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
+        GpgpuWalkerHelper<SKLFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
+        return;
+    }
+    // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
+    auto pCmd = reinterpret_cast<SKLFamily::PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(SKLFamily::PIPE_CONTROL)));
+    *pCmd = SKLFamily::PIPE_CONTROL::sInit();
+    pCmd->setCommandStreamerStallEnable(true);
+    // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
+    GpgpuWalkerHelper<SKLFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
+}
+
+// Byte size accounted for the SKL WADisableLSQCROPERFforOCL commands; 0 without a kernel or when it does not use fences for read-write images.
+template <>
+size_t GpgpuWalkerHelper<SKLFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
+    using MI_LOAD_REGISTER_REG = SKLFamily::MI_LOAD_REGISTER_REG;
+    using MI_LOAD_REGISTER_IMM = SKLFamily::MI_LOAD_REGISTER_IMM;
+    using PIPE_CONTROL = SKLFamily::PIPE_CONTROL;
+    using MI_MATH = SKLFamily::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = SKLFamily::MI_MATH_ALU_INST_INLINE;
+    if ((pKernel == nullptr) || !pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
+        return 0;
+    }
+    const size_t singleWaSize = 2 * sizeof(MI_LOAD_REGISTER_REG) +
+                                sizeof(MI_LOAD_REGISTER_IMM) +
+                                sizeof(PIPE_CONTROL) +
+                                sizeof(MI_MATH) +
+                                NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);
+    // Extra PIPE_CONTROL plus 2 WADisableLSQCROPERFforOCL WAs
+    return sizeof(PIPE_CONTROL) + singleWaSize * 2;
+}
+
+template class GpgpuWalkerHelper<SKLFamily>;
+
+} // namespace OCLRT
diff --git a/runtime/helpers/dispatch_info_builder.h b/runtime/helpers/dispatch_info_builder.h
index d71009260e..c0eb7b3c52 100644
--- a/runtime/helpers/dispatch_info_builder.h
+++ b/runtime/helpers/dispatch_info_builder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,7 @@
 
 #include "runtime/helpers/dispatch_info.h"
 #include "runtime/kernel/kernel.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 
 namespace OCLRT {
 
@@ -67,7 +67,7 @@ enum class RegionCoordZ : uint32_t {
     Middle = 1,
     Back = 2
 };
-}
+} // namespace SplitDispatch
 
 // Compute power in compile time
 static constexpr uint32_t powConst(uint32_t base, uint32_t currExp) {
@@ -453,4 +453,4 @@ class DispatchInfoBuilder {
         return x % y ? 1 : 0;
     }
 };
-}
+} // namespace OCLRT
diff --git a/unit_tests/command_queue/dispatch_walker_tests.cpp b/unit_tests/command_queue/dispatch_walker_tests.cpp
index dc1647ba0d..f6827c553c 100644
--- a/unit_tests/command_queue/dispatch_walker_tests.cpp
+++ b/unit_tests/command_queue/dispatch_walker_tests.cpp
@@ -21,13 +21,14 @@
  */
 
 #include "test.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/event/perf_counter.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/kernel_commands.h"
 #include "runtime/helpers/task_information.h"
 #include "unit_tests/fixtures/device_fixture.h"
 #include "unit_tests/command_queue/command_queue_fixture.h"
+#include "unit_tests/libult/mock_gfx_family.h"
 #include "unit_tests/helpers/hw_parse.h"
 #include "unit_tests/helpers/debug_manager_state_restore.h"
 #include "unit_tests/mocks/mock_kernel.h"
@@ -137,7 +138,7 @@ HWTEST_F(DispatchWalkerTest, shouldntChangeCommandStreamMemory) {
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {1, 1, 1};
     cl_uint dimensions = 1;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -185,7 +186,7 @@ HWTEST_F(DispatchWalkerTest, noLocalIdsShouldntCrash) {
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {1, 1, 1};
     cl_uint dimensions = 1;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -214,7 +215,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithDefaultLwsAlgorithm)
     size_t workItems[3] = {1, 1, 1};
     for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
         workItems[dimension - 1] = 256;
-        dispatchWalker<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(
             *pCmdQ,
             kernel,
             dimension,
@@ -244,7 +245,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithSquaredLwsAlgorithm)
     size_t workItems[3] = {1, 1, 1};
     for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
         workItems[dimension - 1] = 256;
-        dispatchWalker<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(
             *pCmdQ,
             kernel,
             dimension,
@@ -273,7 +274,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithNDLwsAlgorithm) {
     size_t workItems[3] = {1, 1, 1};
     for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
         workItems[dimension - 1] = 256;
-        dispatchWalker<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(
             *pCmdQ,
             kernel,
             dimension,
@@ -303,7 +304,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithOldLwsAlgorithm) {
     size_t workItems[3] = {1, 1, 1};
     for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
         workItems[dimension - 1] = 256;
-        dispatchWalker<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(
             *pCmdQ,
             kernel,
             dimension,
@@ -332,7 +333,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNumWorkGroups) {
     size_t workItems[3] = {2, 5, 10};
     size_t workGroupSize[3] = {1, 1, 1};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -363,7 +364,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeND) {
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {2, 5, 10};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -394,7 +395,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeND) {
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {2, 5, 10};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -426,7 +427,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) {
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {2, 5, 10};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -458,7 +459,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquaredAn
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {2, 5, 10};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -488,7 +489,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSize) {
     size_t workItems[3] = {2, 5, 10};
     size_t workGroupSize[3] = {1, 2, 3};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -521,7 +522,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizes) {
     size_t workItems[3] = {2, 5, 10};
     size_t workGroupSize[3] = {1, 2, 3};
     cl_uint dimensions = 3;
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -561,7 +562,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizeForSplitKernel) {
 
     MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -604,7 +605,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizesForSplitWalker) {
 
     MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -646,7 +647,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerDoesntConsumeCommandStreamWhenQueueIs
 
     KernelOperation *blockedCommandsData = nullptr;
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -686,7 +687,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW
 
     KernelOperation *blockedCommandsData = nullptr;
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         kernel,
         dimensions,
@@ -727,7 +728,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen
 
     KernelOperation *blockedCommandsData = nullptr;
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -759,7 +760,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfo) {
 
     MockMultiDispatchInfo multiDispatchInfo(std::vector<Kernel *>({&kernel1, &kernel2}));
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -800,7 +801,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg
     indirectHeap.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
     auto dshBeforeMultiDisptach = indirectHeap.getUsed();
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -884,7 +885,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg
     // create commandStream
     auto &cmdStream = pCmdQ->getCS(0);
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -929,7 +930,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoAndDifferentK
     // create commandStream
     auto &cmdStream = pCmdQ->getCS(0);
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -979,7 +980,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoButSameKernel
     // create commandStream
     auto &cmdStream = pCmdQ->getCS(0);
 
-    dispatchWalker<FamilyType>(
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
         *pCmdQ,
         multiDispatchInfo,
         0,
@@ -1030,7 +1031,7 @@ HWTEST_F(DispatchWalkerTest, givenMultiDispatchWhenWhitelistedRegisterForCoheren
     DispatchInfo di2(&kernel, 1, Vec3<size_t>(1, 1, 1), Vec3<size_t>(1, 1, 1), Vec3<size_t>(0, 0, 0));
     MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
 
-    dispatchWalker<FamilyType>(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false);
+    GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false);
 
     hwParser.parseCommands<FamilyType>(cmdStream, 0);
 
@@ -1056,3 +1057,15 @@ TEST(DispatchWalker, calculateDispatchDim) {
         }
     }
 }
+
+HWTEST_F(DispatchWalkerTest, WhenCallingDefaultWaMethodsThenExpectNothing) {
+    auto &cmdStream = pCmdQ->getCS(0);
+    MockKernel kernel(&program, kernelInfo, *pDevice);
+    EXPECT_EQ(CL_SUCCESS, kernel.initialize());
+
+    // Default WA methods are expected to be no-ops: nothing written to the stream, no size reported
+    auto usedBefore = cmdStream.getUsed();
+    GpgpuWalkerHelper<GENX>::applyWADisableLSQCROPERFforOCL(&cmdStream, kernel, false);
+    EXPECT_EQ(usedBefore, cmdStream.getUsed());
+    EXPECT_EQ(0u, GpgpuWalkerHelper<GENX>::getSizeForWADisableLSQCROPERFforOCL(&kernel));
+}
diff --git a/unit_tests/command_queue/get_size_required_buffer_tests.cpp b/unit_tests/command_queue/get_size_required_buffer_tests.cpp
index beaf35e03e..57d3da77fa 100644
--- a/unit_tests/command_queue/get_size_required_buffer_tests.cpp
+++ b/unit_tests/command_queue/get_size_required_buffer_tests.cpp
@@ -20,7 +20,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/enqueue_fill_buffer.h"
 #include "runtime/command_queue/enqueue_kernel.h"
 #include "runtime/command_queue/enqueue_read_buffer.h"
@@ -43,8 +43,8 @@ struct GetSizeRequiredBufferTest : public CommandEnqueueFixture,
                                    public HelloWorldKernelFixture,
                                    public ::testing::Test {
 
-    using SimpleArgKernelFixture::SetUp;
     using HelloWorldKernelFixture::SetUp;
+    using SimpleArgKernelFixture::SetUp;
 
     GetSizeRequiredBufferTest() {
     }
diff --git a/unit_tests/command_queue/get_size_required_image_tests.cpp b/unit_tests/command_queue/get_size_required_image_tests.cpp
index a95762630e..56185df77f 100644
--- a/unit_tests/command_queue/get_size_required_image_tests.cpp
+++ b/unit_tests/command_queue/get_size_required_image_tests.cpp
@@ -22,7 +22,7 @@
 
 #include "runtime/built_ins/built_ins.h"
 #include "runtime/command_queue/command_queue_hw.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/enqueue_copy_image.h"
 #include "runtime/command_queue/enqueue_fill_image.h"
 #include "runtime/command_queue/enqueue_read_image.h"
diff --git a/unit_tests/command_queue/local_work_size_tests.cpp b/unit_tests/command_queue/local_work_size_tests.cpp
index 3fea0011e0..ecccbda145 100644
--- a/unit_tests/command_queue/local_work_size_tests.cpp
+++ b/unit_tests/command_queue/local_work_size_tests.cpp
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2017, Intel Corporation
+* Copyright (c) 2017 - 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -20,7 +20,7 @@
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/helpers/options.h"
 #include "unit_tests/mocks/mock_kernel.h"
 #include "unit_tests/mocks/mock_device.h"
diff --git a/unit_tests/command_queue/work_group_size_tests.cpp b/unit_tests/command_queue/work_group_size_tests.cpp
index 0bf8ef3859..6e56c78a3c 100644
--- a/unit_tests/command_queue/work_group_size_tests.cpp
+++ b/unit_tests/command_queue/work_group_size_tests.cpp
@@ -21,7 +21,7 @@
  */
 
 #include "hw_cmds.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "unit_tests/fixtures/device_fixture.h"
 #include "unit_tests/helpers/debug_manager_state_restore.h"
 #include "test.h"
@@ -109,7 +109,7 @@ struct WorkGroupSizeBase : public DeviceFixture {
             (workItems[0] + workGroupSize[0] - 1) / workGroupSize[0],
             (workItems[1] + workGroupSize[1] - 1) / workGroupSize[1],
             (workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]};
-        setGpgpuWalkerThreadData<FamilyType>(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize);
+        GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize);
 
         //And check if it is programmed correctly
         auto numWorkItems = computeWalkerWorkItems<FamilyType>(pCmd);
diff --git a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp
index 71d21d8e1a..014f7346b1 100644
--- a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp
+++ b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp
@@ -52,7 +52,7 @@
 #include "gtest/gtest.h"
 #include "runtime/utilities/linux/debug_env_reader.h"
 #include "runtime/gmm_helper/gmm_helper.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 
 using namespace OCLRT;
 
diff --git a/unit_tests/context/driver_diagnostics_tests.h b/unit_tests/context/driver_diagnostics_tests.h
index ff34b9cead..5b9b8f190c 100644
--- a/unit_tests/context/driver_diagnostics_tests.h
+++ b/unit_tests/context/driver_diagnostics_tests.h
@@ -21,7 +21,7 @@
  */
 
 #pragma once
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/context/context.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/options.h"
diff --git a/unit_tests/device_queue/device_queue_hw_tests.cpp b/unit_tests/device_queue/device_queue_hw_tests.cpp
index 6676ed6924..b2e2711d39 100644
--- a/unit_tests/device_queue/device_queue_hw_tests.cpp
+++ b/unit_tests/device_queue/device_queue_hw_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -31,7 +31,7 @@
 #include "unit_tests/mocks/mock_kernel.h"
 #include "unit_tests/helpers/debug_manager_state_restore.h"
 
-#include "runtime/command_queue/dispatch_walker_helper.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/helpers/kernel_commands.h"
 
 #include <memory>
@@ -330,7 +330,7 @@ HWTEST_F(DeviceQueueSlb, cleanupSection) {
 
     if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
 
-        cleanupSectionOffsetToParse += getSizeForWADisableLSQCROPERFforOCL<FamilyType>(mockParentKernel) / 2;
+        cleanupSectionOffsetToParse += GpgpuWalkerHelper<FamilyType>::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) / 2;
     }
 
     hwParser.parseCommands<FamilyType>(*slbCS, cleanupSectionOffsetToParse);
@@ -394,7 +394,7 @@ HWTEST_F(DeviceQueueSlb, AddEMCleanupSectionWithProfiling) {
 
     auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
 
-    if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && getSizeForWADisableLSQCROPERFforOCL<FamilyType>(mockParentKernel) > 0) {
+    if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && GpgpuWalkerHelper<FamilyType>::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) > 0) {
         auto loadRegImmItor = find<MI_LOAD_REGISTER_IMM *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
         EXPECT_NE(hwParser.cmdList.end(), loadRegImmItor);
 
diff --git a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
index b1f19f7e65..bc717aab42 100644
--- a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,7 +20,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/local_id_gen.h"
 #include "runtime/device_queue/device_queue_hw.h"
 #include "runtime/helpers/per_thread_data.h"
diff --git a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp
index 8c7f783c3b..62be275e86 100644
--- a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp
@@ -53,19 +53,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDev
 
         size_t executionModelDSHUsedBefore = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed();
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *pKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   false);
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *pKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      false);
 
         size_t dshUsedAfter = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE).getUsed();
         EXPECT_EQ(0u, dshUsedAfter);
@@ -109,19 +109,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDef
 
         auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *pKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   false);
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *pKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      false);
 
         auto iohUsed = ioh.getUsed();
         EXPECT_EQ(0u, iohUsed);
@@ -136,19 +136,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenSSH
 
         MockMultiDispatchInfo multiDispatchInfo(pKernel);
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *pKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   false);
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *pKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      false);
 
         auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE);
 
@@ -172,19 +172,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz
 
         MockMultiDispatchInfo multiDispatchInfo(pKernel);
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *pKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   true); // blockQueue
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *pKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      true); // blockQueue
         ASSERT_NE(nullptr, blockedCommandsData);
 
         size_t minRequiredSize = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@@ -269,19 +269,19 @@ HWTEST_F(MockParentKernelDispatch, GivenBlockedQueueWhenParentKernelIsDispatched
         const size_t globalOffsets[3] = {0, 0, 0};
         const size_t workItems[3] = {1, 1, 1};
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *mockParentKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   true); // blockQueue
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *mockParentKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      true); // blockQueue
 
         ASSERT_NE(nullptr, blockedCommandsData);
 
@@ -302,19 +302,19 @@ HWTEST_F(MockParentKernelDispatch, GivenParentKernelWhenDispatchedThenMediaInter
         const size_t globalOffsets[3] = {0, 0, 0};
         const size_t workItems[3] = {1, 1, 1};
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *mockParentKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   false); // blockQueue
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *mockParentKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      false); // blockQueue
 
         LinearStream *commandStream = &pCmdQ->getCS(0);
 
@@ -358,19 +358,19 @@ HWTEST_F(MockParentKernelDispatch, GivenUsedSSHHeapWhenParentKernelIsDispatchedT
         // If parent is not using SSH, then heap obtained has zero usage and the same buffer
         ASSERT_EQ(0u, mockParentKernel->getKernelInfo().heapInfo.pKernelHeader->SurfaceStateHeapSize);
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *mockParentKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   false); // blockQueue
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *mockParentKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      false); // blockQueue
 
         EXPECT_EQ(0u, ssh.getUsed());
 
@@ -393,19 +393,19 @@ HWTEST_F(MockParentKernelDispatch, GivenNotUsedSSHHeapWhenParentKernelIsDispatch
 
         auto *bufferMemory = ssh.getCpuBase();
 
-        dispatchWalker<FamilyType>(*pCmdQ,
-                                   *mockParentKernel,
-                                   1,
-                                   globalOffsets,
-                                   workItems,
-                                   nullptr,
-                                   0,
-                                   nullptr,
-                                   &blockedCommandsData,
-                                   nullptr,
-                                   nullptr,
-                                   pDevice->getPreemptionMode(),
-                                   false); // blockQueue
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *mockParentKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      pDevice->getPreemptionMode(),
+                                                      false); // blockQueue
 
         EXPECT_EQ(bufferMemory, ssh.getCpuBase());
 
diff --git a/unit_tests/execution_model/scheduler_dispatch_tests.cpp b/unit_tests/execution_model/scheduler_dispatch_tests.cpp
index c5ddb59834..adb9b36ace 100644
--- a/unit_tests/execution_model/scheduler_dispatch_tests.cpp
+++ b/unit_tests/execution_model/scheduler_dispatch_tests.cpp
@@ -72,7 +72,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) {
         LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
         pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
 
-        dispatchScheduler<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
             *pCmdQ,
             *pDevQueueHw,
             pDevice->getPreemptionMode(),
@@ -188,7 +188,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQ
         getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
         pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
 
-        dispatchScheduler<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
             *pCmdQ,
             *pDevQueueHw,
             pDevice->getPreemptionMode(),
@@ -219,7 +219,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToF
         LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
         pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
 
-        dispatchScheduler<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
             *pCmdQ,
             mockDevQueue,
             device->getPreemptionMode(),
diff --git a/unit_tests/gen8/scheduler_dispatch_tests.cpp b/unit_tests/gen8/scheduler_dispatch_tests.cpp
index e4256d1c17..d27d76ed84 100644
--- a/unit_tests/gen8/scheduler_dispatch_tests.cpp
+++ b/unit_tests/gen8/scheduler_dispatch_tests.cpp
@@ -22,7 +22,7 @@
 
 #include "runtime/built_ins/built_ins.h"
 #include "runtime/command_queue/enqueue_kernel.h"
-#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/device_queue/device_queue_hw.h"
 #include "runtime/helpers/kernel_commands.h"
@@ -51,7 +51,7 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
         LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
         pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
 
-        dispatchScheduler<FamilyType>(
+        GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
             *pCmdQ,
             *pDevQueueHw,
             pDevice->getPreemptionMode(),
diff --git a/unit_tests/gen9/test_device_queue_hw.cpp b/unit_tests/gen9/test_device_queue_hw.cpp
index 64b7b636fd..4f9365a122 100644
--- a/unit_tests/gen9/test_device_queue_hw.cpp
+++ b/unit_tests/gen9/test_device_queue_hw.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (c) 2017 - 2018, Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -21,7 +21,7 @@
  */
 
 #include "runtime/context/context.h"
-#include "runtime/command_queue/dispatch_walker_helper.h"
+#include "runtime/command_queue/gpgpu_walker.h"
 #include "unit_tests/fixtures/device_host_queue_fixture.h"
 #include "unit_tests/helpers/hw_parse.h"
 #include "unit_tests/mocks/mock_device_queue.h"
diff --git a/unit_tests/libult/mock_gfx_family.cpp b/unit_tests/libult/mock_gfx_family.cpp
index 4f0a834749..f31c39eecb 100644
--- a/unit_tests/libult/mock_gfx_family.cpp
+++ b/unit_tests/libult/mock_gfx_family.cpp
@@ -21,12 +21,23 @@
  */
 
 #include "unit_tests/libult/mock_gfx_family.h"
+#include "runtime/command_queue/gpgpu_walker.inl"
+#include "runtime/command_stream/preemption.inl"
+#include "runtime/device_queue/device_queue_hw.h"
+#include "runtime/device_queue/device_queue_hw.inl"
 #include "runtime/helpers/hw_helper.inl"
+#include "runtime/helpers/kernel_commands.inl"
+#include "runtime/helpers/preamble.inl"
 
 namespace OCLRT {
 
 bool (*GENX::isSimulationFcn)(unsigned short) = nullptr;
 
+GENX::GPGPU_WALKER GENX::cmdInitGpgpuWalker = GENX::GPGPU_WALKER::sInit(); // namespace-scope definitions of GENX's static cmdInit* members start here
+GENX::INTERFACE_DESCRIPTOR_DATA GENX::cmdInitInterfaceDescriptorData = GENX::INTERFACE_DESCRIPTOR_DATA::sInit(); // each one is initialised from its own type's sInit()
+GENX::MEDIA_STATE_FLUSH GENX::cmdInitMediaStateFlush = GENX::MEDIA_STATE_FLUSH::sInit();
+GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD GENX::cmdInitMediaInterfaceDescriptorLoad = GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD::sInit();
+
 template <>
 size_t HwHelperHw<GENX>::getMaxBarrierRegisterPerSlice() const {
     return 32;
@@ -57,4 +68,89 @@ struct hw_helper_static_init {
 template class HwHelperHw<GENX>;
 
 hw_helper_static_init si;
+
+template class GpgpuWalkerHelper<GENX>; // explicit instantiation for GENX; NOTE(review): the member bodies presumably come from the gpgpu_walker.inl include at the top of this file - confirm
+
+template <> // explicit specialization for GENX, the family struct from mock_gfx_family.h
+bool KernelCommandsHelper<GENX>::isPipeControlWArequired() {
+    return false; // GENX always answers false
+}
+
+template struct KernelCommandsHelper<GENX>; // explicit instantiation; kept after the specialization above so that member is not instantiated from the generic body
+
+template <> // GENX specializations of PreemptionHelper member templates follow
+size_t PreemptionHelper::getRequiredCmdStreamSize<GENX>(PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode) {
+    return 0; // always 0, whatever the two modes are
+}
+
+template <>
+void PreemptionHelper::programCmdStream<GENX>(LinearStream &cmdStream, PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode,
+                                              GraphicsAllocation *preemptionCsr, Device &device) { // empty body: no argument is used
+}
+
+template <>
+size_t PreemptionHelper::getRequiredPreambleSize<GENX>(const Device &device) {
+    return 0; // always 0; device is not inspected
+}
+
+template <>
+void PreemptionHelper::programPreamble<GENX>(LinearStream &preambleCmdStream, Device &device,
+                                             const GraphicsAllocation *preemptionCsr) { // empty body: no argument is used
+}
+
+template <>
+size_t PreemptionHelper::getPreemptionWaCsSize<GENX>(const Device &device) {
+    return 0; // always 0; device is not inspected
+}
+
+template void PreemptionHelper::programInterfaceDescriptorDataPreemption<GENX>(INTERFACE_DESCRIPTOR_DATA<GENX> *idd, PreemptionMode preemptionMode); // explicit instantiation, not a specialization: this member keeps the generic template body
+
+template <> // GENX specializations of DeviceQueueHw members follow
+size_t DeviceQueueHw<GENX>::getWaCommandsSize() {
+    return (size_t)0; // always 0
+}
+
+template <>
+void DeviceQueueHw<GENX>::addArbCheckCmdWa() { // empty body
+}
+
+template <>
+void DeviceQueueHw<GENX>::addMiAtomicCmdWa(uint64_t atomicOpPlaceholder) { // empty body: argument is ignored
+}
+
+template <>
+void DeviceQueueHw<GENX>::addLriCmdWa(bool setArbCheck) { // empty body: argument is ignored
+}
+
+template <>
+void DeviceQueueHw<GENX>::addPipeControlCmdWa(bool isNoopCmd) { // empty body: argument is ignored
+}
+
+template <>
+void DeviceQueueHw<GENX>::addProfilingEndCmds(uint64_t timestampAddress) { // empty body: argument is ignored
+}
+
+template class DeviceQueueHw<GENX>; // explicit instantiation; kept after the specializations above so those members are not instantiated from the generic bodies
+
+template <> // GENX specializations of PreambleHelper members follow
+void PreambleHelper<GENX>::addPipeControlBeforeVfeCmd(LinearStream *pCommandStream, const HardwareInfo *hwInfo) { // empty body: both arguments are ignored
+}
+
+template <>
+uint32_t PreambleHelper<GENX>::getL3Config(const HardwareInfo &hwInfo, bool useSLM) {
+    uint32_t l3Config = 0; // neither hwInfo nor useSLM influences the result
+    return l3Config; // always 0
+}
+
+template <>
+void PreambleHelper<GENX>::programPipelineSelect(LinearStream *pCommandStream, bool mediaSamplerRequired) { // empty body: both arguments are ignored
+}
+
+template <>
+struct L3CNTLRegisterOffset<GENX> { // specialization of a separate template; supplies the registerOffset constant for GENX
+    static const uint32_t registerOffset = 0x7034;
+};
+
+template struct PreambleHelper<GENX>; // explicit instantiation; kept after the specializations above so those members are not instantiated from the generic bodies
+
 } // namespace OCLRT
diff --git a/unit_tests/libult/mock_gfx_family.h b/unit_tests/libult/mock_gfx_family.h
index 7b723bd687..2bd8049279 100644
--- a/unit_tests/libult/mock_gfx_family.h
+++ b/unit_tests/libult/mock_gfx_family.h
@@ -31,9 +31,71 @@ extern HwHelper *hwHelperFactory[IGFX_MAX_CORE];
 struct GENX {
     static bool (*isSimulationFcn)(unsigned short);
     typedef struct tagINTERFACE_DESCRIPTOR_DATA {
+        typedef enum tagDENORM_MODE { // argument type of setDenormMode() below
+            DENORM_MODE_FTZ = 0x0,
+            DENORM_MODE_SETBYKERNEL = 0x1,
+        } DENORM_MODE;
+        typedef enum tagSAMPLERSTATEPOINTER {
+            SAMPLERSTATEPOINTER_BIT_SHIFT = 0x5,
+            SAMPLERSTATEPOINTER_ALIGN_SIZE = 0x20, // equals 1 << SAMPLERSTATEPOINTER_BIT_SHIFT
+        } SAMPLERSTATEPOINTER;
+        typedef enum tagSAMPLER_COUNT { // argument type of setSamplerCount() below
+            SAMPLER_COUNT_NO_SAMPLERS_USED = 0x0,
+            SAMPLER_COUNT_BETWEEN_1_AND_4_SAMPLERS_USED = 0x1,
+            SAMPLER_COUNT_BETWEEN_5_AND_8_SAMPLERS_USED = 0x2,
+            SAMPLER_COUNT_BETWEEN_9_AND_12_SAMPLERS_USED = 0x3,
+            SAMPLER_COUNT_BETWEEN_13_AND_16_SAMPLERS_USED = 0x4,
+        } SAMPLER_COUNT;
+        typedef enum tagSHARED_LOCAL_MEMORY_SIZE { // argument type of setSharedLocalMemorySize() below
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K = 0x0,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K = 0x1,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K = 0x2,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K = 0x3,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K = 0x4,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K = 0x5,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K = 0x6,
+            SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K = 0x7,
+        } SHARED_LOCAL_MEMORY_SIZE;
+        typedef enum tagBINDINGTABLEPOINTER {
+            BINDINGTABLEPOINTER_BIT_SHIFT = 0x5,
+            BINDINGTABLEPOINTER_ALIGN_SIZE = 0x20, // equals 1 << BINDINGTABLEPOINTER_BIT_SHIFT
+        } BINDINGTABLEPOINTER;
+        static tagINTERFACE_DESCRIPTOR_DATA sInit(void) { // returns a default-constructed instance; this struct declares no data members
+            INTERFACE_DESCRIPTOR_DATA state;
+            return state;
+        }
+        inline void setKernelStartPointerHigh(const uint32_t value) { // this setter and every one below has an empty body: the argument is ignored
+        }
+        inline void setKernelStartPointer(const uint64_t value) {
+        }
+        inline void setNumberOfThreadsInGpgpuThreadGroup(const uint32_t value) {
+        }
+        inline void setCrossThreadConstantDataReadLength(const uint32_t value) {
+        }
+        inline void setDenormMode(const DENORM_MODE value) {
+        }
+        inline void setConstantIndirectUrbEntryReadLength(const uint32_t value) {
+        }
+        inline void setBindingTablePointer(const uint64_t value) {
+        }
+        inline void setSamplerStatePointer(const uint64_t value) {
+        }
+        inline void setSamplerCount(const SAMPLER_COUNT value) {
+        }
+        inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
+        }
+        inline void setBarrierEnable(const bool value) {
+        }
     } INTERFACE_DESCRIPTOR_DATA;
 
     typedef struct tagBINDING_TABLE_STATE {
+        inline void init(void) {
+        }
+        inline uint32_t getSurfaceStatePointer(void) const {
+            return 0u;
+        }
+        inline void setSurfaceStatePointer(const uint64_t value) {
+        }
         inline uint32_t getRawData(const uint32_t index) {
             return 0;
         }
@@ -42,6 +104,247 @@ struct GENX {
             SURFACESTATEPOINTER_ALIGN_SIZE = 0x40,
         } SURFACESTATEPOINTER;
     } BINDING_TABLE_STATE;
+
+    typedef struct tagGPGPU_WALKER { // GENX's GPGPU_WALKER type: two enums, sInit() and setters with empty bodies; no data members
+        typedef enum tagSIMD_SIZE { // argument type of setSimdSize() below
+            SIMD_SIZE_SIMD8 = 0x0,
+            SIMD_SIZE_SIMD16 = 0x1,
+            SIMD_SIZE_SIMD32 = 0x2,
+        } SIMD_SIZE;
+        typedef enum tagINDIRECTDATASTARTADDRESS {
+            INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
+            INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40, // equals 1 << INDIRECTDATASTARTADDRESS_BIT_SHIFT
+        } INDIRECTDATASTARTADDRESS;
+        static tagGPGPU_WALKER sInit(void) { // returns a default-constructed instance
+            GPGPU_WALKER state;
+            return state;
+        }
+        inline void setThreadWidthCounterMaximum(const uint32_t value) { // this setter and every one below has an empty body: the argument is ignored
+        }
+        inline void setThreadGroupIdXDimension(const uint32_t value) {
+        }
+        inline void setThreadGroupIdYDimension(const uint32_t value) {
+        }
+        inline void setThreadGroupIdZDimension(const uint32_t value) {
+        }
+        inline void setRightExecutionMask(const uint32_t value) {
+        }
+        inline void setBottomExecutionMask(const uint32_t value) {
+        }
+        inline void setSimdSize(const SIMD_SIZE value) {
+        }
+        inline void setThreadGroupIdStartingX(const uint32_t value) {
+        }
+        inline void setThreadGroupIdStartingY(const uint32_t value) {
+        }
+        inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
+        }
+        inline void setIndirectDataStartAddress(const uint32_t value) {
+        }
+        inline void setInterfaceDescriptorOffset(const uint32_t value) {
+        }
+        inline void setIndirectDataLength(const uint32_t value) {
+        }
+    } GPGPU_WALKER;
+
+    typedef struct tagPIPE_CONTROL { // GENX's PIPE_CONTROL type: one enum, sInit() and setters with empty bodies; no data members
+        typedef enum tagPOST_SYNC_OPERATION { // argument type of setPostSyncOperation() below
+            POST_SYNC_OPERATION_NO_WRITE = 0x0,
+            POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA = 0x1,
+            POST_SYNC_OPERATION_WRITE_PS_DEPTH_COUNT = 0x2,
+            POST_SYNC_OPERATION_WRITE_TIMESTAMP = 0x3,
+        } POST_SYNC_OPERATION;
+        static tagPIPE_CONTROL sInit(void) { // returns a default-constructed instance
+            PIPE_CONTROL state;
+            return state;
+        }
+        inline void setCommandStreamerStallEnable(const uint32_t value) { // this setter and every one below has an empty body: the argument is ignored
+        }
+        inline void setDcFlushEnable(const bool value) {
+        }
+        inline void setStateCacheInvalidationEnable(const bool value) {
+        }
+        inline void setPipeControlFlushEnable(const bool value) {
+        }
+        inline void setTextureCacheInvalidationEnable(const bool value) {
+        }
+        inline void setPostSyncOperation(const POST_SYNC_OPERATION value) {
+        }
+        inline void setAddress(const uint32_t value) {
+        }
+        inline void setAddressHigh(const uint32_t value) {
+        }
+        inline void setImmediateData(const uint64_t value) {
+        }
+        inline void setGenericMediaStateClear(const bool value) {
+        }
+    } PIPE_CONTROL;
+
+    typedef struct tagMI_LOAD_REGISTER_IMM { // GENX's MI_LOAD_REGISTER_IMM type: sInit() plus two setters with empty bodies; no data members
+        static tagMI_LOAD_REGISTER_IMM sInit(void) { // returns a default-constructed instance
+            MI_LOAD_REGISTER_IMM state;
+            return state;
+        }
+        inline void setRegisterOffset(const uint32_t value) { // empty body: the argument is ignored
+        }
+        inline void setDataDword(const uint32_t value) { // empty body: the argument is ignored
+        }
+    } MI_LOAD_REGISTER_IMM;
+
+    typedef struct tagMI_LOAD_REGISTER_REG { // GENX's MI_LOAD_REGISTER_REG type: sInit() plus two setters with empty bodies; no data members
+        static tagMI_LOAD_REGISTER_REG sInit(void) { // returns a default-constructed instance
+            MI_LOAD_REGISTER_REG state;
+            return state;
+        }
+        inline void setSourceRegisterAddress(const uint32_t value) { // empty body: the argument is ignored
+        }
+        inline void setDestinationRegisterAddress(const uint32_t value) { // empty body: the argument is ignored
+        }
+    } MI_LOAD_REGISTER_REG;
+
+    typedef struct tagMI_MATH { // unlike its sibling types, this one carries data (member DW0) and has no sInit() or setters
+        union _DW0 { // BitField and Value are two views of the same storage
+            struct _BitField {
+                uint32_t DwordLength : BITFIELD_RANGE(0, 5); // NOTE(review): BITFIELD_RANGE is defined elsewhere; presumably it yields the width of the inclusive bit range - confirm
+                uint32_t Reserved : BITFIELD_RANGE(6, 22);
+                uint32_t InstructionOpcode : BITFIELD_RANGE(23, 28);
+                uint32_t InstructionType : BITFIELD_RANGE(29, 31); // the four ranges listed cover bits 0 through 31 without gaps
+            } BitField;
+            uint32_t Value; // whole-dword member of the union
+        } DW0;
+        typedef enum tagMI_COMMAND_OPCODE {
+            MI_COMMAND_OPCODE_MI_MATH = 0x0,
+        } MI_COMMAND_OPCODE;
+        typedef enum tagCOMMAND_TYPE {
+            COMMAND_TYPE_MI_COMMAND = 0x0,
+        } COMMAND_TYPE;
+    } MI_MATH;
+
+    typedef struct tagMI_MATH_ALU_INST_INLINE { // a single data member, DW0; no sInit() or setters
+        union _DW0 { // BitField and Value are two views of the same storage
+            struct _BitField {
+                uint32_t Operand2 : BITFIELD_RANGE(0, 9); // NOTE(review): BITFIELD_RANGE is defined elsewhere; presumably it yields the width of the inclusive bit range - confirm
+                uint32_t Operand1 : BITFIELD_RANGE(10, 19);
+                uint32_t ALUOpcode : BITFIELD_RANGE(20, 31); // the three ranges listed cover bits 0 through 31 without gaps
+            } BitField;
+            uint32_t Value; // whole-dword member of the union
+        } DW0;
+    } MI_MATH_ALU_INST_INLINE;
+
+    typedef struct tagMI_COMMAND_OPCODE_MI_MATH { // empty type: declares no members
+    } MI_COMMAND_OPCODE_MI_MATH;
+
+    typedef struct tagMI_STORE_REGISTER_MEM { // GENX's MI_STORE_REGISTER_MEM type: sInit() plus two setters with empty bodies; no data members
+        static tagMI_STORE_REGISTER_MEM sInit(void) { // returns a default-constructed instance
+            MI_STORE_REGISTER_MEM state;
+            return state;
+        }
+        inline void setRegisterAddress(const uint32_t value) { // empty body: the argument is ignored
+        }
+        inline void setMemoryAddress(const uint64_t value) { // empty body: the argument is ignored
+        }
+    } MI_STORE_REGISTER_MEM;
+
+    typedef struct tagMI_REPORT_PERF_COUNT { // GENX's MI_REPORT_PERF_COUNT type: sInit() plus two setters with empty bodies; no data members
+        static tagMI_REPORT_PERF_COUNT sInit(void) { // returns a default-constructed instance
+            MI_REPORT_PERF_COUNT state;
+            return state;
+        }
+        inline void setReportId(const uint32_t value) { // empty body: the argument is ignored
+        }
+        inline void setMemoryAddress(const uint64_t value) { // empty body: the argument is ignored
+        }
+    } MI_REPORT_PERF_COUNT;
+
+    typedef struct tagMI_BATCH_BUFFER_START { // GENX's MI_BATCH_BUFFER_START type: one enum, sInit() and two setters with empty bodies; no data members
+        typedef enum tagSECOND_LEVEL_BATCH_BUFFER { // argument type of setSecondLevelBatchBuffer() below
+            SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH = 0x0,
+            SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH = 0x1,
+        } SECOND_LEVEL_BATCH_BUFFER;
+        static tagMI_BATCH_BUFFER_START sInit(void) { // returns a default-constructed instance
+            MI_BATCH_BUFFER_START state;
+            return state;
+        }
+        inline void setSecondLevelBatchBuffer(const SECOND_LEVEL_BATCH_BUFFER value) { // empty body: the argument is ignored
+        }
+        inline void setBatchBufferStartAddressGraphicsaddress472(const uint64_t value) { // empty body: the argument is ignored
+        }
+    } MI_BATCH_BUFFER_START;
+
+    typedef struct tagMEDIA_STATE_FLUSH { // GENX's MEDIA_STATE_FLUSH type: sInit() plus one setter with an empty body; no data members
+        static tagMEDIA_STATE_FLUSH sInit(void) { // returns a default-constructed instance
+            MEDIA_STATE_FLUSH state;
+            return state;
+        }
+        inline void setInterfaceDescriptorOffset(const uint32_t value) { // empty body: the argument is ignored
+        }
+    } MEDIA_STATE_FLUSH;
+
+    typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD { // Stub command type: no data members, setters have empty bodies.
+        static tagMEDIA_INTERFACE_DESCRIPTOR_LOAD sInit(void) { // Returns a default-constructed instance by value.
+            MEDIA_INTERFACE_DESCRIPTOR_LOAD state;
+            return state;
+        }
+        inline void setInterfaceDescriptorDataStartAddress(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setInterfaceDescriptorTotalLength(const uint32_t value) { // No-op: value is discarded.
+        }
+    } MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+
+    typedef struct tagMI_BATCH_BUFFER_END { // Stub command type: no data members and no setters.
+        static tagMI_BATCH_BUFFER_END sInit(void) { // Returns a default-constructed instance by value.
+            MI_BATCH_BUFFER_END state;
+            return state;
+        }
+    } MI_BATCH_BUFFER_END;
+
+    typedef struct tagRENDER_SURFACE_STATE { // Empty placeholder type: declares no members.
+    } RENDER_SURFACE_STATE;
+
+    typedef struct tagMEDIA_VFE_STATE { // Stub command type: no data members, every setter has an empty body.
+        static tagMEDIA_VFE_STATE sInit(void) { // Returns a default-constructed instance by value.
+            MEDIA_VFE_STATE state;
+            return state;
+        }
+        inline void setMaximumNumberOfThreads(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setNumberOfUrbEntries(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setUrbEntryAllocationSize(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setPerThreadScratchSpace(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setStackSize(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setScratchSpaceBasePointer(const uint32_t value) { // No-op: value is discarded.
+        }
+        inline void setScratchSpaceBasePointerHigh(const uint32_t value) { // No-op: value is discarded.
+        }
+    } MEDIA_VFE_STATE;
+
+    typedef struct tagSAMPLER_STATE { // Stub state type: no data members and no sInit(), unlike the command stubs above.
+        inline void setIndirectStatePointer(const uint32_t indirectStatePointerValue) { // No-op: argument is discarded.
+        }
+    } SAMPLER_STATE;
+
+    typedef struct tagGPGPU_CSR_BASE_ADDRESS { // Stub command type: no data members; exposes init() instead of a static sInit().
+        inline void init(void) { // No-op: there is no state to initialize.
+        }
+        inline void setGpgpuCsrBaseAddress(uint64_t value) { // No-op: value is discarded.
+        }
+    } GPGPU_CSR_BASE_ADDRESS;
+
+    typedef struct tagSTATE_SIP { // Stub command type: no data members; exposes init() instead of a static sInit().
+        inline void init(void) { // No-op: there is no state to initialize.
+        }
+        inline void setSystemInstructionPointer(uint64_t value) { // No-op: value is discarded.
+        }
+    } STATE_SIP;
+
+    static GPGPU_WALKER cmdInitGpgpuWalker; // Static data member declarations; their out-of-class definitions are not in this chunk.
+    static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData; // Type is declared outside the visible chunk.
+    static MEDIA_STATE_FLUSH cmdInitMediaStateFlush; // Instance of the MEDIA_STATE_FLUSH stub declared in this struct.
+    static MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad; // Instance of the MEDIA_INTERFACE_DESCRIPTOR_LOAD stub declared in this struct.
 };
 
 } // namespace OCLRT