Extract GpgpuWalker related functions to bdw_plus files

Change-Id: I3b2081af8e350d4072da5e1482a4bfc50e06fb6d
Related-To: NEO-3016
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
2025-12-30 01:35:20 +08:00 · 2019-05-13 14:15:03 +02:00
parent 7218bdb849
commit 608ec933da
12 changed files with 881 additions and 897 deletions
--- a/runtime/command_queue/CMakeLists.txt
+++ b/runtime/command_queue/CMakeLists.txt
@@ -35,11 +35,11 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
  ${CMAKE_CURRENT_SOURCE_DIR}/finish.h
  ${CMAKE_CURRENT_SOURCE_DIR}/flush.h
  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_base.inl
+  ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_bdw_plus.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl
+  ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_bdw_plus.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
  ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -1,393 +0,0 @@
-/*
- * Copyright (C) 2017-2019 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#pragma once
-#include "runtime/command_queue/command_queue.h"
-#include "runtime/command_queue/gpgpu_walker.h"
-#include "runtime/command_queue/local_id_gen.h"
-#include "runtime/command_stream/command_stream_receiver.h"
-#include "runtime/device/device_info.h"
-#include "runtime/event/perf_counter.h"
-#include "runtime/event/user_event.h"
-#include "runtime/helpers/aligned_memory.h"
-#include "runtime/helpers/debug_helpers.h"
-#include "runtime/helpers/hw_helper.h"
-#include "runtime/helpers/kernel_commands.h"
-#include "runtime/helpers/queue_helpers.h"
-#include "runtime/helpers/validators.h"
-#include "runtime/indirect_heap/indirect_heap.h"
-#include "runtime/mem_obj/mem_obj.h"
-#include "runtime/memory_manager/graphics_allocation.h"
-#include "runtime/utilities/tag_allocator.h"
-
-#include "instrumentation.h"
-
-#include <algorithm>
-#include <cmath>
-
-namespace NEO {
-
-// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
-    NEO::LinearStream *pCommandStream,
-    uint32_t aluRegister,
-    uint32_t operation,
-    uint32_t mask) {
-    // Load "Register" value into CS_GPR_R0
-    typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
-    typedef typename GfxFamily::MI_MATH MI_MATH;
-    typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
-    auto pCmd = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
-    *pCmd = GfxFamily::cmdInitLoadRegisterReg;
-    pCmd->setSourceRegisterAddress(aluRegister);
-    pCmd->setDestinationRegisterAddress(CS_GPR_R0);
-
-    // Load "Mask" into CS_GPR_R1
-    typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
-    auto pCmd2 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
-    *pCmd2 = GfxFamily::cmdInitLoadRegisterImm;
-    pCmd2->setRegisterOffset(CS_GPR_R1);
-    pCmd2->setDataDword(mask);
-
-    // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
-    auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
-    // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
-    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
-    pCmd3++;
-    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
-
-    // Setup first operand of MI_MATH - load CS_GPR_R0 into register A
-    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
-    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
-    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
-    pAluParam++;
-
-    // Setup second operand of MI_MATH - load CS_GPR_R1 into register B
-    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
-    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
-    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
-    pAluParam++;
-
-    // Setup third operand of MI_MATH - "Operation" on registers A and B
-    pAluParam->DW0.BitField.ALUOpcode = operation;
-    pAluParam->DW0.BitField.Operand1 = 0;
-    pAluParam->DW0.BitField.Operand2 = 0;
-    pAluParam++;
-
-    // Setup fourth operand of MI_MATH - store result into CS_GPR_R0
-    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
-    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
-    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
-
-    // LOAD value of CS_GPR_R0 into "Register"
-    auto pCmd4 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
-    *pCmd4 = GfxFamily::cmdInitLoadRegisterReg;
-    pCmd4->setSourceRegisterAddress(CS_GPR_R0);
-    pCmd4->setDestinationRegisterAddress(aluRegister);
-
-    // Add PIPE_CONTROL to flush caches
-    auto pCmd5 = pCommandStream->getSpaceForCmd<PIPE_CONTROL>();
-    *pCmd5 = GfxFamily::cmdInitPipeControl;
-    pCmd5->setCommandStreamerStallEnable(true);
-    pCmd5->setDcFlushEnable(true);
-    pCmd5->setTextureCacheInvalidationEnable(true);
-    pCmd5->setPipeControlFlushEnable(true);
-    pCmd5->setStateCacheInvalidationEnable(true);
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
-    TagNode<HwTimeStamps> &hwTimeStamps,
-    LinearStream *commandStream) {
-
-    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
-
-    // PIPE_CONTROL for global timestamp
-    uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS);
-
-    PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, timeStampAddress, 0llu, false);
-
-    //MI_STORE_REGISTER_MEM for context local timestamp
-    timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS);
-
-    //low part
-    auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
-    *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
-    adjustMiStoreRegMemMode(pMICmdLow);
-    pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
-    pMICmdLow->setMemoryAddress(timeStampAddress);
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
-    TagNode<HwTimeStamps> &hwTimeStamps,
-    LinearStream *commandStream) {
-
-    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
-
-    // PIPE_CONTROL for global timestamp
-    auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
-    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
-    pPipeControlCmd->setCommandStreamerStallEnable(true);
-
-    //MI_STORE_REGISTER_MEM for context local timestamp
-    uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS);
-
-    //low part
-    auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
-    *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
-    adjustMiStoreRegMemMode(pMICmdLow);
-    pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
-    pMICmdLow->setMemoryAddress(timeStampAddress);
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(
-    LinearStream *commandStream,
-    uint64_t memoryAddress,
-    uint32_t registerAddress) {
-
-    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
-
-    auto pCmd = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
-    *pCmd = GfxFamily::cmdInitStoreRegisterMem;
-    pCmd->setRegisterAddress(registerAddress);
-    pCmd->setMemoryAddress(memoryAddress);
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
-    LinearStream *commandStream,
-    uint64_t baseAddress) {
-
-    // Read General Purpose counters
-    for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
-        uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
-        //Gp field is 2*uint64 wide so it can hold 4 uint32
-        uint64_t address = baseAddress + i * sizeof(cl_uint);
-        dispatchStoreRegisterCommand(commandStream, address, regAddr);
-    }
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
-    CommandQueue &commandQueue,
-    LinearStream *commandStream,
-    uint64_t baseAddress) {
-
-    auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs;
-
-    for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
-        uint32_t regAddr = userRegs->Reg[i].Offset;
-        //offset between base (low) registers is cl_ulong wide
-        uint64_t address = baseAddress + i * sizeof(cl_ulong);
-        dispatchStoreRegisterCommand(commandStream, address, regAddr);
-
-        if (userRegs->Reg[i].BitSize > 32) {
-            dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint));
-        }
-    }
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
-    TagNode<HwPerfCounter> &hwPerfCounter,
-    LinearStream *commandStream) {
-
-    dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
-    dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
-    dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
-    CommandQueue &commandQueue,
-    TagNode<HwPerfCounter> &hwPerfCounter,
-    LinearStream *commandStream) {
-
-    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
-
-    auto perfCounters = commandQueue.getPerfCounters();
-
-    uint32_t currentReportId = perfCounters->getCurrentReportId();
-    uint64_t address = 0;
-    //flush command streamer
-    auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
-    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
-    pPipeControlCmd->setCommandStreamerStallEnable(true);
-
-    //Store value of NOOPID register
-    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID);
-
-    //Read Core Frequency
-    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1);
-
-    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp));
-
-    auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
-    *pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
-    pReportPerfCount->setReportId(currentReportId);
-    address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa);
-    pReportPerfCount->setMemoryAddress(address);
-
-    address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS);
-
-    PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
-
-    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User));
-
-    commandQueue.sendPerfCountersConfig();
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
-    CommandQueue &commandQueue,
-    TagNode<HwPerfCounter> &hwPerfCounter,
-    LinearStream *commandStream) {
-
-    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
-
-    auto perfCounters = commandQueue.getPerfCounters();
-
-    uint32_t currentReportId = perfCounters->getCurrentReportId();
-
-    //flush command streamer
-    auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
-    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
-    pPipeControlCmd->setCommandStreamerStallEnable(true);
-
-    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream);
-
-    //Timestamp: Global End
-    uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS);
-    PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
-
-    auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
-    *pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
-    pReportPerfCount->setReportId(currentReportId);
-    address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa);
-    pReportPerfCount->setMemoryAddress(address);
-
-    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp));
-
-    //Store value of NOOPID register
-    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID);
-
-    //Read Core Frequency
-    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1);
-
-    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User));
-
-    perfCounters->setCpuTimestamp();
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
-}
-
-template <typename GfxFamily>
-size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
-    return (size_t)0;
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd) {
-}
-
-template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
-    size_t expectedSizeCS = 0;
-    Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
-    if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) {
-        expectedSizeCS += sizeof(PIPE_CONTROL);
-    }
-    for (auto &dispatchInfo : multiDispatchInfo) {
-        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel());
-        if (dispatchInfo.isPipeControlRequired()) {
-            expectedSizeCS += sizeof(PIPE_CONTROL);
-        }
-    }
-    if (parentKernel) {
-        SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext());
-        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, &scheduler);
-    }
-    if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
-        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
-        expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDeps);
-    }
-    return expectedSizeCS;
-}
-
-template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
-    if (isCommandWithoutKernel(cmdType)) {
-        return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
-    } else {
-        return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel);
-    }
-}
-
-template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
-    size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
-                  sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
-    size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U);
-    size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
-    if (reserveProfilingCmdsSpace) {
-        size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-    }
-    if (reservePerfCounters) {
-        //start cmds
-        //P_C: flush CS & TimeStamp BEGIN
-        size += 2 * sizeof(PIPE_CONTROL);
-        //SRM NOOPID & Frequency
-        size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        //gp registers
-        size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        //report perf count
-        size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
-        //user registers
-        size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-
-        //end cmds
-        //P_C: flush CS & TimeStamp END;
-        size += 2 * sizeof(PIPE_CONTROL);
-        //OA buffer (status head, tail)
-        size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        //report perf count
-        size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
-        //gp registers
-        size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        //SRM NOOPID & Frequency
-        size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-        //user registers
-        size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-    }
-    size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
-
-    return size;
-}
-
-template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
-    size_t size = 0;
-    if (reserveProfilingCmdsSpace) {
-        size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
-    }
-    return size;
-}
-
-template <typename GfxFamily>
-size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
-    return sizeof(PIPE_CONTROL);
-}
-
-} // namespace NEO
--- a/runtime/command_queue/gpgpu_walker_base.inl
+++ b/runtime/command_queue/gpgpu_walker_base.inl
@@ -1,183 +1,347 @@
 /*
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2017-2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

 #pragma once
+#include "runtime/command_queue/command_queue.h"
 #include "runtime/command_queue/gpgpu_walker.h"
+#include "runtime/command_queue/local_id_gen.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/device/device_info.h"
+#include "runtime/event/perf_counter.h"
+#include "runtime/event/user_event.h"
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/hw_helper.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/queue_helpers.h"
+#include "runtime/helpers/validators.h"
+#include "runtime/indirect_heap/indirect_heap.h"
+#include "runtime/mem_obj/mem_obj.h"
+#include "runtime/memory_manager/graphics_allocation.h"
+#include "runtime/utilities/tag_allocator.h"
+
+#include "instrumentation.h"
+
+#include <algorithm>
+#include <cmath>

 namespace NEO {

+// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
 template <typename GfxFamily>
-inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
-    WALKER_TYPE<GfxFamily> *walkerCmd,
-    const size_t globalOffsets[3],
-    const size_t startWorkGroups[3],
-    const size_t numWorkGroups[3],
-    const size_t localWorkSizesIn[3],
-    uint32_t simd,
-    uint32_t workDim,
-    bool localIdsGenerationByRuntime,
-    bool inlineDataProgrammingRequired,
-    const iOpenCL::SPatchThreadPayload &threadPayload) {
-    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
+void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
+    NEO::LinearStream *pCommandStream,
+    uint32_t aluRegister,
+    uint32_t operation,
+    uint32_t mask) {
+    // Load "Register" value into CS_GPR_R0
+    typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
+    typedef typename GfxFamily::MI_MATH MI_MATH;
+    typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
+    auto pCmd = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
+    *pCmd = GfxFamily::cmdInitLoadRegisterReg;
+    pCmd->setSourceRegisterAddress(aluRegister);
+    pCmd->setDestinationRegisterAddress(CS_GPR_R0);

-    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
-    walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
+    // Load "Mask" into CS_GPR_R1
+    typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
+    auto pCmd2 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
+    *pCmd2 = GfxFamily::cmdInitLoadRegisterImm;
+    pCmd2->setRegisterOffset(CS_GPR_R1);
+    pCmd2->setDataDword(mask);

-    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
-    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
-    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
+    // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
+    auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
+    reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
+    pCmd3++;
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);

-    // compute executionMask - to tell which SIMD lines are active within thread
-    auto remainderSimdLanes = localWorkSize & (simd - 1);
-    uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
-    if (!executionMask)
-        executionMask = ~executionMask;
+    // Setup first operand of MI_MATH - load CS_GPR_R0 into register A
+    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
+    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
+    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
+    pAluParam++;

-    using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
+    // Setup second operand of MI_MATH - load CS_GPR_R1 into register B
+    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
+    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
+    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
+    pAluParam++;

-    walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
-    walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
-    walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
+    // Setup third operand of MI_MATH - "Operation" on registers A and B
+    pAluParam->DW0.BitField.ALUOpcode = operation;
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;

-    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
-    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
-    walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
+    // Setup fourth operand of MI_MATH - store result into CS_GPR_R0
+    pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
+    pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
+    pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;

-    return localWorkSize;
+    // LOAD value of CS_GPR_R0 into "Register"
+    auto pCmd4 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
+    *pCmd4 = GfxFamily::cmdInitLoadRegisterReg;
+    pCmd4->setSourceRegisterAddress(CS_GPR_R0);
+    pCmd4->setDestinationRegisterAddress(aluRegister);
+
+    // Add PIPE_CONTROL to flush caches
+    auto pCmd5 = pCommandStream->getSpaceForCmd<PIPE_CONTROL>();
+    *pCmd5 = GfxFamily::cmdInitPipeControl;
+    pCmd5->setCommandStreamerStallEnable(true);
+    pCmd5->setDcFlushEnable(true);
+    pCmd5->setTextureCacheInvalidationEnable(true);
+    pCmd5->setPipeControlFlushEnable(true);
+    pCmd5->setStateCacheInvalidationEnable(true);
 }

 template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
-    LinearStream &commandStream,
-    DeviceQueueHw<GfxFamily> &devQueueHw,
-    PreemptionMode preemptionMode,
-    SchedulerKernel &scheduler,
-    IndirectHeap *ssh,
-    IndirectHeap *dsh) {
+void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
+    TagNode<HwTimeStamps> &hwTimeStamps,
+    LinearStream *commandStream) {

-    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
-    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
-    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
+    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

-    bool dcFlush = false;
-    PipeControlHelper<GfxFamily>::addPipeControl(commandStream, dcFlush);
+    // PIPE_CONTROL for global timestamp
+    uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS);

-    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
-    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
-    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
-    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
+    PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, timeStampAddress, 0llu, false);

-    // Program media interface descriptor load
-    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
-        commandStream,
-        offsetInterfaceDescriptor,
-        totalInterfaceDescriptorTableSize);
+    //MI_STORE_REGISTER_MEM for context local timestamp
+    timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS);

-    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
+    //low part
+    auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
+    *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
+    adjustMiStoreRegMemMode(pMICmdLow);
+    pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
+    pMICmdLow->setMemoryAddress(timeStampAddress);
+}

-    // Determine SIMD size
-    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
-    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
+template <typename GfxFamily>
+void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
+    TagNode<HwTimeStamps> &hwTimeStamps,
+    LinearStream *commandStream) {

-    // Patch our kernel constants
-    *scheduler.globalWorkOffsetX = 0;
-    *scheduler.globalWorkOffsetY = 0;
-    *scheduler.globalWorkOffsetZ = 0;
+    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

-    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
-    *scheduler.globalWorkSizeY = 1;
-    *scheduler.globalWorkSizeZ = 1;
+    // PIPE_CONTROL for global timestamp
+    auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
+    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
+    pPipeControlCmd->setCommandStreamerStallEnable(true);

-    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
-    *scheduler.localWorkSizeY = 1;
-    *scheduler.localWorkSizeZ = 1;
+    //MI_STORE_REGISTER_MEM for context local timestamp
+    uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS);

-    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
-    *scheduler.localWorkSizeY2 = 1;
-    *scheduler.localWorkSizeZ2 = 1;
+    //low part
+    auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
+    *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
+    adjustMiStoreRegMemMode(pMICmdLow);
+    pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
+    pMICmdLow->setMemoryAddress(timeStampAddress);
+}

-    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
-    *scheduler.enqueuedLocalWorkSizeY = 1;
-    *scheduler.enqueuedLocalWorkSizeZ = 1;
+template <typename GfxFamily>
+void GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(
+    LinearStream *commandStream,
+    uint64_t memoryAddress,
+    uint32_t registerAddress) {

-    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
-    *scheduler.numWorkGroupsY = 0;
-    *scheduler.numWorkGroupsZ = 0;
+    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

-    *scheduler.workDim = 1;
+    auto pCmd = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
+    *pCmd = GfxFamily::cmdInitStoreRegisterMem;
+    pCmd->setRegisterAddress(registerAddress);
+    pCmd->setMemoryAddress(memoryAddress);
+}

-    // Send our indirect object data
-    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
-    size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
+template <typename GfxFamily> // Issues one store-register command per general-purpose counter, writing to consecutive cl_uint-sized slots from baseAddress.
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
+    LinearStream *commandStream,
+    uint64_t baseAddress) {

-    // Create indirectHeap for IOH that is located at the end of device enqueue DSH
-    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
-    IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
-    indirectObjectHeap.getSpace(curbeOffset);
-    IndirectHeap *ioh = &indirectObjectHeap;
-
-    // Program the walker.  Invokes execution so all state should already be programmed
-    auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
-    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
-
-    bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
-    bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
-    KernelCommandsHelper<GfxFamily>::sendIndirectState(
-        commandStream,
-        *dsh,
-        *ioh,
-        *ssh,
-        scheduler,
-        simd,
-        localWorkSizes,
-        offsetInterfaceDescriptorTable,
-        interfaceDescriptorIndex,
-        preemptionMode,
-        pGpGpuWalkerCmd,
-        nullptr,
-        localIdsGenerationByRuntime);
-
-    // Implement enabling special WA DisableLSQCROPERFforOCL if needed
-    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
-
-    size_t globalOffsets[3] = {0, 0, 0};
-    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
-    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes,
-                                                           simd, 1, localIdsGenerationByRuntime, inlineDataProgrammingRequired,
-                                                           *scheduler.getKernelInfo().patchInfo.threadPayload);
-
-    // Implement disabling special WA DisableLSQCROPERFforOCL if needed
-    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false);
-
-    // Do not put BB_START only when returning in first Scheduler run
-    if (devQueueHw.getSchedulerReturnInstance() != 1) {
-
-        PipeControlHelper<GfxFamily>::addPipeControl(commandStream, true);
-
-        // Add BB Start Cmd to the SLB in the Primary Batch Buffer
-        auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
-        *bbStart = GfxFamily::cmdInitBatchBufferStart;
-        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
-        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
-        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
+    // Read the general-purpose counters: INSTR_GENERAL_PURPOSE_COUNTERS_COUNT stores, stepping register and memory by sizeof(cl_uint)
+    for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
+        uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint); // registers are addressed relative to INSTR_PERF_CNT_1_DW0
+        // The Gp field is 2 * uint64 wide, so it can hold 4 uint32 values
+        uint64_t address = baseAddress + i * sizeof(cl_uint);
+        dispatchStoreRegisterCommand(commandStream, address, regAddr);
    }
 }

 template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
-    LinearStream *cmdStream,
-    WALKER_TYPE<GfxFamily> *walkerCmd,
-    TagNode<TimestampPacketStorage> *timestampPacketNode,
-    TimestampPacketStorage::WriteOperationType writeOperationType) {
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands( // Stores every user-configured read register (ReadRegs of the queue's perf-counter config) to memory at baseAddress.
+    CommandQueue &commandQueue,
+    LinearStream *commandStream,
+    uint64_t baseAddress) {

-    if (TimestampPacketStorage::WriteOperationType::AfterWalker == writeOperationType) {
-        uint64_t address = timestampPacketNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd);
-        PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, 0, false);
+    auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs;
+
+    for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
+        uint32_t regAddr = userRegs->Reg[i].Offset;
+        // Each user register gets a cl_ulong-wide slot; the base (low) register is stored at the start of the slot
+        uint64_t address = baseAddress + i * sizeof(cl_ulong);
+        dispatchStoreRegisterCommand(commandStream, address, regAddr);
+
+        if (userRegs->Reg[i].BitSize > 32) { // wider than 32 bits: a second store covers the register at +sizeof(cl_uint) into the slot's second dword
+            dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint));
+        }
    }
 }

+template <typename GfxFamily> // Stores the OA status, head-pointer and tail-pointer registers into the matching HWPerfCounters fields of the node.
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
+    TagNode<HwPerfCounter> &hwPerfCounter,
+    LinearStream *commandStream) {
+
+    dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS); // INSTR_OA_STATUS -> OaStatus
+    dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR); // INSTR_OA_HEAD_PTR -> OaHead
+    dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR); // INSTR_OA_TAIL_PTR -> OaTail
+}
+
+template <typename GfxFamily> // Emits the "begin" perf-counter capture sequence for one HwPerfCounter node, then sends the perf-counter config via the queue.
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
+    CommandQueue &commandQueue,
+    TagNode<HwPerfCounter> &hwPerfCounter,
+    LinearStream *commandStream) {
+
+    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
+
+    auto perfCounters = commandQueue.getPerfCounters();
+
+    uint32_t currentReportId = perfCounters->getCurrentReportId();
+    uint64_t address = 0;
+    // Flush the command streamer: PIPE_CONTROL with the command-streamer stall bit set
+    auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
+    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
+    pPipeControlCmd->setCommandStreamerStallEnable(true);
+
+    // Store the value of the NOOPID register into DMAFenceIdBegin
+    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID);
+
+    // Read the core frequency (RPSTAT1 register) into CoreFreqBegin
+    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1);
+
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp)); // general-purpose counters -> HwPerfReportBegin.Gp
+
+    auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>(); // MI_REPORT_PERF_COUNT targeting HwPerfReportBegin.Oa
+    *pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
+    pReportPerfCount->setReportId(currentReportId);
+    address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa);
+    pReportPerfCount->setMemoryAddress(address);
+
+    address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS); // 'address' is reused: now the GlobalStartTS timestamp slot
+
+    PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
+
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User)); // user registers -> HwPerfReportBegin.User
+
+    commandQueue.sendPerfCountersConfig();
+}
+
+template <typename GfxFamily> // Emits the "end" perf-counter capture sequence for one HwPerfCounter node, then records a CPU timestamp on the perf counters object.
+void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
+    CommandQueue &commandQueue,
+    TagNode<HwPerfCounter> &hwPerfCounter,
+    LinearStream *commandStream) {
+
+    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
+
+    auto perfCounters = commandQueue.getPerfCounters();
+
+    uint32_t currentReportId = perfCounters->getCurrentReportId();
+
+    // Flush the command streamer: PIPE_CONTROL with the command-streamer stall bit set
+    auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
+    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
+    pPipeControlCmd->setCommandStreamerStallEnable(true);
+
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream); // OA status/head/tail registers
+
+    // Timestamp: Global End (PIPE_CONTROL post-sync timestamp write into HWTimeStamp.GlobalEndTS)
+    uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS);
+    PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
+
+    auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>(); // MI_REPORT_PERF_COUNT targeting HwPerfReportEnd.Oa
+    *pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
+    pReportPerfCount->setReportId(currentReportId);
+    address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa);
+    pReportPerfCount->setMemoryAddress(address);
+
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp)); // general-purpose counters -> HwPerfReportEnd.Gp
+
+    // Store the value of the NOOPID register into DMAFenceIdEnd
+    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID);
+
+    // Read the core frequency (RPSTAT1 register) into CoreFreqEnd
+    GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1);
+
+    GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User)); // user registers -> HwPerfReportEnd.User
+
+    perfCounters->setCpuTimestamp();
+}
+
+template <typename GfxFamily> // Generic definition of the DisableLSQCROPERFforOCL workaround hook.
+void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { // empty body: this definition programs nothing
+}
+
+template <typename GfxFamily> // Command-stream size to reserve for the DisableLSQCROPERFforOCL workaround.
+size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
+    return (size_t)0; // always zero here; pKernel is not inspected
+}
+
+template <typename GfxFamily> // Hook invoked on a freshly initialized MI_STORE_REGISTER_MEM before its addresses are set.
+void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd) { // empty body: storeCmd is left unchanged by this definition
+}
+
+template <typename GfxFamily> // Returns the total command-stream bytes to reserve for all dispatches in multiDispatchInfo plus the conditional extras below.
+size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
+    size_t expectedSizeCS = 0;
+    Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
+    if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) { // main kernel may be absent, hence the null check
+        expectedSizeCS += sizeof(PIPE_CONTROL); // one extra PIPE_CONTROL when aux translation is required
+    }
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel());
+        if (dispatchInfo.isPipeControlRequired()) {
+            expectedSizeCS += sizeof(PIPE_CONTROL); // per-dispatch PIPE_CONTROL requested by the dispatch info
+        }
+    }
+    if (parentKernel) { // a parent kernel additionally needs room for the scheduler kernel obtained from the built-ins
+        SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext());
+        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, &scheduler);
+    }
+    if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { // timestamp packets: the packet write itself plus the commands for csrDeps
+        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
+        expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDeps);
+    }
+    return expectedSizeCS;
+}
+
+template <typename GfxFamily> // Dispatches to the non-kernel or kernel size calculation depending on isCommandWithoutKernel(cmdType).
+size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
+    if (isCommandWithoutKernel(cmdType)) {
+        return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue); // pKernel is not consulted on this path
+    } else {
+        return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel);
+    }
+}
+
+template <typename GfxFamily> // Command-stream bytes for a command without a kernel: non-zero only when profiling space is requested.
+size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) { // reservePerfCounters and commandQueue are not read in this body
+    size_t size = 0;
+    if (reserveProfilingCmdsSpace) {
+        size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); // NOTE(review): the kernel path reserves 2 SRMs for profiling, this path 4 - confirm against the emitting code
+    }
+    return size;
+}
+
 } // namespace NEO
--- a/runtime/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/runtime/command_queue/gpgpu_walker_bdw_plus.inl
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2018-2019 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+#include "runtime/command_queue/gpgpu_walker_base.inl"
+
+namespace NEO {
+
+template <typename GfxFamily> // Programs thread counts, group dimensions, starting group ids, execution masks and SIMD size on walkerCmd; returns the total local work size.
+inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
+    WALKER_TYPE<GfxFamily> *walkerCmd,
+    const size_t globalOffsets[3], // not read in this body
+    const size_t startWorkGroups[3],
+    const size_t numWorkGroups[3],
+    const size_t localWorkSizesIn[3],
+    uint32_t simd,
+    uint32_t workDim, // not read in this body
+    bool localIdsGenerationByRuntime, // not read in this body
+    bool inlineDataProgrammingRequired, // not read in this body
+    const iOpenCL::SPatchThreadPayload &threadPayload) { // not read in this body
+    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]; // total work items per work group
+
+    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
+    walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
+
+    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
+    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
+    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
+
+    // Compute executionMask - tells which SIMD lanes are active within the thread
+    auto remainderSimdLanes = localWorkSize & (simd - 1); // equals localWorkSize % simd only if simd is a power of two
+    uint64_t executionMask = (1ull << remainderSimdLanes) - 1; // low 'remainderSimdLanes' bits set
+    if (!executionMask) // no remainder: flip the zero mask to all ones so every lane is enabled
+        executionMask = ~executionMask;
+
+    using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
+
+    walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask)); // truncated to the low 32 bits
+    walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
+    walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4)); // 8 -> 0, 16 -> 1, 32 -> 2; NOTE(review): presumably the SIMD_SIZE enum encoding - confirm
+
+    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
+    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
+    walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
+
+    return localWorkSize;
+}
+
+template <typename GfxFamily> // Programs a 1-D GPGPU_WALKER dispatch of the scheduler kernel into commandStream, optionally followed by a BB_START to the SLB.
+void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
+    LinearStream &commandStream,
+    DeviceQueueHw<GfxFamily> &devQueueHw,
+    PreemptionMode preemptionMode,
+    SchedulerKernel &scheduler,
+    IndirectHeap *ssh,
+    IndirectHeap *dsh) {
+
+    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
+    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
+    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
+
+    bool dcFlush = false;
+    PipeControlHelper<GfxFamily>::addPipeControl(commandStream, dcFlush); // leading PIPE_CONTROL, added with dcFlush == false
+
+    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
+    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize; // the descriptor table offset is taken from colorCalcStateSize
+    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
+    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
+
+    // Program media interface descriptor load
+    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
+        commandStream,
+        offsetInterfaceDescriptor,
+        totalInterfaceDescriptorTableSize);
+
+    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0); // the table offset is expected to be a multiple of 64
+
+    // Determine SIMD size
+    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
+    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
+
+    // Patch our kernel constants (1-D dispatch: offsets 0, sizes taken from the scheduler's GWS/LWS)
+    *scheduler.globalWorkOffsetX = 0;
+    *scheduler.globalWorkOffsetY = 0;
+    *scheduler.globalWorkOffsetZ = 0;
+
+    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
+    *scheduler.globalWorkSizeY = 1;
+    *scheduler.globalWorkSizeZ = 1;
+
+    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
+    *scheduler.localWorkSizeY = 1;
+    *scheduler.localWorkSizeZ = 1;
+
+    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
+    *scheduler.localWorkSizeY2 = 1;
+    *scheduler.localWorkSizeZ2 = 1;
+
+    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
+    *scheduler.enqueuedLocalWorkSizeY = 1;
+    *scheduler.enqueuedLocalWorkSizeZ = 1;
+
+    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
+    *scheduler.numWorkGroupsY = 0; // NOTE(review): patched as 0 here while the walker below is programmed with 1 group in Y - confirm this is intended
+    *scheduler.numWorkGroupsZ = 0; // NOTE(review): same as Y above
+
+    *scheduler.workDim = 1;
+
+    // Send our indirect object data
+    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
+    size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
+
+    // Create indirectHeap for IOH that is located at the end of device enqueue DSH
+    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
+    IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace()); // local heap object built over the DSH's CPU base and size
+    indirectObjectHeap.getSpace(curbeOffset); // advance the local heap by curbeOffset
+    IndirectHeap *ioh = &indirectObjectHeap;
+
+    // Program the walker.  Invokes execution so all state should already be programmed
+    auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
+    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
+
+    bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
+    bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
+    KernelCommandsHelper<GfxFamily>::sendIndirectState(
+        commandStream,
+        *dsh,
+        *ioh,
+        *ssh,
+        scheduler,
+        simd,
+        localWorkSizes,
+        offsetInterfaceDescriptorTable,
+        interfaceDescriptorIndex,
+        preemptionMode,
+        pGpGpuWalkerCmd,
+        nullptr,
+        localIdsGenerationByRuntime);
+
+    // Implement enabling special WA DisableLSQCROPERFforOCL if needed
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
+
+    size_t globalOffsets[3] = {0, 0, 0};
+    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
+    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, // the zero array serves as both the global offsets and the starting work groups
+                                                           simd, 1, localIdsGenerationByRuntime, inlineDataProgrammingRequired,
+                                                           *scheduler.getKernelInfo().patchInfo.threadPayload);
+
+    // Implement disabling special WA DisableLSQCROPERFforOCL if needed
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false);
+
+    // Skip the trailing PIPE_CONTROL + BB_START only when getSchedulerReturnInstance() == 1 (returning in the first Scheduler run)
+    if (devQueueHw.getSchedulerReturnInstance() != 1) {
+
+        PipeControlHelper<GfxFamily>::addPipeControl(commandStream, true);
+
+        // Add BB Start Cmd to the SLB in the Primary Batch Buffer
+        auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
+        *bbStart = GfxFamily::cmdInitBatchBufferStart;
+        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
+        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
+        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress); // jump target: GPU address of the device queue's SLB buffer
+    }
+}
+
+template <typename GfxFamily> // For AfterWalker only: programs a PIPE_CONTROL post-sync write to the node's packets[0].contextEnd.
+void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
+    LinearStream *cmdStream,
+    WALKER_TYPE<GfxFamily> *walkerCmd, // not read in this body
+    TagNode<TimestampPacketStorage> *timestampPacketNode,
+    TimestampPacketStorage::WriteOperationType writeOperationType) {
+
+    if (TimestampPacketStorage::WriteOperationType::AfterWalker == writeOperationType) { // any other write type emits nothing here
+        uint64_t address = timestampPacketNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd);
+        PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, 0, false); // immediate-data post-sync write; the value argument is 0
+    }
+}
+
+template <typename GfxFamily> // Command-stream bytes to reserve for one kernel dispatch, including optional profiling and perf-counter commands.
+size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
+    size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
+                  sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1); // two PIPE_CONTROLs when the pipe-control WA is required, otherwise one
+    size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U);
+    size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
+    if (reserveProfilingCmdsSpace) {
+        size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+    }
+    if (reservePerfCounters) {
+        // Start cmds
+        // P_C: flush CS & TimeStamp BEGIN
+        size += 2 * sizeof(PIPE_CONTROL);
+        // SRM: NOOPID & Frequency
+        size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        // General-purpose registers
+        size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        // Report perf count
+        size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+        // User registers
+        size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+
+        // End cmds
+        // P_C: flush CS & TimeStamp END
+        size += 2 * sizeof(PIPE_CONTROL);
+        // OA buffer state: status, head, tail
+        size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        // Report perf count
+        size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+        // General-purpose registers
+        size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        // SRM: NOOPID & Frequency
+        size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        // User registers
+        size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+    }
+    size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
+
+    return size;
+}
+
+template <typename GfxFamily> // Command-stream bytes reserved for a timestamp packet write.
+size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
+    return sizeof(PIPE_CONTROL); // room for exactly one PIPE_CONTROL
+}
+
+} // namespace NEO
--- a/runtime/command_queue/hardware_interface.inl
+++ b/runtime/command_queue/hardware_interface.inl
@@ -1,227 +0,0 @@
-/*
- * Copyright (C) 2018-2019 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#pragma once
-#include "runtime/command_queue/hardware_interface.h"
-#include "runtime/helpers/kernel_commands.h"
-#include "runtime/helpers/task_information.h"
-#include "runtime/memory_manager/internal_allocation_storage.h"
-
-namespace NEO {
-
-template <typename GfxFamily>
-inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
-                                                                                 const Kernel &kernel) {
-    auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
-    *walkerCmd = GfxFamily::cmdInitGpgpuWalker;
-    return walkerCmd;
-}
-
-template <typename GfxFamily>
-void HardwareInterface<GfxFamily>::dispatchWalker(
-    CommandQueue &commandQueue,
-    const MultiDispatchInfo &multiDispatchInfo,
-    const CsrDependencies &csrDependencies,
-    KernelOperation **blockedCommandsData,
-    TagNode<HwTimeStamps> *hwTimeStamps,
-    TagNode<HwPerfCounter> *hwPerfCounter,
-    TimestampPacketContainer *previousTimestampPacketNodes,
-    TimestampPacketContainer *currentTimestampPacketNodes,
-    PreemptionMode preemptionMode,
-    bool blockQueue,
-    uint32_t commandType) {
-
-    LinearStream *commandStream = nullptr;
-    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    auto parentKernel = multiDispatchInfo.peekParentKernel();
-    auto mainKernel = multiDispatchInfo.peekMainKernel();
-
-    for (auto &dispatchInfo : multiDispatchInfo) {
-        // Compute local workgroup sizes
-        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
-            const auto lws = generateWorkgroupSize(dispatchInfo);
-            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
-        }
-    }
-
-    // Allocate command stream and indirect heaps
-    if (blockQueue) {
-        using KCH = KernelCommandsHelper<GfxFamily>;
-
-        constexpr static auto additionalAllocationSize = CSRequirements::csOverfetchSize;
-        constexpr static auto allocationSize = MemoryConstants::pageSize64k - additionalAllocationSize;
-        commandStream = new LinearStream();
-        commandQueue.getCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
-
-        if (parentKernel) {
-            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
-
-            commandQueue.allocateHeapMemory(
-                IndirectHeap::DYNAMIC_STATE,
-                commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
-                dsh);
-
-            dsh->getSpace(colorCalcSize);
-            ioh = dsh;
-            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
-                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
-                                                IndirectHeap::SURFACE_STATE>(*parentKernel) +
-                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
-                                            ssh);
-        } else {
-            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
-            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
-            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
-        }
-
-        using UniqueIH = std::unique_ptr<IndirectHeap>;
-        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
-                                                   UniqueIH(ssh), *commandQueue.getCommandStreamReceiver().getInternalAllocationStorage());
-        if (parentKernel) {
-            (*blockedCommandsData)->doNotFreeISH = true;
-        }
-    } else {
-        commandStream = &commandQueue.getCS(0);
-        if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
-            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
-        }
-        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
-        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
-        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
-    }
-
-    TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDependencies);
-
-    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
-
-    uint32_t interfaceDescriptorIndex = 0;
-    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
-
-    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
-
-    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
-                       parentKernel, dsh, commandStream);
-
-    // Program media interface descriptor load
-    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
-        *commandStream,
-        offsetInterfaceDescriptorTable,
-        totalInterfaceDescriptorTableSize);
-
-    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
-
-    if (mainKernel->isAuxTranslationRequired()) {
-        using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
-        auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
-        *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
-        pPipeControlCmd->setDcFlushEnable(true);
-        pPipeControlCmd->setCommandStreamerStallEnable(true);
-    }
-
-    size_t currentDispatchIndex = 0;
-    for (auto &dispatchInfo : multiDispatchInfo) {
-        auto &kernel = *dispatchInfo.getKernel();
-        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
-        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
-        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
-        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
-        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
-
-        // If we don't have a required WGS, compute one opportunistically
-        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
-        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
-            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
-        }
-
-        //Get dispatch geometry
-        uint32_t dim = dispatchInfo.getDim();
-        Vec3<size_t> gws = dispatchInfo.getGWS();
-        Vec3<size_t> offset = dispatchInfo.getOffset();
-        Vec3<size_t> startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();
-
-        // Compute local workgroup sizes
-        Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
-        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
-
-        // Compute number of work groups
-        Vec3<size_t> totalNumberOfWorkgroups = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
-                                                                                                 : generateWorkgroupsNumber(gws, lws);
-
-        Vec3<size_t> numberOfWorkgroups = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : totalNumberOfWorkgroups;
-
-        size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
-
-        // Patch our kernel constants
-        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
-        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
-        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
-
-        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
-        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
-        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
-
-        if ((&kernel == mainKernel) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
-            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
-            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
-            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
-        }
-
-        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
-        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
-        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
-
-        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
-        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
-        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
-
-        if (&kernel == mainKernel) {
-            *kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
-            *kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
-            *kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
-        }
-
-        *kernel.workDim = dim;
-
-        // Send our indirect object data
-        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
-
-        dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
-                                           hwPerfCounter, commandStream, commandQueue);
-
-        dispatchWorkarounds(commandStream, commandQueue, kernel, true);
-
-        if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
-            auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
-            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, timestampPacketNode, TimestampPacketStorage::WriteOperationType::BeforeWalker);
-        }
-
-        programWalker(*commandStream, kernel, commandQueue, currentTimestampPacketNodes, *dsh, *ioh, *ssh, globalWorkSizes,
-                      localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
-                      offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);
-
-        dispatchWorkarounds(commandStream, commandQueue, kernel, false);
-        if (dispatchInfo.isPipeControlRequired()) {
-            using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
-            auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
-            *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
-            pPipeControlCmd->setCommandStreamerStallEnable(true);
-        }
-
-        currentDispatchIndex++;
-    }
-    if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
-        uint64_t postSyncAddress = 0;
-        if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
-            auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
-            postSyncAddress = timestampPacketNodeForPostSync->getGpuAddress();
-        }
-        KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress, 0);
-    }
-    dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
-}
-
-} // namespace NEO
--- a/runtime/command_queue/hardware_interface_base.inl
+++ b/runtime/command_queue/hardware_interface_base.inl
@@ -7,134 +7,221 @@

 #pragma once
 #include "runtime/command_queue/hardware_interface.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/task_information.h"
+#include "runtime/memory_manager/internal_allocation_storage.h"

 namespace NEO {

 template <typename GfxFamily>
-inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
-    const size_t &offsetInterfaceDescriptorTable,
+inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
+                                                                                 const Kernel &kernel) {
+    auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
+    *walkerCmd = GfxFamily::cmdInitGpgpuWalker;
+    return walkerCmd;
+}
+
+template <typename GfxFamily>
+void HardwareInterface<GfxFamily>::dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
-    size_t &totalInterfaceDescriptorTableSize,
-    Kernel *parentKernel,
-    IndirectHeap *dsh,
-    LinearStream *commandStream) {
-
-    size_t numDispatches = multiDispatchInfo.size();
-    totalInterfaceDescriptorTableSize *= numDispatches;
-
-    if (!parentKernel) {
-        dsh->getSpace(totalInterfaceDescriptorTableSize);
-    } else {
-        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
-    }
-}
-
-template <typename GfxFamily>
-inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
-    LinearStream *commandStream,
-    CommandQueue &commandQueue,
-    Kernel &kernel,
-    const bool &enable) {
-
-    if (enable) {
-        PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
-        // Implement enabling special WA DisableLSQCROPERFforOCL if needed
-        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
-    } else {
-        // Implement disabling special WA DisableLSQCROPERFforOCL if needed
-        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
-        PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
-    }
-}
-
-template <typename GfxFamily>
-inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
-    const DispatchInfo &dispatchInfo,
-    const MultiDispatchInfo &multiDispatchInfo,
+    const CsrDependencies &csrDependencies,
+    KernelOperation **blockedCommandsData,
    TagNode<HwTimeStamps> *hwTimeStamps,
    TagNode<HwPerfCounter> *hwPerfCounter,
-    LinearStream *commandStream,
-    CommandQueue &commandQueue) {
-
-    if (&dispatchInfo == &*multiDispatchInfo.begin()) {
-        // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
-        if (hwTimeStamps != nullptr) {
-            GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
-        }
-        if (hwPerfCounter != nullptr) {
-            GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
-        }
-    }
-}
-
-template <typename GfxFamily>
-inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
-    TagNode<HwTimeStamps> *hwTimeStamps,
-    TagNode<HwPerfCounter> *hwPerfCounter,
-    LinearStream *commandStream,
-    CommandQueue &commandQueue) {
-
-    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
-    if (hwTimeStamps != nullptr) {
-        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
-    }
-    if (hwPerfCounter != nullptr) {
-        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
-    }
-}
-
-template <typename GfxFamily>
-inline void HardwareInterface<GfxFamily>::programWalker(
-    LinearStream &commandStream,
-    Kernel &kernel,
-    CommandQueue &commandQueue,
+    TimestampPacketContainer *previousTimestampPacketNodes,
    TimestampPacketContainer *currentTimestampPacketNodes,
-    IndirectHeap &dsh,
-    IndirectHeap &ioh,
-    IndirectHeap &ssh,
-    size_t globalWorkSizes[3],
-    size_t localWorkSizes[3],
    PreemptionMode preemptionMode,
-    size_t currentDispatchIndex,
-    uint32_t &interfaceDescriptorIndex,
-    const DispatchInfo &dispatchInfo,
-    size_t offsetInterfaceDescriptorTable,
-    Vec3<size_t> &numberOfWorkgroups,
-    Vec3<size_t> &startOfWorkgroups) {
+    bool blockQueue,
+    uint32_t commandType) {

-    auto walkerCmd = allocateWalkerSpace(commandStream, kernel);
-    uint32_t dim = dispatchInfo.getDim();
-    uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
+    LinearStream *commandStream = nullptr;
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    auto parentKernel = multiDispatchInfo.peekParentKernel();
+    auto mainKernel = multiDispatchInfo.peekMainKernel();

-    size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
-    size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
-    size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
-
-    if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
-        auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
-        GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker);
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        // Compute local workgroup sizes
+        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
+            const auto lws = generateWorkgroupSize(dispatchInfo);
+            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
+        }
    }

-    KernelCommandsHelper<GfxFamily>::sendIndirectState(
-        commandStream,
-        dsh,
-        ioh,
-        ssh,
-        kernel,
-        simd,
-        localWorkSizes,
-        offsetInterfaceDescriptorTable,
-        interfaceDescriptorIndex,
-        preemptionMode,
-        walkerCmd,
-        nullptr,
-        true);
+    // Allocate command stream and indirect heaps
+    if (blockQueue) {
+        using KCH = KernelCommandsHelper<GfxFamily>;

-    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
-                                                           numWorkGroups, localWorkSizes, simd, dim,
-                                                           false, false,
-                                                           *kernel.getKernelInfo().patchInfo.threadPayload);
+        constexpr static auto additionalAllocationSize = CSRequirements::csOverfetchSize;
+        constexpr static auto allocationSize = MemoryConstants::pageSize64k - additionalAllocationSize;
+        commandStream = new LinearStream();
+        commandQueue.getCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
+
+        if (parentKernel) {
+            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
+
+            commandQueue.allocateHeapMemory(
+                IndirectHeap::DYNAMIC_STATE,
+                commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
+                dsh);
+
+            dsh->getSpace(colorCalcSize);
+            ioh = dsh;
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
+                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
+                                                IndirectHeap::SURFACE_STATE>(*parentKernel) +
+                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
+                                            ssh);
+        } else {
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
+            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
+        }
+
+        using UniqueIH = std::unique_ptr<IndirectHeap>;
+        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
+                                                   UniqueIH(ssh), *commandQueue.getCommandStreamReceiver().getInternalAllocationStorage());
+        if (parentKernel) {
+            (*blockedCommandsData)->doNotFreeISH = true;
+        }
+    } else {
+        commandStream = &commandQueue.getCS(0);
+        if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
+            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
+        }
+        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
+        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
+        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
+    }
+
+    TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDependencies);
+
+    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
+
+    uint32_t interfaceDescriptorIndex = 0;
+    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
+
+    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
+
+    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
+                       parentKernel, dsh, commandStream);
+
+    // Program media interface descriptor load
+    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
+        *commandStream,
+        offsetInterfaceDescriptorTable,
+        totalInterfaceDescriptorTableSize);
+
+    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
+
+    if (mainKernel->isAuxTranslationRequired()) {
+        using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+        auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+        *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
+        pPipeControlCmd->setDcFlushEnable(true);
+        pPipeControlCmd->setCommandStreamerStallEnable(true);
+    }
+
+    size_t currentDispatchIndex = 0;
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        auto &kernel = *dispatchInfo.getKernel();
+        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
+        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
+
+        // If we don't have a required WGS, compute one opportunistically
+        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
+        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
+            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
+        }
+
+        // Get dispatch geometry
+        uint32_t dim = dispatchInfo.getDim();
+        Vec3<size_t> gws = dispatchInfo.getGWS();
+        Vec3<size_t> offset = dispatchInfo.getOffset();
+        Vec3<size_t> startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();
+
+        // Read local workgroup sizes (generated in the first loop of this function when x == 0)
+        Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
+        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
+
+        // Compute number of work groups
+        Vec3<size_t> totalNumberOfWorkgroups = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
+                                                                                                 : generateWorkgroupsNumber(gws, lws);
+
+        Vec3<size_t> numberOfWorkgroups = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : totalNumberOfWorkgroups;
+
+        size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
+
+        // Patch our kernel constants
+        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
+        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
+        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
+
+        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
+        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
+        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
+
+        if ((&kernel == mainKernel) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
+            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
+            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
+            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
+        }
+
+        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
+        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
+        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
+
+        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
+        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
+        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
+
+        if (&kernel == mainKernel) {
+            *kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
+            *kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
+            *kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
+        }
+
+        *kernel.workDim = dim;
+
+        // Send our indirect object data
+        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
+
+        dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
+                                           hwPerfCounter, commandStream, commandQueue);
+
+        dispatchWorkarounds(commandStream, commandQueue, kernel, true);
+
+        if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+            auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
+            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, timestampPacketNode, TimestampPacketStorage::WriteOperationType::BeforeWalker);
+        }
+
+        programWalker(*commandStream, kernel, commandQueue, currentTimestampPacketNodes, *dsh, *ioh, *ssh, globalWorkSizes,
+                      localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
+                      offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);
+
+        dispatchWorkarounds(commandStream, commandQueue, kernel, false);
+        if (dispatchInfo.isPipeControlRequired()) {
+            using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+            auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+            *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
+            pPipeControlCmd->setCommandStreamerStallEnable(true);
+        }
+
+        currentDispatchIndex++;
+    }
+    if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
+        uint64_t postSyncAddress = 0;
+        if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+            auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
+            postSyncAddress = timestampPacketNodeForPostSync->getGpuAddress();
+        }
+        KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress, 0);
+    }
+    dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
 }

 } // namespace NEO
--- a/runtime/command_queue/hardware_interface_bdw_plus.inl
+++ b/runtime/command_queue/hardware_interface_bdw_plus.inl
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2018-2019 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+#include "runtime/command_queue/hardware_interface_base.inl"
+
+namespace NEO {
+
+template <typename GfxFamily>
+inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
+    const size_t &offsetInterfaceDescriptorTable,
+    CommandQueue &commandQueue,
+    const MultiDispatchInfo &multiDispatchInfo,
+    size_t &totalInterfaceDescriptorTableSize,
+    Kernel *parentKernel,
+    IndirectHeap *dsh,
+    LinearStream *commandStream) {
+
+    size_t numDispatches = multiDispatchInfo.size();
+    totalInterfaceDescriptorTableSize *= numDispatches;
+
+    if (!parentKernel) {
+        dsh->getSpace(totalInterfaceDescriptorTableSize);
+    } else {
+        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
+    }
+}
+
+template <typename GfxFamily>
+inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
+    LinearStream *commandStream,
+    CommandQueue &commandQueue,
+    Kernel &kernel,
+    const bool &enable) {
+
+    if (enable) {
+        PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
+        // Implement enabling special WA DisableLSQCROPERFforOCL if needed
+        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
+    } else {
+        // Implement disabling special WA DisableLSQCROPERFforOCL if needed
+        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
+        PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
+    }
+}
+
+template <typename GfxFamily>
+inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
+    const DispatchInfo &dispatchInfo,
+    const MultiDispatchInfo &multiDispatchInfo,
+    TagNode<HwTimeStamps> *hwTimeStamps,
+    TagNode<HwPerfCounter> *hwPerfCounter,
+    LinearStream *commandStream,
+    CommandQueue &commandQueue) {
+
+    if (&dispatchInfo == &*multiDispatchInfo.begin()) {
+        // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
+        if (hwTimeStamps != nullptr) {
+            GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
+        }
+        if (hwPerfCounter != nullptr) {
+            GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
+        }
+    }
+}
+
+template <typename GfxFamily>
+inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
+    TagNode<HwTimeStamps> *hwTimeStamps,
+    TagNode<HwPerfCounter> *hwPerfCounter,
+    LinearStream *commandStream,
+    CommandQueue &commandQueue) {
+
+    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
+    if (hwTimeStamps != nullptr) {
+        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
+    }
+    if (hwPerfCounter != nullptr) {
+        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
+    }
+}
+
+template <typename GfxFamily>
+inline void HardwareInterface<GfxFamily>::programWalker(
+    LinearStream &commandStream,
+    Kernel &kernel,
+    CommandQueue &commandQueue,
+    TimestampPacketContainer *currentTimestampPacketNodes,
+    IndirectHeap &dsh,
+    IndirectHeap &ioh,
+    IndirectHeap &ssh,
+    size_t globalWorkSizes[3],
+    size_t localWorkSizes[3],
+    PreemptionMode preemptionMode,
+    size_t currentDispatchIndex,
+    uint32_t &interfaceDescriptorIndex,
+    const DispatchInfo &dispatchInfo,
+    size_t offsetInterfaceDescriptorTable,
+    Vec3<size_t> &numberOfWorkgroups,
+    Vec3<size_t> &startOfWorkgroups) {
+
+    auto walkerCmd = allocateWalkerSpace(commandStream, kernel);
+    uint32_t dim = dispatchInfo.getDim();
+    uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
+
+    size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
+    size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
+    size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
+
+    if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+        auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
+        GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker);
+    }
+
+    KernelCommandsHelper<GfxFamily>::sendIndirectState(
+        commandStream,
+        dsh,
+        ioh,
+        ssh,
+        kernel,
+        simd,
+        localWorkSizes,
+        offsetInterfaceDescriptorTable,
+        interfaceDescriptorIndex,
+        preemptionMode,
+        walkerCmd,
+        nullptr,
+        true);
+
+    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
+                                                           numWorkGroups, localWorkSizes, simd, dim,
+                                                           false, false,
+                                                           *kernel.getKernelInfo().patchInfo.threadPayload);
+}
+
+} // namespace NEO