From ce8284badee412cb6bbd435275e4a438e2ae4452 Mon Sep 17 00:00:00 2001
From: Filip Hazubski <filip.hazubski@intel.com>
Date: Fri, 7 Sep 2018 17:04:18 +0200
Subject: [PATCH] Move dispatchWalker() to separate class

Change-Id: Idc95df3d67775022f2197c1f973182acb3558a2d
Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
---
 CMakeLists.txt                                |   1 +
 runtime/command_queue/CMakeLists.txt          |   2 +
 runtime/command_queue/gpgpu_walker.h          |  40 +-
 runtime/command_queue/gpgpu_walker.inl        | 320 +---------------
 .../hardware_interface/hardware_interface.h   | 144 +++++++
 .../hardware_interface/hardware_interface.inl | 350 ++++++++++++++++++
 runtime/gen10/hw_cmds.h                       |   6 +
 runtime/gen8/hw_cmds_base.h                   |   5 +
 runtime/gen9/hw_cmds_base.h                   |   4 +
 unit_tests/libult/mock_gfx_family.h           |   4 +
 10 files changed, 535 insertions(+), 341 deletions(-)
 create mode 100644 runtime/command_queue/hardware_interface/hardware_interface.h
 create mode 100644 runtime/command_queue/hardware_interface/hardware_interface.inl
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c61f788f83..38c88f3516 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -539,6 +539,7 @@ include_directories(${IGDRCL_SOURCE_DIR}/runtime/gen_common/reg_configs${BRANCH_
 include_directories(${IGDRCL_SOURCE_DIR}/runtime/gmm_helper/${BRANCH_DIR_SUFFIX})
 include_directories(${IGDRCL_SOURCE_DIR}/runtime/gmm_helper/client_context${BRANCH_DIR_SUFFIX})
 include_directories(${IGDRCL_SOURCE_DIR}/runtime/gmm_helper/gmm_memory${BRANCH_DIR_SUFFIX})
+include_directories(${IGDRCL_SOURCE_DIR}/runtime/command_queue/hardware_interface${BRANCH_DIR_SUFFIX})
 
 set(HW_SRC_INCLUDE_PATH ${IGDRCL_SOURCE_DIR}/runtime/gen_common)
 
diff --git a/runtime/command_queue/CMakeLists.txt b/runtime/command_queue/CMakeLists.txt
index 4d1d481d4d..3971d40e1e 100644
--- a/runtime/command_queue/CMakeLists.txt
+++ b/runtime/command_queue/CMakeLists.txt
@@ -54,6 +54,8 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
   ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_avx2.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_sse4.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/local_work_size.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface${BRANCH_DIR_SUFFIX}/hardware_interface.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface${BRANCH_DIR_SUFFIX}/hardware_interface.inl
 )
 target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_COMMAND_QUEUE})
 set_property(GLOBAL PROPERTY RUNTIME_SRCS_COMMAND_QUEUE ${RUNTIME_SRCS_COMMAND_QUEUE})
diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h
index c712c2c73f..7358568c6d 100644
--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -46,6 +46,9 @@ using WALKER_HANDLE = void *;
 template <typename GfxFamily>
 using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
 
+template <typename GfxFamily>
+using HARDWARE_INTERFACE = typename GfxFamily::HARDWARE_INTERFACE;
+
 constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;
 
 constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
@@ -218,43 +221,6 @@ class GpgpuWalkerHelper {
         TimestampPacket *timestampPacket,
         TimestampPacket::WriteOperationType writeOperationType);
 
-    static void getDefaultDshSpace(
-        const size_t &offsetInterfaceDescriptorTable,
-        CommandQueue &commandQueue,
-        const MultiDispatchInfo &multiDispatchInfo,
-        size_t &totalInterfaceDescriptorTableSize,
-        OCLRT::Kernel *parentKernelDispatched,
-        OCLRT::IndirectHeap *dsh,
-        OCLRT::LinearStream *commandStream);
-
-    static INTERFACE_DESCRIPTOR_DATA *obtainInterfaceDescriptorData(
-        WALKER_HANDLE pCmdData);
-
-    static void setOffsetCrossThreadData(
-        WALKER_HANDLE pCmdData,
-        size_t &offsetCrossThreadData,
-        uint32_t &interfaceDescriptorIndex);
-
-    static void dispatchWorkarounds(
-        OCLRT::LinearStream *commandStream,
-        CommandQueue &commandQueue,
-        OCLRT::Kernel &kernel,
-        const bool &enable);
-
-    static void dispatchProfilingPerfStartCommands(
-        const OCLRT::DispatchInfo &dispatchInfo,
-        const MultiDispatchInfo &multiDispatchInfo,
-        HwTimeStamps *hwTimeStamps,
-        OCLRT::HwPerfCounter *hwPerfCounter,
-        OCLRT::LinearStream *commandStream,
-        CommandQueue &commandQueue);
-
-    static void dispatchProfilingPerfEndCommands(
-        HwTimeStamps *hwTimeStamps,
-        OCLRT::HwPerfCounter *hwPerfCounter,
-        OCLRT::LinearStream *commandStream,
-        CommandQueue &commandQueue);
-
     static void dispatchScheduler(
         CommandQueue &commandQueue,
         DeviceQueueHw<GfxFamily> &devQueueHw,
diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl
index 3543613454..c2531006c0 100644
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -22,6 +22,8 @@
 
 #pragma once
 #include "runtime/command_queue/gpgpu_walker.h"
+#include "hardware_interface.h"
+#include "hardware_interface.inl"
 #include "runtime/command_queue/command_queue.h"
 #include "runtime/command_queue/local_id_gen.h"
 #include "runtime/command_stream/command_stream_receiver.h"
@@ -441,218 +443,20 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
     bool blockQueue,
     uint32_t commandType) {
 
-    OCLRT::LinearStream *commandStream = nullptr;
-    OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    auto parentKernel = multiDispatchInfo.peekParentKernel();
-
-    for (auto &dispatchInfo : multiDispatchInfo) {
-        // Compute local workgroup sizes
-        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
-            const auto lws = generateWorkgroupSize(dispatchInfo);
-            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
-        }
-    }
-
-    // Allocate command stream and indirect heaps
-    if (blockQueue) {
-        using KCH = KernelCommandsHelper<GfxFamily>;
-        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize),
-                                         MemoryConstants::pageSize);
-        if (parentKernel) {
-            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
-
-            commandQueue.allocateHeapMemory(
-                IndirectHeap::DYNAMIC_STATE,
-                commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
-                dsh);
-
-            dsh->getSpace(colorCalcSize);
-            ioh = dsh;
-            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
-                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
-                                                IndirectHeap::SURFACE_STATE>(*parentKernel) +
-                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
-                                            ssh);
-        } else {
-            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
-            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
-            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
-        }
-
-        using UniqueIH = std::unique_ptr<IndirectHeap>;
-        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
-                                                   UniqueIH(ssh), *commandQueue.getDevice().getMemoryManager());
-        if (parentKernel) {
-            (*blockedCommandsData)->doNotFreeISH = true;
-        }
-    } else {
-        commandStream = &commandQueue.getCS(0);
-        if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
-            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
-        }
-        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
-        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
-        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
-    }
-
-    if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
-        GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(),
-                                                                         numEventsInWaitList, eventWaitList);
-        if (previousTimestampPacket) {
-            auto compareAddress = previousTimestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
-            KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(*commandStream, compareAddress, 1);
-        }
-    }
-
-    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
-
-    uint32_t interfaceDescriptorIndex = 0;
-    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
-
-    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
-
-    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
-                       parentKernel, dsh, commandStream);
-
-    // Program media interface descriptor load
-    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
-        *commandStream,
-        offsetInterfaceDescriptorTable,
-        totalInterfaceDescriptorTableSize);
-
-    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
-
-    size_t currentDispatchIndex = 0;
-    for (auto &dispatchInfo : multiDispatchInfo) {
-        auto &kernel = *dispatchInfo.getKernel();
-
-        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
-        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
-        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
-        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
-        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
-
-        // Determine SIMD size
-        uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
-
-        // If we don't have a required WGS, compute one opportunistically
-        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
-        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
-            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
-        }
-
-        //Get dispatch geometry
-        uint32_t dim = dispatchInfo.getDim();
-        Vec3<size_t> gws = dispatchInfo.getGWS();
-        Vec3<size_t> offset = dispatchInfo.getOffset();
-        Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
-
-        // Compute local workgroup sizes
-        Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
-        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
-
-        // Compute number of work groups
-        Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
-                                                                              : generateWorkgroupsNumber(gws, lws);
-        Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
-
-        // Patch our kernel constants
-        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
-        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
-        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
-
-        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
-        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
-        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
-
-        if ((&kernel == multiDispatchInfo.peekMainKernel()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
-            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
-            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
-            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
-        }
-
-        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
-        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
-        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
-
-        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
-        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
-        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
-
-        if (&kernel == multiDispatchInfo.peekMainKernel()) {
-            *kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
-            *kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
-            *kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
-        }
-
-        *kernel.workDim = dim;
-
-        // Send our indirect object data
-        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
-
-        dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
-                                           hwPerfCounter, commandStream, commandQueue);
-
-        dispatchWorkarounds(commandStream, commandQueue, kernel, true);
-
-        bool setupTimestampPacket = currentTimestampPacket && (currentDispatchIndex == multiDispatchInfo.size() - 1);
-        if (setupTimestampPacket) {
-            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, currentTimestampPacket,
-                                                               TimestampPacket::WriteOperationType::BeforeWalker);
-        }
-
-        // Program the walker.  Invokes execution so all state should already be programmed
-        auto pWalkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream->getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
-        *pWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
-
-        if (setupTimestampPacket) {
-            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket,
-                                                               TimestampPacket::WriteOperationType::AfterWalker);
-        }
-
-        auto idd = obtainInterfaceDescriptorData(pWalkerCmd);
-
-        auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
-            *commandStream,
-            *dsh,
-            *ioh,
-            *ssh,
-            kernel,
-            simd,
-            localWorkSizes,
-            offsetInterfaceDescriptorTable,
-            interfaceDescriptorIndex,
-            preemptionMode,
-            idd);
-
-        size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
-        size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
-        size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
-        auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pWalkerCmd, globalOffsets, startWorkGroups,
-                                                                                    numWorkGroups, localWorkSizes, simd);
-
-        DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
-        setOffsetCrossThreadData(pWalkerCmd, offsetCrossThreadData, interfaceDescriptorIndex);
-
-        auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
-        DEBUG_BREAK_IF(nullptr == threadPayload);
-
-        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
-        auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
-        localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
-
-        auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
-        DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
-
-        auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
-        auto IndirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
-                                          WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
-        pWalkerCmd->setIndirectDataLength(IndirectDataLength);
-
-        dispatchWorkarounds(commandStream, commandQueue, kernel, false);
-        currentDispatchIndex++;
-    }
-    dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
+    HARDWARE_INTERFACE<GfxFamily> hardwareInterface;
+    hardwareInterface.dispatchWalker(
+        commandQueue,
+        multiDispatchInfo,
+        numEventsInWaitList,
+        eventWaitList,
+        blockedCommandsData,
+        hwTimeStamps,
+        hwPerfCounter,
+        previousTimestampPacket,
+        currentTimestampPacket,
+        preemptionMode,
+        blockQueue,
+        commandType);
 }
 
 template <typename GfxFamily>
@@ -673,98 +477,6 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
     }
 }
 
-template <typename GfxFamily>
-inline void GpgpuWalkerHelper<GfxFamily>::getDefaultDshSpace(
-    const size_t &offsetInterfaceDescriptorTable,
-    CommandQueue &commandQueue,
-    const MultiDispatchInfo &multiDispatchInfo,
-    size_t &totalInterfaceDescriptorTableSize,
-    OCLRT::Kernel *parentKernel,
-    OCLRT::IndirectHeap *dsh,
-    OCLRT::LinearStream *commandStream) {
-
-    size_t numDispatches = multiDispatchInfo.size();
-    totalInterfaceDescriptorTableSize *= numDispatches;
-
-    if (!parentKernel) {
-        dsh->getSpace(totalInterfaceDescriptorTableSize);
-    } else {
-        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
-    }
-}
-
-template <typename GfxFamily>
-inline typename GpgpuWalkerHelper<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *GpgpuWalkerHelper<GfxFamily>::obtainInterfaceDescriptorData(
-    WALKER_HANDLE pCmdData) {
-
-    return nullptr;
-}
-
-template <typename GfxFamily>
-inline void GpgpuWalkerHelper<GfxFamily>::setOffsetCrossThreadData(
-    WALKER_HANDLE pCmdData,
-    size_t &offsetCrossThreadData,
-    uint32_t &interfaceDescriptorIndex) {
-
-    WALKER_TYPE<GfxFamily> *pCmd = static_cast<WALKER_TYPE<GfxFamily> *>(pCmdData);
-    pCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
-    pCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
-}
-
-template <typename GfxFamily>
-inline void GpgpuWalkerHelper<GfxFamily>::dispatchWorkarounds(
-    OCLRT::LinearStream *commandStream,
-    CommandQueue &commandQueue,
-    OCLRT::Kernel &kernel,
-    const bool &enable) {
-
-    if (enable) {
-        PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
-        // Implement enabling special WA DisableLSQCROPERFforOCL if needed
-        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
-    } else {
-        // Implement disabling special WA DisableLSQCROPERFforOCL if needed
-        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
-        PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
-    }
-}
-
-template <typename GfxFamily>
-inline void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingPerfStartCommands(
-    const OCLRT::DispatchInfo &dispatchInfo,
-    const MultiDispatchInfo &multiDispatchInfo,
-    HwTimeStamps *hwTimeStamps,
-    OCLRT::HwPerfCounter *hwPerfCounter,
-    OCLRT::LinearStream *commandStream,
-    CommandQueue &commandQueue) {
-
-    if (&dispatchInfo == &*multiDispatchInfo.begin()) {
-        // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
-        if (hwTimeStamps != nullptr) {
-            GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
-        }
-        if (hwPerfCounter != nullptr) {
-            GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
-        }
-    }
-}
-
-template <typename GfxFamily>
-inline void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingPerfEndCommands(
-    HwTimeStamps *hwTimeStamps,
-    OCLRT::HwPerfCounter *hwPerfCounter,
-    OCLRT::LinearStream *commandStream,
-    CommandQueue &commandQueue) {
-
-    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
-    if (hwTimeStamps != nullptr) {
-        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
-    }
-    if (hwPerfCounter != nullptr) {
-        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
-    }
-}
-
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
     LinearStream *cmdStream,
diff --git a/runtime/command_queue/hardware_interface/hardware_interface.h b/runtime/command_queue/hardware_interface/hardware_interface.h
new file mode 100644
index 0000000000..75dc3274d6
--- /dev/null
+++ b/runtime/command_queue/hardware_interface/hardware_interface.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/command_queue/command_queue.h"
+#include "runtime/built_ins/built_ins.h"
+#include "runtime/context/context.h"
+#include "runtime/event/perf_counter.h"
+#include "runtime/indirect_heap/indirect_heap.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/command_stream/linear_stream.h"
+#include "runtime/event/hw_timestamps.h"
+#include "runtime/command_stream/preemption.h"
+#include "runtime/device_queue/device_queue_hw.h"
+#include "runtime/helpers/dispatch_info.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/task_information.h"
+#include "runtime/helpers/timestamp_packet.h"
+#include "runtime/program/kernel_info.h"
+#include "runtime/utilities/vec.h"
+
+namespace OCLRT {
+
+using WALKER_HANDLE = void *;
+
+// Abstract, per-GfxFamily interface for dispatching GPGPU walker commands.
+// dispatchWalker() is the non-virtual entry point (defined in
+// hardware_interface.inl); the pure virtual members are the individual steps
+// that a concrete interface version must implement.
+template <typename GfxFamily>
+class HardwareInterface {
+  public:
+    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
+
+    // Polymorphic base class: a virtual destructor keeps destruction through
+    // a HardwareInterface pointer well defined.
+    virtual ~HardwareInterface() = default;
+
+    // Entry point: GpgpuWalkerHelper<GfxFamily>::dispatchWalker() delegates
+    // to this function.
+    void dispatchWalker(
+        CommandQueue &commandQueue,
+        const MultiDispatchInfo &multiDispatchInfo,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        KernelOperation **blockedCommandsData,
+        HwTimeStamps *hwTimeStamps,
+        OCLRT::HwPerfCounter *hwPerfCounter,
+        TimestampPacket *previousTimestampPacket,
+        TimestampPacket *currentTimestampPacket,
+        PreemptionMode preemptionMode,
+        bool blockQueue,
+        uint32_t commandType = 0);
+
+    // Customization points implemented by the concrete interface version.
+    virtual void getDefaultDshSpace(
+        const size_t &offsetInterfaceDescriptorTable,
+        CommandQueue &commandQueue,
+        const MultiDispatchInfo &multiDispatchInfo,
+        size_t &totalInterfaceDescriptorTableSize,
+        OCLRT::Kernel *parentKernel,
+        OCLRT::IndirectHeap *dsh,
+        OCLRT::LinearStream *commandStream) = 0;
+
+    virtual INTERFACE_DESCRIPTOR_DATA *obtainInterfaceDescriptorData(
+        WALKER_HANDLE pCmdData) = 0;
+
+    virtual void setOffsetCrossThreadData(
+        WALKER_HANDLE pCmdData,
+        size_t &offsetCrossThreadData,
+        uint32_t &interfaceDescriptorIndex) = 0;
+
+    virtual void dispatchWorkarounds(
+        OCLRT::LinearStream *commandStream,
+        CommandQueue &commandQueue,
+        OCLRT::Kernel &kernel,
+        const bool &enable) = 0;
+
+    virtual void dispatchProfilingPerfStartCommands(
+        const OCLRT::DispatchInfo &dispatchInfo,
+        const MultiDispatchInfo &multiDispatchInfo,
+        HwTimeStamps *hwTimeStamps,
+        OCLRT::HwPerfCounter *hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        CommandQueue &commandQueue) = 0;
+
+    virtual void dispatchProfilingPerfEndCommands(
+        HwTimeStamps *hwTimeStamps,
+        OCLRT::HwPerfCounter *hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        CommandQueue &commandQueue) = 0;
+};
+
+// Interface version that overrides every HardwareInterface step. The
+// overrides are private (class default access), so they are reached through
+// the public virtual declarations in HardwareInterface rather than called on
+// a BaseInterfaceVersion object directly.
+// NOTE(review): definitions are presumably in hardware_interface.inl - confirm.
+template <typename GfxFamily>
+class BaseInterfaceVersion : public HardwareInterface<GfxFamily> {
+    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
+
+    void getDefaultDshSpace(
+        const size_t &offsetInterfaceDescriptorTable,
+        CommandQueue &commandQueue,
+        const MultiDispatchInfo &multiDispatchInfo,
+        size_t &totalInterfaceDescriptorTableSize,
+        OCLRT::Kernel *parentKernel,
+        OCLRT::IndirectHeap *dsh,
+        OCLRT::LinearStream *commandStream) override;
+
+    INTERFACE_DESCRIPTOR_DATA *obtainInterfaceDescriptorData(
+        WALKER_HANDLE pCmdData) override;
+
+    void setOffsetCrossThreadData(
+        WALKER_HANDLE pCmdData,
+        size_t &offsetCrossThreadData,
+        uint32_t &interfaceDescriptorIndex) override;
+
+    void dispatchWorkarounds(
+        OCLRT::LinearStream *commandStream,
+        CommandQueue &commandQueue,
+        OCLRT::Kernel &kernel,
+        const bool &enable) override;
+
+    void dispatchProfilingPerfStartCommands(
+        const OCLRT::DispatchInfo &dispatchInfo,
+        const MultiDispatchInfo &multiDispatchInfo,
+        HwTimeStamps *hwTimeStamps,
+        OCLRT::HwPerfCounter *hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        CommandQueue &commandQueue) override;
+
+    void dispatchProfilingPerfEndCommands(
+        HwTimeStamps *hwTimeStamps,
+        OCLRT::HwPerfCounter *hwPerfCounter,
+        OCLRT::LinearStream *commandStream,
+        CommandQueue &commandQueue) override;
+};
+
+} // namespace OCLRT
diff --git a/runtime/command_queue/hardware_interface/hardware_interface.inl b/runtime/command_queue/hardware_interface/hardware_interface.inl
new file mode 100644
index 0000000000..95201240a0
--- /dev/null
+++ b/runtime/command_queue/hardware_interface/hardware_interface.inl
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/command_queue/hardware_interface/hardware_interface.h"
+
+namespace OCLRT {
+
+template <typename GfxFamily>
+void HardwareInterface<GfxFamily>::dispatchWalker(
+    CommandQueue &commandQueue,
+    const MultiDispatchInfo &multiDispatchInfo,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    KernelOperation **blockedCommandsData, // out: set to a new KernelOperation when blockQueue is true
+    HwTimeStamps *hwTimeStamps, // forwarded to the profiling/perf start and end hooks
+    OCLRT::HwPerfCounter *hwPerfCounter, // forwarded to the profiling/perf start and end hooks
+    TimestampPacket *previousTimestampPacket, // may be null; otherwise a semaphore wait on its ContextEnd address is programmed
+    TimestampPacket *currentTimestampPacket, // may be null; otherwise set up around the last walker only
+    PreemptionMode preemptionMode,
+    bool blockQueue, // true: record into a newly allocated stream and heaps instead of the queue's own
+    uint32_t commandType) {
+    // Programs one walker command per DispatchInfo of multiDispatchInfo, together with the state each walker needs.
+    OCLRT::LinearStream *commandStream = nullptr;
+    OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    auto parentKernel = multiDispatchInfo.peekParentKernel();
+
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        // Generate a local workgroup size for every dispatch that has none (x == 0) and store it back
+        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
+            const auto lws = generateWorkgroupSize(dispatchInfo);
+            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
+        }
+    }
+
+    // blockQueue: allocate a private command stream and heaps; otherwise use the queue's own stream and heaps
+    if (blockQueue) {
+        using KCH = KernelCommandsHelper<GfxFamily>;
+        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize),
+                                         MemoryConstants::pageSize);
+        if (parentKernel) {
+            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
+
+            commandQueue.allocateHeapMemory(
+                IndirectHeap::DYNAMIC_STATE,
+                commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
+                dsh);
+
+            dsh->getSpace(colorCalcSize);
+            ioh = dsh; // with a parent kernel the IOH pointer aliases the DSH
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
+                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
+                                                IndirectHeap::SURFACE_STATE>(*parentKernel) +
+                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
+                                            ssh);
+        } else {
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
+            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
+        }
+
+        using UniqueIH = std::unique_ptr<IndirectHeap>;
+        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
+                                                   UniqueIH(ssh), *commandQueue.getDevice().getMemoryManager());
+        if (parentKernel) {
+            (*blockedCommandsData)->doNotFreeISH = true; // NOTE(review): dsh and ioh alias here; presumably this flag prevents a double release - confirm
+        }
+    } else {
+        commandStream = &commandQueue.getCS(0);
+        if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
+            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
+        }
+        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
+        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
+        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
+    }
+    // With timestamp packet writes enabled: dispatch wait-list semaphores and a wait on the previous packet's ContextEnd
+    if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+        GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(),
+                                                                         numEventsInWaitList, eventWaitList);
+        if (previousTimestampPacket) {
+            auto compareAddress = previousTimestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
+            KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(*commandStream, compareAddress, 1);
+        }
+    }
+
+    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
+
+    uint32_t interfaceDescriptorIndex = 0;
+    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
+
+    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
+    // Hook; BaseInterfaceVersion scales the size by the dispatch count and reserves the space in dsh
+    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
+                       parentKernel, dsh, commandStream);
+
+    // Program media interface descriptor load
+    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
+        *commandStream,
+        offsetInterfaceDescriptorTable,
+        totalInterfaceDescriptorTableSize);
+
+    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
+
+    size_t currentDispatchIndex = 0;
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        auto &kernel = *dispatchInfo.getKernel();
+        // Debug checks: dim is 1..3, and dimensions beyond dim have global size 1 and offset 0
+        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
+        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
+
+        // Determine SIMD size
+        uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
+
+        // If we don't have a required WGS, compute one opportunistically
+        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
+        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
+            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
+        }
+
+        // Get dispatch geometry
+        uint32_t dim = dispatchInfo.getDim();
+        Vec3<size_t> gws = dispatchInfo.getGWS();
+        Vec3<size_t> offset = dispatchInfo.getOffset();
+        Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
+
+        // Local workgroup size; the enqueued size falls back to it when not set (x == 0)
+        Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
+        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
+
+        // Compute number of work groups
+        Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
+                                                                              : generateWorkgroupsNumber(gws, lws);
+        Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
+
+        // Patch our kernel constants
+        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
+        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
+        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
+
+        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
+        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
+        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
+        // localWorkSize is patched only for the main kernel or when localWorkSizeX2 points at Kernel::dummyPatchLocation
+        if ((&kernel == multiDispatchInfo.peekMainKernel()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
+            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
+            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
+            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
+        }
+
+        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
+        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
+        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
+
+        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
+        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
+        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
+        // The total workgroup count is patched for the main kernel only
+        if (&kernel == multiDispatchInfo.peekMainKernel()) {
+            *kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
+            *kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
+            *kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
+        }
+
+        *kernel.workDim = dim;
+
+        // Send our indirect object data
+        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
+
+        dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
+                                           hwPerfCounter, commandStream, commandQueue);
+
+        dispatchWorkarounds(commandStream, commandQueue, kernel, true);
+        // The timestamp packet is set up around the last dispatch only
+        bool setupTimestampPacket = currentTimestampPacket && (currentDispatchIndex == multiDispatchInfo.size() - 1);
+        if (setupTimestampPacket) {
+            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, currentTimestampPacket,
+                                                               TimestampPacket::WriteOperationType::BeforeWalker);
+        }
+
+        // Program the walker.  Invokes execution so all state should already be programmed
+        auto pWalkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream->getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
+        *pWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
+
+        if (setupTimestampPacket) {
+            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket,
+                                                               TimestampPacket::WriteOperationType::AfterWalker);
+        }
+
+        auto idd = obtainInterfaceDescriptorData(pWalkerCmd); // nullptr in BaseInterfaceVersion
+
+        auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
+            *commandStream,
+            *dsh,
+            *ioh,
+            *ssh,
+            kernel,
+            simd,
+            localWorkSizes,
+            offsetInterfaceDescriptorTable,
+            interfaceDescriptorIndex,
+            preemptionMode,
+            idd);
+
+        size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
+        size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
+        size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
+        auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pWalkerCmd, globalOffsets, startWorkGroups,
+                                                                                    numWorkGroups, localWorkSizes, simd);
+
+        DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
+        setOffsetCrossThreadData(pWalkerCmd, offsetCrossThreadData, interfaceDescriptorIndex); // BaseInterfaceVersion post-increments interfaceDescriptorIndex here
+
+        auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
+        DEBUG_BREAK_IF(nullptr == threadPayload);
+
+        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+        auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
+        localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF)); // never less than sizeof(GRF) per thread
+
+        auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
+        DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
+
+        auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
+        auto IndirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
+                                          WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
+        pWalkerCmd->setIndirectDataLength(IndirectDataLength);
+
+        dispatchWorkarounds(commandStream, commandQueue, kernel, false);
+        currentDispatchIndex++;
+    }
+    dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
+}
+
+template <typename GfxFamily>
+inline void BaseInterfaceVersion<GfxFamily>::getDefaultDshSpace(
+    const size_t &offsetInterfaceDescriptorTable,
+    CommandQueue &commandQueue,
+    const MultiDispatchInfo &multiDispatchInfo,
+    size_t &totalInterfaceDescriptorTableSize,
+    OCLRT::Kernel *parentKernel,
+    OCLRT::IndirectHeap *dsh,
+    OCLRT::LinearStream *commandStream) {
+    // Scale the incoming size by the dispatch count (dispatchWalker passes sizeof(INTERFACE_DESCRIPTOR_DATA)).
+    totalInterfaceDescriptorTableSize *= multiDispatchInfo.size();
+    if (parentKernel) {
+        // Parent kernel: advance the heap up to the default device queue's DSH offset.
+        const auto deviceQueueDshOffset = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
+        dsh->getSpace(deviceQueueDshOffset - dsh->getUsed());
+    } else {
+        dsh->getSpace(totalInterfaceDescriptorTableSize);
+    }
+}
+
+template <typename GfxFamily>
+inline typename BaseInterfaceVersion<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *
+BaseInterfaceVersion<GfxFamily>::obtainInterfaceDescriptorData(WALKER_HANDLE pCmdData) {
+    // This interface version has no descriptor to hand out; dispatchWalker forwards the null to sendIndirectState.
+    INTERFACE_DESCRIPTOR_DATA *const noDescriptor = nullptr;
+    return noDescriptor;
+}
+
+template <typename GfxFamily>
+inline void BaseInterfaceVersion<GfxFamily>::setOffsetCrossThreadData(
+    WALKER_HANDLE pCmdData,
+    size_t &offsetCrossThreadData,
+    uint32_t &interfaceDescriptorIndex) {
+    // Cast the handle back to this family's walker type, then program its indirect data start and descriptor offset.
+    auto *walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(pCmdData);
+    walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
+    walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++); // post-increment: the caller's index advances by one
+}
+
+template <typename GfxFamily>
+inline void BaseInterfaceVersion<GfxFamily>::dispatchWorkarounds(
+    OCLRT::LinearStream *commandStream,
+    CommandQueue &commandQueue,
+    OCLRT::Kernel &kernel,
+    const bool &enable) {
+    // The two paths mirror each other: the preemption WA wraps the DisableLSQCROPERFforOCL WA.
+    if (!enable) {
+        // Disable the special WA DisableLSQCROPERFforOCL if needed, then close the preemption WA.
+        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
+        PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
+        return;
+    }
+    // Open the preemption WA, then enable the special WA DisableLSQCROPERFforOCL if needed.
+    PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
+}
+
+template <typename GfxFamily>
+inline void BaseInterfaceVersion<GfxFamily>::dispatchProfilingPerfStartCommands(
+    const OCLRT::DispatchInfo &dispatchInfo,
+    const MultiDispatchInfo &multiDispatchInfo,
+    HwTimeStamps *hwTimeStamps,
+    OCLRT::HwPerfCounter *hwPerfCounter,
+    OCLRT::LinearStream *commandStream,
+    CommandQueue &commandQueue) {
+    // Start commands are emitted only while handling the first dispatch of multiDispatchInfo.
+    if (&dispatchInfo != &*multiDispatchInfo.begin()) {
+        return;
+    }
+    if (hwTimeStamps) { // a non-null hwTimeStamps means profiling is enabled
+        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
+    }
+    if (hwPerfCounter) {
+        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
+    }
+}
+
+template <typename GfxFamily>
+inline void BaseInterfaceVersion<GfxFamily>::dispatchProfilingPerfEndCommands(
+    HwTimeStamps *hwTimeStamps,
+    OCLRT::HwPerfCounter *hwPerfCounter,
+    OCLRT::LinearStream *commandStream,
+    CommandQueue &commandQueue) {
+    const bool profilingEnabled = (nullptr != hwTimeStamps); // hwTimeStamps is only passed when profiling is enabled
+    const bool perfCountersRequested = (nullptr != hwPerfCounter);
+    if (profilingEnabled) {
+        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
+    }
+    if (perfCountersRequested) {
+        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
+    }
+}
+
+} // namespace OCLRT
diff --git a/runtime/gen10/hw_cmds.h b/runtime/gen10/hw_cmds.h
index 53a1ec5997..eec1b3ac7c 100644
--- a/runtime/gen10/hw_cmds.h
+++ b/runtime/gen10/hw_cmds.h
@@ -30,14 +30,20 @@
 #define TILERESOURCE_CHICKENBIT_VECTOR_BITMASK (1UL << 8)
 struct CnlParse;
 namespace OCLRT {
+
+template <class GfxFamily>
+class BaseInterfaceVersion; // forward declaration; named by the HARDWARE_INTERFACE alias in CNLFamily below
+
 struct GEN10 {
 #include "runtime/gen10/hw_cmds_generated_patched.h"
 #include "runtime/gen10/hw_cmds_generated.h"
 };
+
 struct CNLFamily : public GEN10 {
     typedef CnlParse PARSE;
     typedef CNLFamily GfxFamily;
     typedef GPGPU_WALKER WALKER_TYPE;
+    using HARDWARE_INTERFACE = BaseInterfaceVersion<CNLFamily>;
     static const GPGPU_WALKER cmdInitGpgpuWalker;
     static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;
     static const MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad;
diff --git a/runtime/gen8/hw_cmds_base.h b/runtime/gen8/hw_cmds_base.h
index 853a264366..ca78aa5609 100644
--- a/runtime/gen8/hw_cmds_base.h
+++ b/runtime/gen8/hw_cmds_base.h
@@ -30,6 +30,10 @@
 //forward declaration for parsing logic
 struct BdwParse;
 namespace OCLRT {
+
+template <class GfxFamily>
+class BaseInterfaceVersion; // forward declaration; named by the HARDWARE_INTERFACE alias in BDWFamily below
+
 struct GEN8 {
 #include "runtime/gen8/hw_cmds_generated.h"
 #include "runtime/gen8/hw_cmds_generated_patched.h"
@@ -38,6 +42,7 @@ struct BDWFamily : public GEN8 {
     typedef BdwParse PARSE;
     typedef BDWFamily GfxFamily;
     typedef GPGPU_WALKER WALKER_TYPE;
+    using HARDWARE_INTERFACE = BaseInterfaceVersion<BDWFamily>;
     static const GPGPU_WALKER cmdInitGpgpuWalker;
     static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;
     static const MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad;
diff --git a/runtime/gen9/hw_cmds_base.h b/runtime/gen9/hw_cmds_base.h
index 2f36b48223..ee65c57dab 100644
--- a/runtime/gen9/hw_cmds_base.h
+++ b/runtime/gen9/hw_cmds_base.h
@@ -31,6 +31,9 @@ struct SklParse;
 
 namespace OCLRT {
 
+template <class GfxFamily>
+class BaseInterfaceVersion; // forward declaration; named by the HARDWARE_INTERFACE alias in SKLFamily below
+
 struct GEN9 {
 #include "runtime/gen9/hw_cmds_generated_patched.h"
 #include "runtime/gen9/hw_cmds_generated.h"
@@ -40,6 +43,7 @@ struct SKLFamily : public GEN9 {
     typedef SklParse PARSE;
     typedef SKLFamily GfxFamily;
     typedef GPGPU_WALKER WALKER_TYPE;
+    using HARDWARE_INTERFACE = BaseInterfaceVersion<SKLFamily>;
     static const GPGPU_WALKER cmdInitGpgpuWalker;
     static const INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;
     static const MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad;
diff --git a/unit_tests/libult/mock_gfx_family.h b/unit_tests/libult/mock_gfx_family.h
index 5e3bf90782..695bfaf282 100644
--- a/unit_tests/libult/mock_gfx_family.h
+++ b/unit_tests/libult/mock_gfx_family.h
@@ -26,6 +26,9 @@
 
 namespace OCLRT {
 
+template <class GfxFamily>
+class BaseInterfaceVersion; // forward declaration; named by the HARDWARE_INTERFACE alias in GENX below
+
 extern HwHelper *hwHelperFactory[IGFX_MAX_CORE];
 
 struct GENX {
@@ -358,6 +361,7 @@ struct GENX {
         inline void setCompareOperation(COMPARE_OPERATION value) {}
     } MI_SEMAPHORE_WAIT;
 
+    using HARDWARE_INTERFACE = BaseInterfaceVersion<GENX>;
     typedef GPGPU_WALKER WALKER_TYPE;
     static GPGPU_WALKER cmdInitGpgpuWalker;
     static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;