Extract functions to device_queue_hw_base.inl

Change-Id: I91216453effadf7290b6364bfd442704add97566 Related-To: NEO-3016 Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
2026-01-08 22:12:59 +08:00 · 2019-05-13 13:35:14 +02:00
parent 2e6e791a1c
commit e67879ffca
8 changed files with 238 additions and 221 deletions
--- a/runtime/device_queue/CMakeLists.txt
+++ b/runtime/device_queue/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -9,8 +9,10 @@ set(RUNTIME_SRCS_DEVICE_QUEUE
  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue.h
  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw.inl
+  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw_base.inl
+  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw_bdw_plus.inl
  ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw_profiling.inl
 )
 target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_DEVICE_QUEUE})
 set_property(GLOBAL PROPERTY RUNTIME_SRCS_DEVICE_QUEUE ${RUNTIME_SRCS_DEVICE_QUEUE})
+add_subdirectories()
--- a/runtime/device_queue/device_queue_hw_base.inl
+++ b/runtime/device_queue/device_queue_hw_base.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2019 Intel Corporation
+ * Copyright (C) 2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -88,21 +88,6 @@ void DeviceQueueHw<GfxFamily>::resetDeviceQueue() {
    resetDSH();
 }

-template <typename GfxFamily>
-size_t DeviceQueueHw<GfxFamily>::getMinimumSlbSize() {
-    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
-    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
-    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
-
-    return sizeof(MEDIA_STATE_FLUSH) +
-           sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) +
-           sizeof(PIPE_CONTROL) +
-           sizeof(GPGPU_WALKER) +
-           sizeof(MEDIA_STATE_FLUSH) +
-           sizeof(PIPE_CONTROL) +
-           DeviceQueueHw<GfxFamily>::getCSPrefetchSize();
-}
-
 template <typename GfxFamily>
 void DeviceQueueHw<GfxFamily>::initPipeControl(PIPE_CONTROL *pc) {
    *pc = GfxFamily::cmdInitPipeControl;
@@ -113,95 +98,6 @@ void DeviceQueueHw<GfxFamily>::initPipeControl(PIPE_CONTROL *pc) {
    pc->setCommandStreamerStallEnable(true);
 }

-template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::buildSlbDummyCommands() {
-    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
-    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
-    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
-
-    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
-    auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes;
-    size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize();
-    size_t numEnqueues = numberOfDeviceEnqueues;
-
-    // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time
-    slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize());
-
-    if (slbEndOffset >= 0) {
-        DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0);
-        //We always overwrite at most one enqueue space with BB_START command pointing to cleanup section
-        //if SLBENDoffset is the at the end then BB_START added after scheduler did not corrupt anything so no need to regenerate
-        numEnqueues = (slbEndOffset == static_cast<int>(commandsSize)) ? 0 : 1;
-        slbCS.getSpace(slbEndOffset);
-    }
-
-    for (size_t i = 0; i < numEnqueues; i++) {
-        auto mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>();
-        *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush;
-
-        addArbCheckCmdWa();
-
-        addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder);
-
-        auto mediaIdLoad = slbCS.getSpaceForCmd<MEDIA_INTERFACE_DESCRIPTOR_LOAD>();
-        *mediaIdLoad = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
-        mediaIdLoad->setInterfaceDescriptorTotalLength(2048);
-
-        auto dataStartAddress = colorCalcStateSize;
-
-        mediaIdLoad->setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex);
-
-        addLriCmdWa(true);
-
-        if (isProfilingEnabled()) {
-            addPipeControlCmdWa();
-            auto pipeControl = slbCS.getSpaceForCmd<PIPE_CONTROL>();
-            initPipeControl(pipeControl);
-
-        } else {
-            auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL));
-            memset(noop, 0x0, sizeof(PIPE_CONTROL));
-            addPipeControlCmdWa(true);
-        }
-
-        auto gpgpuWalker = slbCS.getSpaceForCmd<GPGPU_WALKER>();
-        *gpgpuWalker = GfxFamily::cmdInitGpgpuWalker;
-        gpgpuWalker->setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16);
-        gpgpuWalker->setThreadGroupIdXDimension(1);
-        gpgpuWalker->setThreadGroupIdYDimension(1);
-        gpgpuWalker->setThreadGroupIdZDimension(1);
-        gpgpuWalker->setRightExecutionMask(0xFFFFFFFF);
-        gpgpuWalker->setBottomExecutionMask(0xFFFFFFFF);
-
-        mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>();
-        *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush;
-
-        addArbCheckCmdWa();
-
-        addPipeControlCmdWa();
-
-        auto pipeControl2 = slbCS.getSpaceForCmd<PIPE_CONTROL>();
-        initPipeControl(pipeControl2);
-
-        addLriCmdWa(false);
-
-        auto prefetch = slbCS.getSpace(getCSPrefetchSize());
-        memset(prefetch, 0x0, getCSPrefetchSize());
-    }
-
-    // always the same BBStart position (after 128 enqueues)
-    auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed();
-    slbCS.getSpace(bbStartOffset);
-
-    auto bbStart = slbCS.getSpaceForCmd<MI_BATCH_BUFFER_START>();
-    *bbStart = GfxFamily::cmdInitBatchBufferStart;
-    auto slbPtr = reinterpret_cast<uintptr_t>(slbBuffer->getUnderlyingBuffer());
-    bbStart->setBatchBufferStartAddressGraphicsaddress472(slbPtr);
-
-    igilCmdQueue->m_controls.m_CleanupSectionSize = 0;
-    igilCmdQueue->m_controls.m_CleanupSectionAddress = 0;
-}
-
 template <typename GfxFamily>
 void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint32_t taskCount) {
    // CleanUp Section
@@ -271,88 +167,6 @@ IndirectHeap *DeviceQueueHw<GfxFamily>::getIndirectHeap(IndirectHeap::Type type)
    return heaps[type];
 }

-template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
-    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
-    void *pDSH = dynamicStateHeap.getCpuBase();
-
-    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
-    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
-    igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2);
-
-    // Parent's dsh is located after ColorCalcState and 2 ID tables
-    igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
-    igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize();
-
-    igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart;
-    igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh;
-
-    uint32_t blockIndex = parentIDCount;
-
-    pDSH = ptrOffset(pDSH, colorCalcStateSize);
-
-    INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast<INTERFACE_DESCRIPTOR_DATA *>(pDSH);
-
-    BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager();
-    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
-
-    uint32_t maxBindingTableCount = 0;
-    uint32_t totalBlockSSHSize = 0;
-
-    igilCmdQueue->m_controls.m_StartBlockID = blockIndex;
-
-    for (uint32_t i = 0; i < blockCount; i++) {
-        const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
-
-        auto blockAllocation = pBlockInfo->getGraphicsAllocation();
-        DEBUG_BREAK_IF(!blockAllocation);
-
-        auto gpuAddress = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;
-
-        auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count;
-        maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount);
-
-        totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
-
-        auto btOffset = KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo);
-
-        parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset));
-
-        // Determine SIMD size
-        uint32_t simd = pBlockInfo->getMaxSimdSize();
-        DEBUG_BREAK_IF(pBlockInfo->patchInfo.interfaceDescriptorData == nullptr);
-
-        uint32_t idOffset = pBlockInfo->patchInfo.interfaceDescriptorData->Offset;
-        const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast<const INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset));
-
-        pIDDestination[blockIndex + i] = *pBlockID;
-        pIDDestination[blockIndex + i].setKernelStartPointerHigh(gpuAddress >> 32);
-        pIDDestination[blockIndex + i].setKernelStartPointer((uint32_t)gpuAddress);
-        pIDDestination[blockIndex + i].setBarrierEnable(pBlockInfo->patchInfo.executionEnvironment->HasBarriers > 0);
-        pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
-
-        // Set offset to sampler states, block's DHSOffset is added by scheduler
-        pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast<uint32_t>(pBlockInfo->getBorderColorStateSize()));
-
-        auto threadPayload = pBlockInfo->patchInfo.threadPayload;
-        DEBUG_BREAK_IF(nullptr == threadPayload);
-
-        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
-        auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
-
-        auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
-
-        // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group
-        // when sizeCrossThreadData != 0
-        numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
-        pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
-    }
-
-    igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
-    igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
-    igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset;
-}
-
 template <typename GfxFamily>
 size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &scheduler) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
@@ -393,34 +207,6 @@ void DeviceQueueHw<GfxFamily>::addLriCmd(bool setArbCheck) {
        lri->setDataDword(0x0);
 }

-template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::addMediaStateClearCmds() {
-    typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE;
-
-    addPipeControlCmdWa();
-
-    auto pipeControl = slbCS.getSpaceForCmd<PIPE_CONTROL>();
-    *pipeControl = GfxFamily::cmdInitPipeControl;
-    pipeControl->setGenericMediaStateClear(true);
-    pipeControl->setCommandStreamerStallEnable(true);
-
-    addDcFlushToPipeControlWa(pipeControl);
-
-    PreambleHelper<GfxFamily>::programVFEState(&slbCS, device->getHardwareInfo(), 0, 0);
-}
-
-template <typename GfxFamily>
-size_t DeviceQueueHw<GfxFamily>::getMediaStateClearCmdsSize() {
-    using MEDIA_VFE_STATE = typename GfxFamily::MEDIA_VFE_STATE;
-    // PC with GenreicMediaStateClear + WA PC
-    size_t size = 2 * sizeof(PIPE_CONTROL);
-
-    // VFE state cmds
-    size += sizeof(PIPE_CONTROL);
-    size += sizeof(MEDIA_VFE_STATE);
-    return size;
-}
-
 template <typename GfxFamily>
 size_t DeviceQueueHw<GfxFamily>::getExecutionModelCleanupSectionSize() {
    size_t totalSize = 0;
--- a/runtime/device_queue/device_queue_hw_bdw_plus.inl
+++ b/runtime/device_queue/device_queue_hw_bdw_plus.inl
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "runtime/device_queue/device_queue_hw_base.inl"
+
+namespace NEO {
+
+template <typename GfxFamily>
+size_t DeviceQueueHw<GfxFamily>::getMinimumSlbSize() {
+    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
+
+    return sizeof(MEDIA_STATE_FLUSH) +
+           sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) +
+           sizeof(PIPE_CONTROL) +
+           sizeof(GPGPU_WALKER) +
+           sizeof(MEDIA_STATE_FLUSH) +
+           sizeof(PIPE_CONTROL) +
+           DeviceQueueHw<GfxFamily>::getCSPrefetchSize();
+}
+
+template <typename GfxFamily>
+void DeviceQueueHw<GfxFamily>::buildSlbDummyCommands() {
+    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
+    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
+    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
+
+    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
+    auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes;
+    size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize();
+    size_t numEnqueues = numberOfDeviceEnqueues;
+
+    // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time
+    slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize());
+
+    if (slbEndOffset >= 0) {
+        DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0);
+        // We always overwrite at most one enqueue space with a BB_START command pointing to the cleanup section.
+        // If SLBENDoffset is at the end, then the BB_START added after the scheduler did not corrupt anything, so there is no need to regenerate.
+        numEnqueues = (slbEndOffset == static_cast<int>(commandsSize)) ? 0 : 1;
+        slbCS.getSpace(slbEndOffset);
+    }
+
+    for (size_t i = 0; i < numEnqueues; i++) {
+        auto mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>();
+        *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush;
+
+        addArbCheckCmdWa();
+
+        addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder);
+
+        auto mediaIdLoad = slbCS.getSpaceForCmd<MEDIA_INTERFACE_DESCRIPTOR_LOAD>();
+        *mediaIdLoad = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
+        mediaIdLoad->setInterfaceDescriptorTotalLength(2048);
+
+        auto dataStartAddress = colorCalcStateSize;
+
+        mediaIdLoad->setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex);
+
+        addLriCmdWa(true);
+
+        if (isProfilingEnabled()) {
+            addPipeControlCmdWa();
+            auto pipeControl = slbCS.getSpaceForCmd<PIPE_CONTROL>();
+            initPipeControl(pipeControl);
+
+        } else {
+            auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL));
+            memset(noop, 0x0, sizeof(PIPE_CONTROL));
+            addPipeControlCmdWa(true);
+        }
+
+        auto gpgpuWalker = slbCS.getSpaceForCmd<GPGPU_WALKER>();
+        *gpgpuWalker = GfxFamily::cmdInitGpgpuWalker;
+        gpgpuWalker->setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16);
+        gpgpuWalker->setThreadGroupIdXDimension(1);
+        gpgpuWalker->setThreadGroupIdYDimension(1);
+        gpgpuWalker->setThreadGroupIdZDimension(1);
+        gpgpuWalker->setRightExecutionMask(0xFFFFFFFF);
+        gpgpuWalker->setBottomExecutionMask(0xFFFFFFFF);
+
+        mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>();
+        *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush;
+
+        addArbCheckCmdWa();
+
+        addPipeControlCmdWa();
+
+        auto pipeControl2 = slbCS.getSpaceForCmd<PIPE_CONTROL>();
+        initPipeControl(pipeControl2);
+
+        addLriCmdWa(false);
+
+        auto prefetch = slbCS.getSpace(getCSPrefetchSize());
+        memset(prefetch, 0x0, getCSPrefetchSize());
+    }
+
+    // always the same BBStart position (after 128 enqueues)
+    auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed();
+    slbCS.getSpace(bbStartOffset);
+
+    auto bbStart = slbCS.getSpaceForCmd<MI_BATCH_BUFFER_START>();
+    *bbStart = GfxFamily::cmdInitBatchBufferStart;
+    auto slbPtr = reinterpret_cast<uintptr_t>(slbBuffer->getUnderlyingBuffer());
+    bbStart->setBatchBufferStartAddressGraphicsaddress472(slbPtr);
+
+    igilCmdQueue->m_controls.m_CleanupSectionSize = 0;
+    igilCmdQueue->m_controls.m_CleanupSectionAddress = 0;
+}
+
+template <typename GfxFamily>
+void DeviceQueueHw<GfxFamily>::addMediaStateClearCmds() {
+    typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE;
+
+    addPipeControlCmdWa();
+
+    auto pipeControl = slbCS.getSpaceForCmd<PIPE_CONTROL>();
+    *pipeControl = GfxFamily::cmdInitPipeControl;
+    pipeControl->setGenericMediaStateClear(true);
+    pipeControl->setCommandStreamerStallEnable(true);
+
+    addDcFlushToPipeControlWa(pipeControl);
+
+    PreambleHelper<GfxFamily>::programVFEState(&slbCS, device->getHardwareInfo(), 0, 0);
+}
+
+template <typename GfxFamily>
+size_t DeviceQueueHw<GfxFamily>::getMediaStateClearCmdsSize() {
+    using MEDIA_VFE_STATE = typename GfxFamily::MEDIA_VFE_STATE;
+    // PC with GenericMediaStateClear + WA PC
+    size_t size = 2 * sizeof(PIPE_CONTROL);
+
+    // VFE state cmds
+    size += sizeof(PIPE_CONTROL);
+    size += sizeof(MEDIA_VFE_STATE);
+    return size;
+}
+
+template <typename GfxFamily>
+void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
+    void *pDSH = dynamicStateHeap.getCpuBase();
+
+    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
+    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
+    igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2);
+
+    // Parent's dsh is located after ColorCalcState and 2 ID tables
+    igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
+    igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize();
+
+    igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart;
+    igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh;
+
+    uint32_t blockIndex = parentIDCount;
+
+    pDSH = ptrOffset(pDSH, colorCalcStateSize);
+
+    INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast<INTERFACE_DESCRIPTOR_DATA *>(pDSH);
+
+    BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager();
+    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
+
+    uint32_t maxBindingTableCount = 0;
+    uint32_t totalBlockSSHSize = 0;
+
+    igilCmdQueue->m_controls.m_StartBlockID = blockIndex;
+
+    for (uint32_t i = 0; i < blockCount; i++) {
+        const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
+
+        auto blockAllocation = pBlockInfo->getGraphicsAllocation();
+        DEBUG_BREAK_IF(!blockAllocation);
+
+        auto gpuAddress = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;
+
+        auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count;
+        maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount);
+
+        totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
+
+        auto btOffset = KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo);
+
+        parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset));
+
+        // Determine SIMD size
+        uint32_t simd = pBlockInfo->getMaxSimdSize();
+        DEBUG_BREAK_IF(pBlockInfo->patchInfo.interfaceDescriptorData == nullptr);
+
+        uint32_t idOffset = pBlockInfo->patchInfo.interfaceDescriptorData->Offset;
+        const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast<const INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset));
+
+        pIDDestination[blockIndex + i] = *pBlockID;
+        pIDDestination[blockIndex + i].setKernelStartPointerHigh(gpuAddress >> 32);
+        pIDDestination[blockIndex + i].setKernelStartPointer((uint32_t)gpuAddress);
+        pIDDestination[blockIndex + i].setBarrierEnable(pBlockInfo->patchInfo.executionEnvironment->HasBarriers > 0);
+        pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
+
+        // Set offset to sampler states, block's DSH offset is added by scheduler
+        pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast<uint32_t>(pBlockInfo->getBorderColorStateSize()));
+
+        auto threadPayload = pBlockInfo->patchInfo.threadPayload;
+        DEBUG_BREAK_IF(nullptr == threadPayload);
+
+        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+        auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
+
+        auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
+
+        // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group
+        // when sizeCrossThreadData != 0
+        numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
+        pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
+    }
+
+    igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
+    igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
+    igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset;
+}
+
+} // namespace NEO
--- a/runtime/device_queue/device_queue_hw_profiling.inl
+++ b/runtime/device_queue/device_queue_hw_profiling.inl
@@ -5,6 +5,9 @@
 *
 */

+#include "runtime/command_queue/gpgpu_walker.h"
+#include "runtime/device_queue/device_queue_hw.h"
+
 namespace NEO {

 template <typename GfxFamily>
--- a/runtime/gen10/device_queue_gen10.cpp
+++ b/runtime/gen10/device_queue_gen10.cpp
@@ -6,7 +6,7 @@
 */

 #include "runtime/device_queue/device_queue_hw.h"
-#include "runtime/device_queue/device_queue_hw.inl"
+#include "runtime/device_queue/device_queue_hw_bdw_plus.inl"
 #include "runtime/device_queue/device_queue_hw_profiling.inl"
 #include "runtime/gen10/hw_cmds.h"

--- a/runtime/gen11/device_queue_gen11.cpp
+++ b/runtime/gen11/device_queue_gen11.cpp
@@ -6,7 +6,7 @@
 */

 #include "runtime/device_queue/device_queue_hw.h"
-#include "runtime/device_queue/device_queue_hw.inl"
+#include "runtime/device_queue/device_queue_hw_bdw_plus.inl"
 #include "runtime/device_queue/device_queue_hw_profiling.inl"
 #include "runtime/gen11/device_enqueue.h"
 #include "runtime/gen11/hw_cmds.h"
--- a/runtime/gen8/device_queue_gen8.cpp
+++ b/runtime/gen8/device_queue_gen8.cpp
@@ -6,7 +6,7 @@
 */

 #include "runtime/device_queue/device_queue_hw.h"
-#include "runtime/device_queue/device_queue_hw.inl"
+#include "runtime/device_queue/device_queue_hw_bdw_plus.inl"
 #include "runtime/gen8/hw_cmds.h"

 namespace NEO {
--- a/runtime/gen9/device_queue_gen9.cpp
+++ b/runtime/gen9/device_queue_gen9.cpp
@@ -6,7 +6,7 @@
 */

 #include "runtime/device_queue/device_queue_hw.h"
-#include "runtime/device_queue/device_queue_hw.inl"
+#include "runtime/device_queue/device_queue_hw_bdw_plus.inl"
 #include "runtime/device_queue/device_queue_hw_profiling.inl"
 #include "runtime/gen9/hw_cmds.h"