compute-runtime/runtime/command_queue/hardware_interface/hardware_interface.inl

/*
 * Copyright (c) 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#pragma once
#include "runtime/command_queue/hardware_interface/hardware_interface.h"

namespace OCLRT {

template <typename GfxFamily>
void HardwareInterface<GfxFamily>::dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    KernelOperation **blockedCommandsData,
    HwTimeStamps *hwTimeStamps,
    OCLRT::HwPerfCounter *hwPerfCounter,
    TimestampPacket *previousTimestampPacket,
    TimestampPacket *currentTimestampPacket,
    PreemptionMode preemptionMode,
    bool blockQueue,
    uint32_t commandType) {

    OCLRT::LinearStream *commandStream = nullptr;
    OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
    auto parentKernel = multiDispatchInfo.peekParentKernel();

    for (auto &dispatchInfo : multiDispatchInfo) {
        // Compute local workgroup sizes
        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
            const auto lws = generateWorkgroupSize(dispatchInfo);
            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
        }
    }

    // Allocate command stream and indirect heaps
    if (blockQueue) {
        using KCH = KernelCommandsHelper<GfxFamily>;
        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize),
                                         MemoryConstants::pageSize);
        if (parentKernel) {
            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;

            commandQueue.allocateHeapMemory(
                IndirectHeap::DYNAMIC_STATE,
                commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
                dsh);

            dsh->getSpace(colorCalcSize);
            ioh = dsh;
            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
                                                IndirectHeap::SURFACE_STATE>(*parentKernel) +
                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
                                            ssh);
        } else {
            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
        }

        using UniqueIH = std::unique_ptr<IndirectHeap>;
        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
                                                   UniqueIH(ssh), *commandQueue.getDevice().getMemoryManager());
        if (parentKernel) {
            (*blockedCommandsData)->doNotFreeISH = true;
        }
    } else {
        commandStream = &commandQueue.getCS(0);
        if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
        }
        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
    }

    if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
        GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(),
                                                                         numEventsInWaitList, eventWaitList);
        if (previousTimestampPacket) {
            auto compareAddress = previousTimestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
            KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(*commandStream, compareAddress, 1);
        }
    }

    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);

    uint32_t interfaceDescriptorIndex = 0;
    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();

    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);

    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
                       parentKernel, dsh, commandStream);

    // Program media interface descriptor load
    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
        *commandStream,
        offsetInterfaceDescriptorTable,
        totalInterfaceDescriptorTableSize);

    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);

    size_t currentDispatchIndex = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
        auto &kernel = *dispatchInfo.getKernel();

        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));

        // Determine SIMD size
        uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();

        // If we don't have a required WGS, compute one opportunistically
        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
        }

        //Get dispatch geometry
        uint32_t dim = dispatchInfo.getDim();
        Vec3<size_t> gws = dispatchInfo.getGWS();
        Vec3<size_t> offset = dispatchInfo.getOffset();
        Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();

        // Compute local workgroup sizes
        Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;

        // Compute number of work groups
        Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
                                                                              : generateWorkgroupsNumber(gws, lws);
        Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;

        // Patch our kernel constants
        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);

        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);

        if ((&kernel == multiDispatchInfo.peekMainKernel()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
        }

        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);

        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);

        if (&kernel == multiDispatchInfo.peekMainKernel()) {
            *kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
            *kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
            *kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
        }

        *kernel.workDim = dim;

        // Send our indirect object data
        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};

        dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
                                           hwPerfCounter, commandStream, commandQueue);

        dispatchWorkarounds(commandStream, commandQueue, kernel, true);

        bool setupTimestampPacket = currentTimestampPacket && (currentDispatchIndex == multiDispatchInfo.size() - 1);
        if (setupTimestampPacket) {
            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, currentTimestampPacket,
                                                               TimestampPacket::WriteOperationType::BeforeWalker);
        }

        // Program the walker.  Invokes execution so all state should already be programmed
        auto pWalkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream->getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
        *pWalkerCmd = GfxFamily::cmdInitGpgpuWalker;

        if (setupTimestampPacket) {
            GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket,
                                                               TimestampPacket::WriteOperationType::AfterWalker);
        }

        auto idd = obtainInterfaceDescriptorData(pWalkerCmd);

        auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
            *commandStream,
            *dsh,
            *ioh,
            *ssh,
            kernel,
            simd,
            localWorkSizes,
            offsetInterfaceDescriptorTable,
            interfaceDescriptorIndex,
            preemptionMode,
            idd);

        size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
        size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
        size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
        auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pWalkerCmd, globalOffsets, startWorkGroups,
                                                                                    numWorkGroups, localWorkSizes, simd);

        DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
        setOffsetCrossThreadData(pWalkerCmd, offsetCrossThreadData, interfaceDescriptorIndex);

        auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
        DEBUG_BREAK_IF(nullptr == threadPayload);

        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
        auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
        localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));

        auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
        DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group

        auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
        auto IndirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
                                          WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
        pWalkerCmd->setIndirectDataLength(IndirectDataLength);

        dispatchWorkarounds(commandStream, commandQueue, kernel, false);
        currentDispatchIndex++;
    }
    dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
}

template <typename GfxFamily>
inline void BaseInterfaceVersion<GfxFamily>::getDefaultDshSpace(
    const size_t &offsetInterfaceDescriptorTable,
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    size_t &totalInterfaceDescriptorTableSize,
    OCLRT::Kernel *parentKernel,
    OCLRT::IndirectHeap *dsh,
    OCLRT::LinearStream *commandStream) {

    size_t numDispatches = multiDispatchInfo.size();
    totalInterfaceDescriptorTableSize *= numDispatches;

    if (!parentKernel) {
        dsh->getSpace(totalInterfaceDescriptorTableSize);
    } else {
        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
    }
}

template <typename GfxFamily>
inline typename BaseInterfaceVersion<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *
BaseInterfaceVersion<GfxFamily>::obtainInterfaceDescriptorData(
    WALKER_HANDLE pCmdData) {

    return nullptr;
}

template <typename GfxFamily>
inline void BaseInterfaceVersion<GfxFamily>::setOffsetCrossThreadData(
    WALKER_HANDLE pCmdData,
    size_t &offsetCrossThreadData,
    uint32_t &interfaceDescriptorIndex) {

    WALKER_TYPE<GfxFamily> *pCmd = static_cast<WALKER_TYPE<GfxFamily> *>(pCmdData);
    pCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
    pCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
}

template <typename GfxFamily>
inline void BaseInterfaceVersion<GfxFamily>::dispatchWorkarounds(
    OCLRT::LinearStream *commandStream,
    CommandQueue &commandQueue,
    OCLRT::Kernel &kernel,
    const bool &enable) {

    if (enable) {
        PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
        // Implement enabling special WA DisableLSQCROPERFforOCL if needed
        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
    } else {
        // Implement disabling special WA DisableLSQCROPERFforOCL if needed
        GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
        PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
    }
}

template <typename GfxFamily>
inline void BaseInterfaceVersion<GfxFamily>::dispatchProfilingPerfStartCommands(
    const OCLRT::DispatchInfo &dispatchInfo,
    const MultiDispatchInfo &multiDispatchInfo,
    HwTimeStamps *hwTimeStamps,
    OCLRT::HwPerfCounter *hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    CommandQueue &commandQueue) {

    if (&dispatchInfo == &*multiDispatchInfo.begin()) {
        // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
        if (hwTimeStamps != nullptr) {
            GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
        }
        if (hwPerfCounter != nullptr) {
            GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
        }
    }
}

template <typename GfxFamily>
inline void BaseInterfaceVersion<GfxFamily>::dispatchProfilingPerfEndCommands(
    HwTimeStamps *hwTimeStamps,
    OCLRT::HwPerfCounter *hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    CommandQueue &commandQueue) {

    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
    if (hwTimeStamps != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
    }
    if (hwPerfCounter != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
    }
}

} // namespace OCLRT
Move dispatchWalker() to seperate class Change-Id: Idc95df3d67775022f2197c1f973182acb3558a2d Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2018-09-07 23:04:18 +08:00			`/*`
			`* Copyright (c) 2018, Intel Corporation`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included`
			`* in all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS`
			`* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL`
			`* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR`
			`* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,`
			`* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR`
			`* OTHER DEALINGS IN THE SOFTWARE.`
			`*/`

			`#pragma once`
			`#include "runtime/command_queue/hardware_interface/hardware_interface.h"`

			`namespace OCLRT {`

			`template <typename GfxFamily>`
			`void HardwareInterface<GfxFamily>::dispatchWalker(`
			`CommandQueue &commandQueue,`
			`const MultiDispatchInfo &multiDispatchInfo,`
			`cl_uint numEventsInWaitList,`
			`const cl_event *eventWaitList,`
			`KernelOperation **blockedCommandsData,`
			`HwTimeStamps *hwTimeStamps,`
			`OCLRT::HwPerfCounter *hwPerfCounter,`
			`TimestampPacket *previousTimestampPacket,`
			`TimestampPacket *currentTimestampPacket,`
			`PreemptionMode preemptionMode,`
			`bool blockQueue,`
			`uint32_t commandType) {`

			`OCLRT::LinearStream *commandStream = nullptr;`
			`OCLRT::IndirectHeap dsh = nullptr, ioh = nullptr, *ssh = nullptr;`
			`auto parentKernel = multiDispatchInfo.peekParentKernel();`

			`for (auto &dispatchInfo : multiDispatchInfo) {`
			`// Compute local workgroup sizes`
			`if (dispatchInfo.getLocalWorkgroupSize().x == 0) {`
			`const auto lws = generateWorkgroupSize(dispatchInfo);`
			`const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);`
			`}`
			`}`

			`// Allocate command stream and indirect heaps`
			`if (blockQueue) {`
			`using KCH = KernelCommandsHelper<GfxFamily>;`
			`commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize),`
			`MemoryConstants::pageSize);`
			`if (parentKernel) {`
			`uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;`

			`commandQueue.allocateHeapMemory(`
			`IndirectHeap::DYNAMIC_STATE,`
			`commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),`
			`dsh);`

			`dsh->getSpace(colorCalcSize);`
			`ioh = dsh;`
			`commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,`
			`KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<`
			`IndirectHeap::SURFACE_STATE>(*parentKernel) +`
			`KCH::getTotalSizeRequiredSSH(multiDispatchInfo),`
			`ssh);`
			`} else {`
			`commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);`
			`commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);`
			`commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);`
			`}`

			`using UniqueIH = std::unique_ptr<IndirectHeap>;`
			`*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),`
			`UniqueIH(ssh), *commandQueue.getDevice().getMemoryManager());`
			`if (parentKernel) {`
			`(*blockedCommandsData)->doNotFreeISH = true;`
			`}`
			`} else {`
			`commandStream = &commandQueue.getCS(0);`
			`if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {`
			`commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);`
			`}`
			`dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);`
			`ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);`
			`ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);`
			`}`

			`if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {`
			`GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(),`
			`numEventsInWaitList, eventWaitList);`
			`if (previousTimestampPacket) {`
			`auto compareAddress = previousTimestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);`
			`KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(*commandStream, compareAddress, 1);`
			`}`
			`}`

			`dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);`

			`uint32_t interfaceDescriptorIndex = 0;`
			`const size_t offsetInterfaceDescriptorTable = dsh->getUsed();`

			`size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);`

			`getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,`
			`parentKernel, dsh, commandStream);`

			`// Program media interface descriptor load`
			`KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(`
			`*commandStream,`
			`offsetInterfaceDescriptorTable,`
			`totalInterfaceDescriptorTableSize);`

			`DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);`

			`size_t currentDispatchIndex = 0;`
			`for (auto &dispatchInfo : multiDispatchInfo) {`
			`auto &kernel = *dispatchInfo.getKernel();`

			`DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));`
			`DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 \|\| dispatchInfo.getDim() == 3));`
			`DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 \|\| dispatchInfo.getDim() >= 2));`
			`DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 \|\| dispatchInfo.getDim() == 3));`
			`DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 \|\| dispatchInfo.getDim() >= 2));`

			`// Determine SIMD size`
			`uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();`

			`// If we don't have a required WGS, compute one opportunistically`
			`auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);`
			`if (commandType == CL_COMMAND_NDRANGE_KERNEL) {`
			`provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);`
			`}`

			`//Get dispatch geometry`
			`uint32_t dim = dispatchInfo.getDim();`
			`Vec3<size_t> gws = dispatchInfo.getGWS();`
			`Vec3<size_t> offset = dispatchInfo.getOffset();`
			`Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();`

			`// Compute local workgroup sizes`
			`Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();`
			`Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;`

			`// Compute number of work groups`
			`Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()`
			`: generateWorkgroupsNumber(gws, lws);`
			`Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;`

			`// Patch our kernel constants`
			`*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);`
			`*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);`
			`*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);`

			`*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);`
			`*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);`
			`*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);`

			`if ((&kernel == multiDispatchInfo.peekMainKernel()) \|\| (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {`
			`*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);`
			`*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);`
			`*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);`
			`}`

			`*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);`
			`*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);`
			`*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);`

			`*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);`
			`*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);`
			`*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);`

			`if (&kernel == multiDispatchInfo.peekMainKernel()) {`
			`*kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);`
			`*kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);`
			`*kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);`
			`}`

			`*kernel.workDim = dim;`

			`// Send our indirect object data`
			`size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};`

			`dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,`
			`hwPerfCounter, commandStream, commandQueue);`

			`dispatchWorkarounds(commandStream, commandQueue, kernel, true);`

			`bool setupTimestampPacket = currentTimestampPacket && (currentDispatchIndex == multiDispatchInfo.size() - 1);`
			`if (setupTimestampPacket) {`
			`GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, currentTimestampPacket,`
			`TimestampPacket::WriteOperationType::BeforeWalker);`
			`}`

			`// Program the walker. Invokes execution so all state should already be programmed`
			`auto pWalkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream->getSpace(sizeof(WALKER_TYPE<GfxFamily>)));`
			`*pWalkerCmd = GfxFamily::cmdInitGpgpuWalker;`

			`if (setupTimestampPacket) {`
			`GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket,`
			`TimestampPacket::WriteOperationType::AfterWalker);`
			`}`

			`auto idd = obtainInterfaceDescriptorData(pWalkerCmd);`

			`auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(`
			`*commandStream,`
			`*dsh,`
			`*ioh,`
			`*ssh,`
			`kernel,`
			`simd,`
			`localWorkSizes,`
			`offsetInterfaceDescriptorTable,`
			`interfaceDescriptorIndex,`
			`preemptionMode,`
			`idd);`

			`size_t globalOffsets[3] = {offset.x, offset.y, offset.z};`
			`size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};`
			`size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};`
			`auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pWalkerCmd, globalOffsets, startWorkGroups,`
			`numWorkGroups, localWorkSizes, simd);`

			`DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);`
			`setOffsetCrossThreadData(pWalkerCmd, offsetCrossThreadData, interfaceDescriptorIndex);`

			`auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;`
			`DEBUG_BREAK_IF(nullptr == threadPayload);`

			`auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);`
			`auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);`
			`localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));`

			`auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;`
			`DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group`

			`auto sizeCrossThreadData = kernel.getCrossThreadDataSize();`
			`auto IndirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),`
			`WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);`
			`pWalkerCmd->setIndirectDataLength(IndirectDataLength);`

			`dispatchWorkarounds(commandStream, commandQueue, kernel, false);`
			`currentDispatchIndex++;`
			`}`
			`dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);`
			`}`

			`template <typename GfxFamily>`
			`inline void BaseInterfaceVersion<GfxFamily>::getDefaultDshSpace(`
			`const size_t &offsetInterfaceDescriptorTable,`
			`CommandQueue &commandQueue,`
			`const MultiDispatchInfo &multiDispatchInfo,`
			`size_t &totalInterfaceDescriptorTableSize,`
			`OCLRT::Kernel *parentKernel,`
			`OCLRT::IndirectHeap *dsh,`
			`OCLRT::LinearStream *commandStream) {`

			`size_t numDispatches = multiDispatchInfo.size();`
			`totalInterfaceDescriptorTableSize *= numDispatches;`

			`if (!parentKernel) {`
			`dsh->getSpace(totalInterfaceDescriptorTableSize);`
			`} else {`
			`dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());`
			`}`
			`}`

			`template <typename GfxFamily>`
			`inline typename BaseInterfaceVersion<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *`
			`BaseInterfaceVersion<GfxFamily>::obtainInterfaceDescriptorData(`
			`WALKER_HANDLE pCmdData) {`

			`return nullptr;`
			`}`

			`template <typename GfxFamily>`
			`inline void BaseInterfaceVersion<GfxFamily>::setOffsetCrossThreadData(`
			`WALKER_HANDLE pCmdData,`
			`size_t &offsetCrossThreadData,`
			`uint32_t &interfaceDescriptorIndex) {`

			`WALKER_TYPE<GfxFamily> pCmd = static_cast<WALKER_TYPE<GfxFamily> >(pCmdData);`
			`pCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));`
			`pCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);`
			`}`

			`template <typename GfxFamily>`
			`inline void BaseInterfaceVersion<GfxFamily>::dispatchWorkarounds(`
			`OCLRT::LinearStream *commandStream,`
			`CommandQueue &commandQueue,`
			`OCLRT::Kernel &kernel,`
			`const bool &enable) {`

			`if (enable) {`
			`PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());`
			`// Implement enabling special WA DisableLSQCROPERFforOCL if needed`
			`GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);`
			`} else {`
			`// Implement disabling special WA DisableLSQCROPERFforOCL if needed`
			`GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);`
			`PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());`
			`}`
			`}`

			`template <typename GfxFamily>`
			`inline void BaseInterfaceVersion<GfxFamily>::dispatchProfilingPerfStartCommands(`
			`const OCLRT::DispatchInfo &dispatchInfo,`
			`const MultiDispatchInfo &multiDispatchInfo,`
			`HwTimeStamps *hwTimeStamps,`
			`OCLRT::HwPerfCounter *hwPerfCounter,`
			`OCLRT::LinearStream *commandStream,`
			`CommandQueue &commandQueue) {`

			`if (&dispatchInfo == &*multiDispatchInfo.begin()) {`
			`// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled`
			`if (hwTimeStamps != nullptr) {`
			`GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);`
			`}`
			`if (hwPerfCounter != nullptr) {`
			`GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);`
			`}`
			`}`
			`}`

			`template <typename GfxFamily>`
			`inline void BaseInterfaceVersion<GfxFamily>::dispatchProfilingPerfEndCommands(`
			`HwTimeStamps *hwTimeStamps,`
			`OCLRT::HwPerfCounter *hwPerfCounter,`
			`OCLRT::LinearStream *commandStream,`
			`CommandQueue &commandQueue) {`

			`// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled`
			`if (hwTimeStamps != nullptr) {`
			`GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);`
			`}`
			`if (hwPerfCounter != nullptr) {`
			`GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);`
			`}`
			`}`

			`} // namespace OCLRT`