compute-runtime/runtime/command_queue/gpgpu_walker.h

/*
 * Copyright (C) 2018 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "runtime/built_ins/built_ins.h"
#include "runtime/context/context.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_stream/linear_stream.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/hw_timestamps.h"
#include "runtime/event/perf_counter.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/timestamp_packet.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/kernel/kernel.h"
#include "runtime/program/kernel_info.h"
#include "runtime/utilities/vec.h"

namespace OCLRT {

// Shorthand for the walker command type published by a hardware family, i.e.
// WALKER_TYPE<GfxFamily> is GfxFamily::WALKER_TYPE.
template <typename GfxFamily>
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;

// Hardware-programming constants shared by the GPGPU walker helpers.
// NOTE(review): the names suggest register offsets, bit masks and ALU
// encodings; the values are kept exactly as they were and should be checked
// against the hardware documentation before any of them is modified.

// Presumably the number of ALU instructions emitted for one read-modify-write
// sequence (see GpgpuWalkerHelper::addAluReadModifyWriteRegister) - TODO confirm.
constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;

// Presumably a register offset (L3SQC_REG4) and the bit within it that is
// toggled by GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL - TODO confirm.
constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
constexpr int32_t L3SQC_REG4 = 0xB118;

// Cookie values: all bits set "before", all bits clear "after".
// The constants are signed, so the all-ones pattern is produced with an
// explicit cast; the resulting value (-1) is the same one the previous implicit
// conversion from the unsigned literal yielded.
constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = static_cast<int32_t>(0xFFFFFFFFu);
constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;

// Presumably offsets of two general purpose registers - TODO confirm.
constexpr int32_t CS_GPR_R0 = 0x2600;
constexpr int32_t CS_GPR_R1 = 0x2608;

// ALU opcode encodings.
constexpr int32_t ALU_OPCODE_LOAD = 0x080;
constexpr int32_t ALU_OPCODE_STORE = 0x180;
constexpr int32_t ALU_OPCODE_OR = 0x103;
constexpr int32_t ALU_OPCODE_AND = 0x102;

// ALU operand (register) encodings.
constexpr int32_t ALU_REGISTER_R_0 = 0x0;
constexpr int32_t ALU_REGISTER_R_1 = 0x1;
constexpr int32_t ALU_REGISTER_R_SRCA = 0x20;
constexpr int32_t ALU_REGISTER_R_SRCB = 0x21;
constexpr int32_t ALU_REGISTER_R_ACCU = 0x31;

// Presumably the offset of the low part of a thread-time register - TODO confirm.
constexpr uint32_t GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;

// Work-group size / work-group count helpers.
// Only the declarations live in this header; the definitions are elsewhere, so
// the notes below describe the signatures and hedge anything inferred from the
// names.

// Takes a maximum work-group size, the work-item counts and a SIMD size;
// workGroupSize is the only non-const array, so it is presumably the output
// (TODO confirm against the definition).
void computeWorkgroupSize1D(
    uint32_t maxWorkGroupSize,
    size_t workGroupSize[3],
    const size_t workItems[3],
    size_t simdSize);

// Variant driven by a WorkSizeInfo (passed by value) and an explicit dimension
// count; workGroupSize is presumably the output (TODO confirm).
void computeWorkgroupSizeND(
    WorkSizeInfo wsInfo,
    size_t workGroupSize[3],
    const size_t workItems[3],
    const uint32_t workDim);

// Same parameter list as computeWorkgroupSize1D; presumably the 2D counterpart
// (TODO confirm).
void computeWorkgroupSize2D(
    uint32_t maxWorkGroupSize,
    size_t workGroupSize[3],
    const size_t workItems[3],
    size_t simdSize);

// Like the 1D/2D variants but additionally receives the dimension count.
// NOTE(review): "Squared" suggests a preference for square-ish groups - confirm.
void computeWorkgroupSizeSquared(
    uint32_t maxWorkGroupSize,
    size_t workGroupSize[3],
    const size_t workItems[3],
    size_t simdSize,
    const uint32_t workDim);

// Returns a three-component size derived from a DispatchInfo.
Vec3<size_t> computeWorkgroupSize(
    const DispatchInfo &dispatchInfo);

// Returns a three-component size derived from a DispatchInfo.
// NOTE(review): how this differs from computeWorkgroupSize is not visible here.
Vec3<size_t> generateWorkgroupSize(
    const DispatchInfo &dispatchInfo);

// Returns a three-component count from gws and lws (presumably global and local
// work sizes - TODO confirm). Both arguments are taken by value.
Vec3<size_t> computeWorkgroupsNumber(
    const Vec3<size_t> gws,
    const Vec3<size_t> lws);

// Same signature as computeWorkgroupsNumber.
// NOTE(review): the behavioural difference is not visible in this header.
Vec3<size_t> generateWorkgroupsNumber(
    const Vec3<size_t> gws,
    const Vec3<size_t> lws);

// Overload that takes its inputs from a DispatchInfo.
Vec3<size_t> generateWorkgroupsNumber(
    const DispatchInfo &dispatchInfo);

// Returns the dimensionality to use for a dispatch: the larger of the
// simplified dimensions reported by the dispatch size and by the dispatch
// offset, but never less than 1.
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
    const auto sizeDim = dispatchSize.getSimplifiedDim();
    const auto offsetDim = dispatchOffset.getSimplifiedDim();
    const auto widestDim = std::max(sizeDim, offsetDim);
    return std::max(1U, widestDim);
}

// Takes a three-component work-group description by value and returns one of
// the same type. Declaration only.
// NOTE(review): the exact normalisation performed is not visible in this header.
Vec3<size_t> canonizeWorkgroup(
    Vec3<size_t> workgroup);

// Declaration only. Receives the context, a maximum work-group size and a copy
// of the dispatch description; presumably reports local-work-size hints through
// the context (TODO confirm against the definition).
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);

// Returns the highest dimension (1, 2 or 3) whose entry in workItems is greater
// than one; falls back to 1 when neither the third nor the second entry is.
inline cl_uint computeDimensions(const size_t workItems[3]) {
    if (workItems[2] > 1) {
        return 3;
    }
    if (workItems[1] > 1) {
        return 2;
    }
    return 1;
}

// Collection of static helpers, parameterized on the hardware family, used
// while building walker-related command sequences.
// Every member is only declared here; the definitions are not part of this
// header, so the notes below describe signatures and hedge anything that is
// inferred from names.
template <typename GfxFamily>
class GpgpuWalkerHelper {
  public:
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;

    // Takes a command stream, a register, an operation and a mask; presumably
    // emits a read-modify-write of that register into the stream (TODO confirm).
    static void addAluReadModifyWriteRegister(
        LinearStream *pCommandStream,
        uint32_t aluRegister,
        uint32_t operation,
        uint32_t mask);

    // Workaround helper ("WA" in the name). disablePerfMode presumably selects
    // whether the workaround is being switched on or off (TODO confirm).
    static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream,
                                               const Kernel &kernel,
                                               bool disablePerfMode);

    // Returns a size for the given kernel; presumably the command-stream space
    // needed by applyWADisableLSQCROPERFforOCL (TODO confirm).
    static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);

    // Receives the walker command plus offsets, work-group ranges, local sizes,
    // SIMD width, dimension count and three flags.
    // NOTE(review): the meaning of the returned size_t is not visible here.
    static size_t setGpgpuWalkerThreadData(
        WALKER_TYPE<GfxFamily> *walkerCmd,
        const size_t globalOffsets[3],
        const size_t startWorkGroups[3],
        const size_t numWorkGroups[3],
        const size_t localWorkSizesIn[3],
        uint32_t simd,
        uint32_t workDim,
        bool localIdsGenerationByRuntime,
        bool kernelUsesLocalIds,
        bool inlineDataProgrammingRequired);

    // Profiling start/end pair: both take the timestamp storage and the command
    // stream to operate on.
    static void dispatchProfilingCommandsStart(
        HwTimeStamps &hwTimeStamps,
        OCLRT::LinearStream *commandStream);

    static void dispatchProfilingCommandsEnd(
        HwTimeStamps &hwTimeStamps,
        OCLRT::LinearStream *commandStream);

    // Performance-counter helpers. The four below share one signature; the
    // `start` flag presumably distinguishes the begin and end samples
    // (TODO confirm).
    static void dispatchPerfCountersNoopidRegisterCommands(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream,
        bool start);

    static void dispatchPerfCountersReadFreqRegisterCommands(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream,
        bool start);

    static void dispatchPerfCountersGeneralPurposeCounterCommands(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream,
        bool start);

    static void dispatchPerfCountersUserCounterCommands(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream,
        bool start);

    // Same inputs as the helpers above, without a start/end flag.
    static void dispatchPerfCountersOABufferStateCommands(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream);

    // Start/end entry points for performance-counter commands.
    // NOTE(review): whether they call the finer-grained helpers above is not
    // visible in this header.
    static void dispatchPerfCountersCommandsStart(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream);

    static void dispatchPerfCountersCommandsEnd(
        CommandQueue &commandQueue,
        OCLRT::HwPerfCounter &hwPerfCounter,
        OCLRT::LinearStream *commandStream);

    // Takes the command stream, the walker command, a timestamp packet and the
    // kind of write operation to set up.
    static void setupTimestampPacket(
        LinearStream *cmdStream,
        WALKER_TYPE<GfxFamily> *walkerCmd,
        TimestampPacket *timestampPacket,
        TimestampPacket::WriteOperationType writeOperationType);

    // Takes the host command queue, the hardware device queue, the preemption
    // mode, the scheduler kernel and the surface-state / dynamic-state heaps
    // (ssh / dsh) passed by the caller.
    static void dispatchScheduler(
        CommandQueue &commandQueue,
        DeviceQueueHw<GfxFamily> &devQueueHw,
        PreemptionMode preemptionMode,
        SchedulerKernel &scheduler,
        IndirectHeap *ssh,
        IndirectHeap *dsh);

    // Takes a command stream, the current device and an event wait list;
    // presumably emits semaphore waits for events on that device (TODO confirm).
    static void dispatchOnDeviceWaitlistSemaphores(LinearStream *commandStream, Device &currentDevice,
                                                   cl_uint numEventsInWaitList, const cl_event *eventWaitList);
};

// Command-stream size estimation helpers, parameterized on the hardware family.
// All members are declarations only; the definitions are not in this header.
template <typename GfxFamily>
struct EnqueueOperation {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    // Size estimate for a whole MultiDispatchInfo.
    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo);
    // Size estimate for one command of type cmdType; used by the
    // getCommandStream helpers in this header.
    // NOTE(review): presumably forwards to one of the private helpers below
    // depending on cmdType - confirm against the definition.
    static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel);
    // Size estimate for a timestamp packet write; takes no inputs.
    static size_t getSizeRequiredForTimestampPacketWrite();

  private:
    static size_t getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel);
    static size_t getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue);
};

// Single-kernel overload: asks EnqueueOperation for the size needed by one
// command of type `eventType` and returns the queue's command stream obtained
// with that size.
template <typename GfxFamily, uint32_t eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
    using Enqueue = EnqueueOperation<GfxFamily>;

    const size_t requiredSize = Enqueue::getSizeRequiredCS(eventType,
                                                           reserveProfilingCmdsSpace,
                                                           reservePerfCounterCmdsSpace,
                                                           commandQueue,
                                                           pKernel);
    return commandQueue.getCS(requiredSize);
}

// Multi-dispatch overload: sums the size needed by every dispatch in
// `multiDispatchInfo`, adds the scheduler kernel when a parent kernel is
// present, adds timestamp-packet related commands when the command stream
// receiver has timestamp packet writes enabled, and returns the queue's
// command stream obtained with that total.
template <typename GfxFamily, uint32_t eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, cl_uint numEventsInWaitList, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
    using Enqueue = EnqueueOperation<GfxFamily>;

    Kernel *const parent = multiDispatchInfo.peekParentKernel();

    // One estimate per dispatched kernel.
    size_t totalSize = 0;
    for (auto &info : multiDispatchInfo) {
        totalSize += Enqueue::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, info.getKernel());
    }

    // A parent kernel additionally requires space for the scheduler kernel.
    if (parent != nullptr) {
        SchedulerKernel &schedulerKernel = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parent->getContext());
        totalSize += Enqueue::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &schedulerKernel);
    }

    const bool timestampPacketWriteEnabled = commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled();
    if (timestampPacketWriteEnabled) {
        // One semaphore wait plus one atomic per dependency.
        const size_t perDependencySize = sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT) + sizeof(typename GfxFamily::MI_ATOMIC);

        totalSize += Enqueue::getSizeRequiredForTimestampPacketWrite();
        totalSize += numEventsInWaitList * perDependencySize;

        // Queues without out-of-order execution get one extra dependency slot.
        if (!commandQueue.isOOQEnabled()) {
            totalSize += perDependencySize;
        }
    }

    return commandQueue.getCS(totalSize);
}

// Returns the indirect heap of kind `heapType` to use for the dispatches in
// `multiDispatchInfo`.
//
// Without a parent kernel the heap comes from the command queue, sized by the
// matching KernelCommandsHelper::getTotalSizeRequired* estimate.
// With a parent kernel:
//   - SURFACE_STATE: still the command queue's heap, with the execution-model
//     requirement added to the expected size;
//   - any other heap type: the DYNAMIC_STATE heap of the context's default
//     device queue is returned instead.
// If no default device queue is available (or it yields no heap), the function
// falls back to the command queue's own heap rather than dereferencing a null
// pointer.
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
    size_t expectedSize = 0;
    IndirectHeap *ih = nullptr;

    // clang-format off
    switch (heapType) {
    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
    }
    // clang-format on

    if (Kernel *parentKernel = multiDispatchInfo.peekParentKernel()) {
        if (heapType == IndirectHeap::SURFACE_STATE) {
            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*parentKernel));
        } else {
            // heapType is DYNAMIC_STATE or INDIRECT_OBJECT here: both are served
            // from the default device queue's DYNAMIC_STATE heap.
            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
            DEBUG_BREAK_IF(pDevQueue == nullptr);
            // DEBUG_BREAK_IF is a debug-only check, so guard the dereference as
            // well; a null queue leaves `ih` unset and triggers the fallback below.
            if (pDevQueue != nullptr) {
                ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
            }
        }
    }

    if (ih == nullptr) {
        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
    }

    return *ih;
}

} // namespace OCLRT
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`/*`
Update copyright headers Updating files modified in 2018 only. Older files remain with old style copyright header Change-Id: Ic99f2e190ad74b4b7f2bd79dd7b9fa5fbe36ec92 Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com> 2018-09-18 15:11:08 +08:00			`* Copyright (C) 2018 Intel Corporation`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`*`
Update copyright headers Updating files modified in 2018 only. Older files remain with old style copyright header Change-Id: Ic99f2e190ad74b4b7f2bd79dd7b9fa5fbe36ec92 Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com> 2018-09-18 15:11:08 +08:00			`* SPDX-License-Identifier: MIT`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`*`
			`*/`

			`#pragma once`

			`#include "runtime/built_ins/built_ins.h"`
			`#include "runtime/context/context.h"`
			`#include "runtime/command_queue/command_queue.h"`
			`#include "runtime/command_stream/linear_stream.h"`
			`#include "runtime/command_stream/preemption.h"`
			`#include "runtime/device_queue/device_queue_hw.h"`
			`#include "runtime/event/hw_timestamps.h"`
			`#include "runtime/event/perf_counter.h"`
			`#include "runtime/helpers/dispatch_info.h"`
			`#include "runtime/helpers/kernel_commands.h"`
			`#include "runtime/helpers/task_information.h"`
Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00			`#include "runtime/helpers/timestamp_packet.h"`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`#include "runtime/indirect_heap/indirect_heap.h"`
			`#include "runtime/kernel/kernel.h"`
			`#include "runtime/program/kernel_info.h"`
			`#include "runtime/utilities/vec.h"`

			`namespace OCLRT {`

Refactored GPGPU walker for easier maintenance Change-Id: Ie10272f363d3e583893c46123bb7a56a5fd9ac45 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2018-08-24 21:53:33 +08:00			`template <typename GfxFamily>`
			`using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;`

Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;`

			`constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;`
			`constexpr int32_t L3SQC_REG4 = 0xB118;`

			`constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF;`
			`constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;`

			`constexpr int32_t CS_GPR_R0 = 0x2600;`
			`constexpr int32_t CS_GPR_R1 = 0x2608;`

			`constexpr int32_t ALU_OPCODE_LOAD = 0x080;`
			`constexpr int32_t ALU_OPCODE_STORE = 0x180;`
			`constexpr int32_t ALU_OPCODE_OR = 0x103;`
			`constexpr int32_t ALU_OPCODE_AND = 0x102;`

			`constexpr int32_t ALU_REGISTER_R_0 = 0x0;`
			`constexpr int32_t ALU_REGISTER_R_1 = 0x1;`
			`constexpr int32_t ALU_REGISTER_R_SRCA = 0x20;`
			`constexpr int32_t ALU_REGISTER_R_SRCB = 0x21;`
			`constexpr int32_t ALU_REGISTER_R_ACCU = 0x31;`

			`constexpr uint32_t GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;`

			`void computeWorkgroupSize1D(`
			`uint32_t maxWorkGroupSize,`
			`size_t workGroupSize[3],`
			`const size_t workItems[3],`
			`size_t simdSize);`

			`void computeWorkgroupSizeND(`
			`WorkSizeInfo wsInfo,`
			`size_t workGroupSize[3],`
			`const size_t workItems[3],`
			`const uint32_t workDim);`

			`void computeWorkgroupSize2D(`
			`uint32_t maxWorkGroupSize,`
			`size_t workGroupSize[3],`
			`const size_t workItems[3],`
			`size_t simdSize);`

			`void computeWorkgroupSizeSquared(`
			`uint32_t maxWorkGroupSize,`
			`size_t workGroupSize[3],`
			`const size_t workItems[3],`
			`size_t simdSize,`
			`const uint32_t workDim);`

			`Vec3<size_t> computeWorkgroupSize(`
			`const DispatchInfo &dispatchInfo);`

			`Vec3<size_t> generateWorkgroupSize(`
			`const DispatchInfo &dispatchInfo);`

			`Vec3<size_t> computeWorkgroupsNumber(`
			`const Vec3<size_t> gws,`
			`const Vec3<size_t> lws);`

			`Vec3<size_t> generateWorkgroupsNumber(`
			`const Vec3<size_t> gws,`
			`const Vec3<size_t> lws);`

			`Vec3<size_t> generateWorkgroupsNumber(`
			`const DispatchInfo &dispatchInfo);`

			`inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {`
			`return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));`
			`}`

			`Vec3<size_t> canonizeWorkgroup(`
			`Vec3<size_t> workgroup);`

			`void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);`

			`inline cl_uint computeDimensions(const size_t workItems[3]) {`
			`return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;`
			`}`

			`template <typename GfxFamily>`
			`class GpgpuWalkerHelper {`
			`public:`
Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00			`using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;`
Refactored GPGPU walker for easier maintenance Change-Id: Ie10272f363d3e583893c46123bb7a56a5fd9ac45 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2018-08-24 21:53:33 +08:00			`using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;`
Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`static void addAluReadModifyWriteRegister(`
			`LinearStream *pCommandStream,`
			`uint32_t aluRegister,`
			`uint32_t operation,`
			`uint32_t mask);`

			`static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream,`
			`const Kernel &kernel,`
			`bool disablePerfMode);`

			`static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);`

			`static size_t setGpgpuWalkerThreadData(`
Organize dispatching of thread data for better reuse of code Change-Id: I8c156f8b5a50f6fa4dfb5218cdadb2840ff556eb 2018-09-28 22:16:18 +08:00			`WALKER_TYPE<GfxFamily> *walkerCmd,`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`const size_t globalOffsets[3],`
			`const size_t startWorkGroups[3],`
			`const size_t numWorkGroups[3],`
			`const size_t localWorkSizesIn[3],`
Add new arguments to GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData Change-Id: I19e42a75f5224f6e3588c2c7be4a3451714bb5ef 2018-09-26 20:56:01 +08:00			`uint32_t simd,`
			`uint32_t workDim,`
Add new parameter to thread data dispatching Change-Id: I86710b0cc764156f4c2db9d24ccd1c96b32d7660 2018-10-03 21:13:54 +08:00			`bool localIdsGenerationByRuntime,`
			`bool kernelUsesLocalIds,`
			`bool inlineDataProgrammingRequired);`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00
			`static void dispatchProfilingCommandsStart(`
			`HwTimeStamps &hwTimeStamps,`
			`OCLRT::LinearStream *commandStream);`

			`static void dispatchProfilingCommandsEnd(`
			`HwTimeStamps &hwTimeStamps,`
			`OCLRT::LinearStream *commandStream);`

			`static void dispatchPerfCountersNoopidRegisterCommands(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream,`
			`bool start);`

			`static void dispatchPerfCountersReadFreqRegisterCommands(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream,`
			`bool start);`

			`static void dispatchPerfCountersGeneralPurposeCounterCommands(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream,`
			`bool start);`

			`static void dispatchPerfCountersUserCounterCommands(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream,`
			`bool start);`

			`static void dispatchPerfCountersOABufferStateCommands(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream);`

			`static void dispatchPerfCountersCommandsStart(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream);`

			`static void dispatchPerfCountersCommandsEnd(`
			`CommandQueue &commandQueue,`
			`OCLRT::HwPerfCounter &hwPerfCounter,`
			`OCLRT::LinearStream *commandStream);`

Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00			`static void setupTimestampPacket(`
			`LinearStream *cmdStream,`
Organize dispatching of thread data for better reuse of code Change-Id: I8c156f8b5a50f6fa4dfb5218cdadb2840ff556eb 2018-09-28 22:16:18 +08:00			`WALKER_TYPE<GfxFamily> *walkerCmd,`
Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00			`TimestampPacket *timestampPacket,`
			`TimestampPacket::WriteOperationType writeOperationType);`

Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`static void dispatchScheduler(`
			`CommandQueue &commandQueue,`
			`DeviceQueueHw<GfxFamily> &devQueueHw,`
			`PreemptionMode preemptionMode,`
[26/n] Internal 4GB allocator. - change the way we handle blocked commands. - instead of allocating CPU pointer and populating it with commands, create real IndirectHeap that may be later submitted to the GPU - that removes a lot of copy operations that were happening on submit time - for device enqueue, this requires dsh & shh to be passed directly to the underlying commands, in that scenario device queue buffers are not used Change-Id: I1124a8edbb46777ea7f7d3a5946f302e7fdf9665 2018-04-05 21:12:28 +08:00			`SchedulerKernel &scheduler,`
			`IndirectHeap *ssh,`
			`IndirectHeap *dsh);`
Use Semaphore to wait for dependencies on the same device Change-Id: Ib04c960c50183c080d02753815ece80b58d1980e Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-09-07 15:09:24 +08:00
			`static void dispatchOnDeviceWaitlistSemaphores(LinearStream *commandStream, Device &currentDevice,`
			`cl_uint numEventsInWaitList, const cl_event *eventWaitList);`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`};`

Separate struct EnqueueOperation declaration and implementation Change-Id: I537660867a1c98f957280237c14b7a1554fce3db 2018-04-09 22:39:32 +08:00			`template <typename GfxFamily>`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`struct EnqueueOperation {`
Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00			`using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;`
Separate struct EnqueueOperation declaration and implementation Change-Id: I537660867a1c98f957280237c14b7a1554fce3db 2018-04-09 22:39:32 +08:00			`static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo);`
			`static size_t getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel);`
Improve TimestmapPacket write commands size estimation Change-Id: Idf80401f4360342a162d54aba7ffbe6dfcf714e8 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-09-06 16:01:51 +08:00			`static size_t getSizeRequiredForTimestampPacketWrite();`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00
Separate struct EnqueueOperation declaration and implementation Change-Id: I537660867a1c98f957280237c14b7a1554fce3db 2018-04-09 22:39:32 +08:00			`private:`
			`static size_t getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel);`
			`static size_t getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue);`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`};`

			`template <typename GfxFamily, uint32_t eventType>`
			`LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {`
Separate struct EnqueueOperation declaration and implementation Change-Id: I537660867a1c98f957280237c14b7a1554fce3db 2018-04-09 22:39:32 +08:00			`auto expectedSizeCS = EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`return commandQueue.getCS(expectedSizeCS);`
			`}`

			`template <typename GfxFamily, uint32_t eventType>`
Use Semaphore to wait for dependencies on the same device Change-Id: Ib04c960c50183c080d02753815ece80b58d1980e Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-09-07 15:09:24 +08:00			`LinearStream &getCommandStream(CommandQueue &commandQueue, cl_uint numEventsInWaitList, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`size_t expectedSizeCS = 0;`
Refactor querying Main and Parent Kernel from MultiDispatchInfo Change-Id: I723d91f2f445bc7af1bcb0de46f8ac07837f3449 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-16 21:47:25 +08:00			`Kernel *parentKernel = multiDispatchInfo.peekParentKernel();`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`for (auto &dispatchInfo : multiDispatchInfo) {`
Separate struct EnqueueOperation declaration and implementation Change-Id: I537660867a1c98f957280237c14b7a1554fce3db 2018-04-09 22:39:32 +08:00			`expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`}`
Refactor querying Main and Parent Kernel from MultiDispatchInfo Change-Id: I723d91f2f445bc7af1bcb0de46f8ac07837f3449 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-16 21:47:25 +08:00			`if (parentKernel) {`
Delete Device::getBuiltIns() Change-Id: I9d1968dfb2ba4a56020fd17152119add726106e1 Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com> 2018-08-22 19:57:21 +08:00			`SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext());`
Separate struct EnqueueOperation declaration and implementation Change-Id: I537660867a1c98f957280237c14b7a1554fce3db 2018-04-09 22:39:32 +08:00			`expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`}`
Store command stream receiver in device. Change-Id: I133ce4435e189e5bb8473ad4736fe0b02047dffe 2018-09-11 16:43:50 +08:00			`if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {`
Handle TimestamPacket with implicit dependencies ownership Change-Id: I22a4de4e9eb904c359583e235e0de54a7c743e07 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-09-26 06:44:43 +08:00			`auto semaphoreSize = sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);`
			`auto atomicSize = sizeof(typename GfxFamily::MI_ATOMIC);`

Improve TimestmapPacket write commands size estimation Change-Id: Idf80401f4360342a162d54aba7ffbe6dfcf714e8 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-09-06 16:01:51 +08:00			`expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();`
Handle TimestamPacket with implicit dependencies ownership Change-Id: I22a4de4e9eb904c359583e235e0de54a7c743e07 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-09-26 06:44:43 +08:00			`expectedSizeCS += numEventsInWaitList * (semaphoreSize + atomicSize);`
			`if (!commandQueue.isOOQEnabled()) {`
			`expectedSizeCS += semaphoreSize + atomicSize;`
			`}`
Initial implementation of Timestamp Packet write Change-Id: Ic498bcf9795f54fbb5fb5a8d07ed17fa70dc4f1a Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-24 14:48:59 +08:00			`}`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`return commandQueue.getCS(expectedSizeCS);`
			`}`

			`template <typename GfxFamily, IndirectHeap::Type heapType>`
			`IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {`
			`size_t expectedSize = 0;`
			`IndirectHeap *ih = nullptr;`

			`// clang-format off`
			`switch (heapType) {`
			`case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;`
			`case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;`
			`case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;`
			`}`
			`// clang-format on`

Refactor querying Main and Parent Kernel from MultiDispatchInfo Change-Id: I723d91f2f445bc7af1bcb0de46f8ac07837f3449 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-16 21:47:25 +08:00			`if (Kernel *parentKernel = multiDispatchInfo.peekParentKernel()) {`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`if (heapType == IndirectHeap::SURFACE_STATE) {`
Refactor querying Main and Parent Kernel from MultiDispatchInfo Change-Id: I723d91f2f445bc7af1bcb0de46f8ac07837f3449 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com> 2018-08-16 21:47:25 +08:00			`expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*parentKernel));`
Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b 2018-03-30 23:57:51 +08:00			`} else //if (heapType == IndirectHeap::DYNAMIC_STATE \|\| heapType == IndirectHeap::INDIRECT_OBJECT)`
			`{`
			`DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());`
			`DEBUG_BREAK_IF(pDevQueue == nullptr);`
			`ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);`
			`}`
			`}`

			`if (ih == nullptr)`
			`ih = &commandQueue.getIndirectHeap(heapType, expectedSize);`

			`return *ih;`
			`}`

			`} // namespace OCLRT`