compute-runtime/opencl/source/execution_model/device_enqueue.h

/*
 * Copyright (C) 2017-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

// Uncomment this macro to build "empty" schedulers
//#define WA_DISABLE_SCHEDULERS 1

// Host-side compatibility aliases. When this header is compiled by an OpenCL C
// compiler (__OPENCL_VERSION__ is defined) `uint` and `ulong` are built-in types;
// for any other compiler they are supplied here as fixed-width 32/64-bit aliases
// so the declarations below mean the same thing on both sides.
#if !defined(__OPENCL_VERSION__)
#include <cstdint>

typedef uint32_t uint;  // 32-bit unsigned
typedef uint64_t ulong; // 64-bit unsigned
#endif

// Command lengths counted in DWORDs (one DWORD == sizeof(uint)).
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET (11)
#define OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET (3)

#define OCLRT_SIZEOF_MSFLUSH_DWORD (2)
#define OCLRT_SIZEOF_MI_ARB_CHECK_DWORD (1)
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD (9)

// The same command lengths in bytes, derived from the DWORD counts above so the
// two views of each command cannot drift apart.
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD (OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD (OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_STATE_FLUSH (OCLRT_SIZEOF_MSFLUSH_DWORD * sizeof(uint))
#define OCLRT_SIZEOF_MI_ATOMIC_CMD (OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_CMD (OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD * sizeof(uint))
#define OCLRT_SIZEOF_MI_ARB_CHECK (OCLRT_SIZEOF_MI_ARB_CHECK_DWORD * sizeof(uint))

// Raw DWORD value of the batch-buffer-end command (0x05000000 == 83886080).
#define OCLRT_BATCH_BUFFER_END_CMD (0x05000000)

// Constant buffer stuff.
// NOTE(review): these are bare numeric identifiers; their consumers are not visible
// in this header, so keep the values unchanged unless every user is updated too.
#define COMPILER_DATA_PARAMETER_GLOBAL_SURFACE (49)

#define SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT (50)

#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT (63)
#define SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT (51)
// Twice the images shift, i.e. 2 * 50 = 100.
#define SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE (2 * SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT)

// 8 * 64 = 512; presumably bytes (8 units of 64 B) - TODO confirm the unit.
#define CS_PREFETCH_SIZE (8 * 64)

// A DWORD is 32 bits wide; the mask has every one of those bits set.
#define ALL_BITS_SET_DWORD_MASK (0xffffffff)
#define DWORD_SIZE_IN_BITS (32)

// sRGB channel-order codes. The values mirror the enumerants of the OpenCL API
// headers; keep the replacement text token-for-token unchanged so that a second
// definition coming from those headers remains a benign redefinition.
#define CL_sRGB 0x10BF
#define CL_sRGBX 0x10C0
#define CL_sRGBA 0x10C1
#define CL_sBGRA 0x10C2

// The scheduler can currently spawn up to 8 GPGPU_WALKERs between scheduler runs, so it
// needs 8 * 3 HW threads for scheduling blocks + 1 HW thread to schedule the next scheduler.
// Each HW group consists of 3 HW threads that are capable of scheduling 1 block.

//!!! Make sure value of this define equals MAX_NUMBER_OF_PARALLEL_GPGPU_WALKERS in DeviceEnqueueInternalTypes.h
#define PARALLEL_SCHEDULER_HW_GROUPS (8)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP (3)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 (3)
// Threads used by all HW groups together: 3 * 8 = 24.
#define PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS (PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP * PARALLEL_SCHEDULER_HW_GROUPS)

// All group threads plus the one extra thread mentioned above: 24 + 1 = 25.
#define PARALLEL_SCHEDULER_NUMBER_HW_THREADS (PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS + 1)

//parallel scheduler 2.0 is compiled in simd8
#define PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (8)

// Evaluates to 4 (= log2 16) when bit 0x10 of COMPILATION_SIZE is set and to 3 (= log2 8)
// otherwise. The argument is parenthesized so that expressions built from operators
// that bind looser than '&' (e.g. `a | b`, `c ? x : y`) are tested as a whole.
#define HW_GROUP_ID_SHIFT(COMPILATION_SIZE) (((COMPILATION_SIZE) & 0x10) ? 4 : 3)

#define GRF_SIZE (32)
#define SIZEOF_3GRFS (3 * GRF_SIZE)

//estimation for dynamic payload size: 25 HW threads * 96 (three GRFs) = 2400
#define SCHEDULER_DYNAMIC_PAYLOAD_SIZE (PARALLEL_SCHEDULER_NUMBER_HW_THREADS * SIZEOF_3GRFS)

// Assume that the max DSH per walker is 9472 B: registers can take up to 4 KB and the max
// dynamic payload is around 96 B * 56 (HW threads), i.e. 4096 + 5376 = 9472, so it should be fine.
#define MAX_DSH_SIZE_PER_ENQUEUE 9472

#define MAX_BINDING_TABLE_INDEX (253)
// Max SSH that one kernel can use: 253 binding table entries multiplied by the
// Surface State size (64), i.e. 16192.
#define MAX_SSH_PER_KERNEL_SIZE (MAX_BINDING_TABLE_INDEX * 64)

// Object IDs are split at MAX_SSH_PER_KERNEL_SIZE: image IDs end at
// OCLRT_IMAGE_MAX_OBJECT_ID (one below the boundary) and sampler IDs start at
// OCLRT_SAMPLER_MIN_OBJECT_ID (the boundary itself). A sampler's ID is its argument
// offset moved past the boundary. ArgOffset is parenthesized so that compound
// arguments (conditional, shift or bitwise expressions) are added as a whole.
#define OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID(ArgOffset) ((ArgOffset) + MAX_SSH_PER_KERNEL_SIZE)
#define OCLRT_IMAGE_MAX_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE - 1)
#define OCLRT_SAMPLER_MIN_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE)

// Tags for the kinds of debug data. Every existing tag carries an explicit number so
// that inserting or reordering entries cannot silently renumber the others.
typedef enum tagDebugDataTypes {
    DBG_DEFAULT = 0,
    DBG_COMMAND_QUEUE = 1,
    DBG_EVENTS_UPDATE = 2,
    DBG_EVENTS_NUMBER = 3,
    DBG_STACK_UPDATE = 4,
    DBG_BEFORE_PATCH = 5,
    DBG_KERNELID = 6,
    DBG_DSHOFFSET = 7,
    DBG_IDOFFSET = 8,
    DBG_AFTER_PATCH = 9,
    DBG_UNSPECIFIED = 10,
    DBG_ENQUEUES_NUMBER = 11,
    DBG_LOCAL_ID = 12,
    DBG_WKG_ID = 13,
    DBG_SCHEDULER_END = 14,
    // Add new debug enums here, directly above DBG_MAX.
    DBG_MAX // left implicit: always one past the last tag (currently 15)
} DebugDataTypes;
// Struct for debugging kernels: one descriptor pairing a DebugDataTypes tag with a size.
typedef struct
{
    DebugDataTypes m_dataType; // which kind of debug data this descriptor refers to
    uint m_dataSize;           // size of that data (unit is not visible in this header - TODO confirm)
} DebugDataInfo;
// Debug buffer: bookkeeping fields followed by a fixed array of 100 uints.
// Field order and types are part of the layout - do not rearrange.
typedef struct
{
    // Single-bit flag values (1, 2, 4).
    enum DDBFlags {
        DDB_HAS_DATA_INFO = 1 << 0,
        DDB_SCHEDULER_PROFILING = 1 << 1,
        DDB_COMMAND_QUEUE_RAW = 1 << 2
    } ddbFlags;
    uint m_size;
    uint m_stackTop;       // index of the data stack
    uint m_dataInfoTop;    // index of the top of the DataInfo stack; this stack grows towards lower addresses
    uint m_stackBottom;
    uint m_dataInfoBottom; // index of the bottom of the DataInfo stack
    uint m_dataInfoSize;
    uint m_flags;          // NOTE(review): presumably a combination of DDBFlags bits - confirm against users

    uint m_offset;    // current offset, pointing at the next free place
    uint m_data[100]; // the buffer storage itself
} DebugDataBuffer;

// Everything declared by DeviceEnqueueInternalTypes.h is compiled under 4-byte packing.
// The push/pop pair saves the packing in effect before this point and restores it
// afterwards, so code that includes this header is not affected.
#pragma pack(push)
#pragma pack(4)
#include "DeviceEnqueueInternalTypes.h"
#pragma pack(pop)
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`/*`
Changing include paths Change-Id: I3b878463289083c956382e68da3473788cf5c15f 2020-02-22 16:28:27 +08:00			`* Copyright (C) 2017-2020 Intel Corporation`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`*`
clang-format: enable sorting includes Include files are now grouped and sorted in following order: 1. Header file of the class the current file implements 2. Project files 3. Third party files 4. Standard library Change-Id: If31af05652184169f7fee1d7ad08f1b2ed602cf0 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2019-02-27 18:39:32 +08:00			`* SPDX-License-Identifier: MIT`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`*`
clang-format: enable sorting includes Include files are now grouped and sorted in following order: 1. Header file of the class the current file implements 2. Project files 3. Third party files 4. Standard library Change-Id: If31af05652184169f7fee1d7ad08f1b2ed602cf0 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2019-02-27 18:39:32 +08:00			`*/`
Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00
			`#pragma once`

			`// Uncomment this macro to build "empty" schedulers`
			`//#define WA_DISABLE_SCHEDULERS 1`

			`#if !defined(__OPENCL_VERSION__)`
clang-format: enable sorting includes Include files are now grouped and sorted in following order: 1. Header file of the class the current file implements 2. Project files 3. Third party files 4. Standard library Change-Id: If31af05652184169f7fee1d7ad08f1b2ed602cf0 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com> 2019-02-27 18:39:32 +08:00			`#include <cstdint>`

Initial commit Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd 2017-12-21 07:45:38 +08:00			`typedef uint32_t uint;`
			`typedef uint64_t ulong;`
			`#endif`

			`#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD (4 * sizeof(uint))`
			`#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD (4 * sizeof(uint))`
			`#define OCLRT_SIZEOF_MEDIA_STATE_FLUSH (2 * sizeof(uint))`
			`#define OCLRT_SIZEOF_MI_ATOMIC_CMD (11 * sizeof(uint))`
			`#define OCLRT_SIZEOF_MEDIA_VFE_STATE_CMD (9 * sizeof(uint))`
			`#define OCLRT_SIZEOF_MI_ARB_CHECK (1 * sizeof(uint))`

			`#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET (4)`
			`#define OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET (11)`
			`#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET (4)`
			`#define OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET (3)`

			`#define OCLRT_SIZEOF_MSFLUSH_DWORD (2)`
			`#define OCLRT_SIZEOF_MI_ARB_CHECK_DWORD (1)`
			`#define OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD (9)`

			`#define OCLRT_BATCH_BUFFER_END_CMD (83886080)`

			`//Constant buffer stuff`
			`#define COMPILER_DATA_PARAMETER_GLOBAL_SURFACE (49)`

			`#define SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT (50)`

			`#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT (63)`
			`#define SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT (51)`
			`#define SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE (2 * SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT)`

			`#define CS_PREFETCH_SIZE (8 * 64)`

			`#define ALL_BITS_SET_DWORD_MASK (0xffffffff)`
			`#define DWORD_SIZE_IN_BITS (32)`

			`#define CL_sRGB 0x10BF`
			`#define CL_sRGBX 0x10C0`
			`#define CL_sRGBA 0x10C1`
			`#define CL_sBGRA 0x10C2`

			`//scheduler currently can spawn up to 8 GPGPU_WALKERS between scheduler runs, so it needs 8 * 3 HW threads for scheduling blocks + 1 HW thread to scheduler next scheduler`
			`//each HW group consist of 3 HW threads that are capable of scheduling 1 block`

			`//!!! Make sure value of this define equals MAX_NUMBER_OF_PARALLEL_GPGPU_WALKERS in DeviceEnqueueInternalTypes.h`
			`#define PARALLEL_SCHEDULER_HW_GROUPS (8)`
			`#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP (3)`
			`#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 (3)`
			`#define PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS (PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP * PARALLEL_SCHEDULER_HW_GROUPS)`

			`#define PARALLEL_SCHEDULER_NUMBER_HW_THREADS (PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS + 1)`

			`//parallel scheduler 2.0 is compiled in simd8`
			`#define PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (8)`

			`#define HW_GROUP_ID_SHIFT(COMPILATION_SIZE) ((COMPILATION_SIZE & 0x10) ? 4 : 3)`

			`#define GRF_SIZE (32)`
			`#define SIZEOF_3GRFS (3 * GRF_SIZE)`

			`//estimation for dynamic payload size`
			`#define SCHEDULER_DYNAMIC_PAYLOAD_SIZE (PARALLEL_SCHEDULER_NUMBER_HW_THREADS * SIZEOF_3GRFS)`

			`//assume that max DSH per walker is 9472B ( assuming registers can take up to 4KB, and max dynamic payload is around 96B * 56(HW threads) it should be fine.`
			`#define MAX_DSH_SIZE_PER_ENQUEUE 9472`

			`#define MAX_BINDING_TABLE_INDEX (253)`
			`#define MAX_SSH_PER_KERNEL_SIZE (MAX_BINDING_TABLE_INDEX * 64) //max SSH that can be one kernel. It is 253 binding table entries multiplied by the Surface State size.`

			`#define OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID(ArgOffset) (ArgOffset + MAX_SSH_PER_KERNEL_SIZE)`
			`#define OCLRT_IMAGE_MAX_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE - 1)`
			`#define OCLRT_SAMPLER_MIN_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE)`

			`typedef enum tagDebugDataTypes {`
			`DBG_DEFAULT = 0,`
			`DBG_COMMAND_QUEUE = 1,`
			`DBG_EVENTS_UPDATE = 2,`
			`DBG_EVENTS_NUMBER = 3,`
			`DBG_STACK_UPDATE = 4,`
			`DBG_BEFORE_PATCH = 5,`
			`DBG_KERNELID = 6,`
			`DBG_DSHOFFSET = 7,`
			`DBG_IDOFFSET = 8,`
			`DBG_AFTER_PATCH = 9,`
			`DBG_UNSPECIFIED = 10,`
			`DBG_ENQUEUES_NUMBER = 11,`
			`DBG_LOCAL_ID,`
			`DBG_WKG_ID,`
			`DBG_SCHEDULER_END,`
			`// Add here new debug enums`
			`DBG_MAX`
			`} DebugDataTypes;`
			`// Struct for debugging kernels`
			`typedef struct`
			`{`
			`DebugDataTypes m_dataType;`
			`uint m_dataSize;`
			`} DebugDataInfo;`
			`typedef struct`
			`{`
			`enum DDBFlags { DDB_HAS_DATA_INFO = 1,`
			`DDB_SCHEDULER_PROFILING = 2,`
			`DDB_COMMAND_QUEUE_RAW = 4 } ddbFlags;`
			`uint m_size;`
			`uint m_stackTop; //index of data stack`
			`uint m_dataInfoTop; //index of the top of DataInfo stack, this stacks grows with decrementing address`
			`uint m_stackBottom;`
			`uint m_dataInfoBottom; //index of the bottom of DataInfo`
			`uint m_dataInfoSize;`
			`uint m_flags;`

			`uint m_offset; //current offset indicates free place`
			`uint m_data[100]; //buffer`
			`} DebugDataBuffer;`

			`#pragma pack(push)`
			`#pragma pack(4)`
			`#include "DeviceEnqueueInternalTypes.h"`
			`#pragma pack(pop)`