/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * File Name: Device_Enqueue.h
 */

#pragma once

// Uncomment this macro to build "empty" schedulers
//#define WA_DISABLE_SCHEDULERS 1

// When this header is compiled outside OpenCL C (__OPENCL_VERSION__ is not
// defined), provide the `uint` and `ulong` type names used below as aliases of
// the fixed-width integer types. uint32_t / uint64_t are not declared here, so
// the including file must already have them in scope (e.g. via <stdint.h>).
#if !defined(__OPENCL_VERSION__)
typedef uint32_t uint;
typedef uint64_t ulong;
#endif

// Command sizes in bytes: DWORD count multiplied by sizeof(uint).
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD (4 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD (4 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_STATE_FLUSH (2 * sizeof(uint))
#define OCLRT_SIZEOF_MI_ATOMIC_CMD (11 * sizeof(uint))
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_CMD (9 * sizeof(uint))
#define OCLRT_SIZEOF_MI_ARB_CHECK (1 * sizeof(uint))

// Command sizes in DWORDs. Where a byte-sized counterpart exists above, the
// DWORD count here is the same multiplier used there.
#define OCLRT_SIZEOF_MEDIA_INTERFACE_DESCRIPTOR_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_SIZEOF_MI_ATOMIC_CMD_DWORD_OFFSET (11)
#define OCLRT_SIZEOF_MEDIA_CURBE_LOAD_DEVICE_CMD_DWORD_OFFSET (4)
#define OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET (3)

#define OCLRT_SIZEOF_MSFLUSH_DWORD (2)
#define OCLRT_SIZEOF_MI_ARB_CHECK_DWORD (1)
#define OCLRT_SIZEOF_MEDIA_VFE_STATE_DWORD (9)

// Raw DWORD value of the batch-buffer-end command (83886080 == 0x05000000).
#define OCLRT_BATCH_BUFFER_END_CMD (83886080)

// Constant buffer related values
#define COMPILER_DATA_PARAMETER_GLOBAL_SURFACE (49)

#define SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT (50)

#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT (63)
#define SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT (51)
// Twice the images CURBE shift, i.e. 100.
#define SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE (2 * SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT)

// 8 * 64 = 512
#define CS_PREFETCH_SIZE (8 * 64)

#define ALL_BITS_SET_DWORD_MASK (0xffffffff)
#define DWORD_SIZE_IN_BITS (32)

// sRGB channel-order codes. The replacement lists are deliberately left as
// bare literals: if another header defines the same names with the same
// literal, the redefinition stays token-identical and therefore legal.
// NOTE(review): presumably mirrors the OpenCL 2.0 channel-order values - confirm.
#define CL_sRGB 0x10BF
#define CL_sRGBX 0x10C0
#define CL_sRGBA 0x10C1
#define CL_sBGRA 0x10C2

// The scheduler can currently spawn up to 8 GPGPU_WALKERs between scheduler runs,
// so it needs 8 * 3 HW threads for scheduling blocks + 1 HW thread to schedule
// the next scheduler.
// Each HW group consists of 3 HW threads that are capable of scheduling 1 block.

//!!! Make sure the value of this define equals MAX_NUMBER_OF_PARALLEL_GPGPU_WALKERS in DeviceEnqueueInternalTypes.h
#define PARALLEL_SCHEDULER_HW_GROUPS (8)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP (3)
#define PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 (3)
// HW threads across all HW groups: 3 * 8 = 24.
#define PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS (PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP * PARALLEL_SCHEDULER_HW_GROUPS)

// All group threads plus the one extra thread mentioned above: 24 + 1 = 25.
#define PARALLEL_SCHEDULER_NUMBER_HW_THREADS (PARALLEL_SCHEDULER_HW_GROUPS_IN_THREADS + 1)

// Parallel scheduler 2.0 is compiled in SIMD8.
#define PARALLEL_SCHEDULER_COMPILATION_SIZE_20 (8)

// Evaluates to 4 when bit 0x10 of COMPILATION_SIZE is set and to 3 otherwise
// (so 16 -> 4 and 8 -> 3). The argument is parenthesized so that expressions
// built from operators with lower precedence than `&` (e.g. `a | b`, `c ? x : y`)
// are tested as a whole instead of having only their last operand masked.
#define HW_GROUP_ID_SHIFT(COMPILATION_SIZE) (((COMPILATION_SIZE) & 0x10) ? 4 : 3)

#define GRF_SIZE (32)
// Three GRFs: 3 * 32 = 96.
#define SIZEOF_3GRFS (3 * GRF_SIZE)

// Estimation for dynamic payload size: three GRFs for every parallel scheduler HW thread.
#define SCHEDULER_DYNAMIC_PAYLOAD_SIZE (PARALLEL_SCHEDULER_NUMBER_HW_THREADS * SIZEOF_3GRFS)

// Assume that the max DSH per walker is 9472 B: registers can take up to 4 KB and the
// max dynamic payload is around 96 B * 56 (HW threads), i.e. 4096 + 5376 = 9472,
// which should be fine.
#define MAX_DSH_SIZE_PER_ENQUEUE 9472

#define MAX_BINDING_TABLE_INDEX (253)
// Max SSH that one kernel can use: 253 binding table entries multiplied by the
// Surface State size (64), i.e. 16192.
#define MAX_SSH_PER_KERNEL_SIZE (MAX_BINDING_TABLE_INDEX * 64)

// Object IDs below MAX_SSH_PER_KERNEL_SIZE belong to images; sampler IDs start
// at MAX_SSH_PER_KERNEL_SIZE and are formed by adding it to the argument offset.
// ArgOffset is parenthesized so that an expression argument (e.g. `a & b`,
// `c ? x : y`) is fully evaluated before the addition.
#define OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID(ArgOffset) ((ArgOffset) + MAX_SSH_PER_KERNEL_SIZE)
#define OCLRT_IMAGE_MAX_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE - 1)
#define OCLRT_SAMPLER_MIN_OBJECT_ID (MAX_SSH_PER_KERNEL_SIZE)

// Type tags for debug data; used as the m_dataType field of DebugDataInfo.
typedef enum tagDebugDataTypes {
    DBG_DEFAULT = 0,
    DBG_COMMAND_QUEUE = 1,
    DBG_EVENTS_UPDATE = 2,
    DBG_EVENTS_NUMBER = 3,
    DBG_STACK_UPDATE = 4,
    DBG_BEFORE_PATCH = 5,
    DBG_KERNELID = 6,
    DBG_DSHOFFSET = 7,
    DBG_IDOFFSET = 8,
    DBG_AFTER_PATCH = 9,
    DBG_UNSPECIFIED = 10,
    DBG_ENQUEUES_NUMBER = 11,
    DBG_LOCAL_ID,      // 12 (implicit)
    DBG_WKG_ID,        // 13 (implicit)
    DBG_SCHEDULER_END, // 14 (implicit)
    // Add new debug enums here, directly above DBG_MAX
    DBG_MAX // always one past the last tag (currently 15)
} DebugDataTypes;
// Struct for debugging kernels
|
|
typedef struct
|
|
{
|
|
DebugDataTypes m_dataType;
|
|
uint m_dataSize;
|
|
} DebugDataInfo;
|
|
typedef struct
|
|
{
|
|
enum DDBFlags { DDB_HAS_DATA_INFO = 1,
|
|
DDB_SCHEDULER_PROFILING = 2,
|
|
DDB_COMMAND_QUEUE_RAW = 4 } ddbFlags;
|
|
uint m_size;
|
|
uint m_stackTop; //index of data stack
|
|
uint m_dataInfoTop; //index of the top of DataInfo stack, this stacks grows with decrementing address
|
|
uint m_stackBottom;
|
|
uint m_dataInfoBottom; //index of the bottom of DataInfo
|
|
uint m_dataInfoSize;
|
|
uint m_flags;
|
|
|
|
uint m_offset; //current offset indicates free place
|
|
uint m_data[100]; //buffer
|
|
} DebugDataBuffer;
|
|
|
|
// Include the internal types with the structure packing alignment set to 4:
// save the current packing, switch to 4, include, then restore the saved packing.
#pragma pack(push)
#pragma pack(4)
#include "DeviceEnqueueInternalTypes.h"
#pragma pack(pop)