3333 lines
148 KiB
Common Lisp
3333 lines
148 KiB
Common Lisp
/*
|
|
* Copyright (C) 2017-2019 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
*/
|
|
|
|
#ifndef SCHEDULER_EMULATION
|
|
#include "device_enqueue.h"
|
|
#endif
|
|
|
|
// float passed as int
|
|
extern float __intel__getProfilingTimerResolution();
|
|
|
|
#ifndef EMULATION_ENTER_FUNCTION
|
|
#define EMULATION_ENTER_FUNCTION( )
|
|
#endif
|
|
|
|
#ifndef NULL
|
|
#define NULL 0
|
|
#endif
|
|
|
|
#define SIMD8 0
|
|
#define SIMD16 1
|
|
#define SIMD32 2
|
|
|
|
#define SHORT_SIZE_IN_BYTES 2
|
|
#define DWORD_SIZE_IN_BYTES 4
|
|
#define QWORD_SIZE_IN_BYTES 8
|
|
|
|
#define MAX_GLOBAL_ARGS 255
|
|
|
|
#define MASK_LOW16_BITS 0xFFFF
|
|
//Currently setting to 8
|
|
#define MAX_WALKERS_IN_PARALELL PARALLEL_SCHEDULER_HW_GROUPS
|
|
//Need 4 uints per walker ( command packet offset, slb offest, dsh offset, idt offset, + 1 to store total
|
|
#define PARALLEL_SCHEDULER_OFFSETS_NUMBER 4
|
|
#define PARALLEL_SCHEDULER_LOCAL_MEM_SIZE ( MAX_WALKERS_IN_PARALELL * PARALLEL_SCHEDULER_OFFSETS_NUMBER + 1 )
|
|
//Last index
|
|
#define TOTAL_ENQUEUES_FOUND ( PARALLEL_SCHEDULER_LOCAL_MEM_SIZE - 1 )
|
|
|
|
//CURBE STUFF, only entries that really needs to be patched
|
|
#define SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT 1
|
|
#define SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE 2
|
|
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE 3
|
|
#define SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS 4
|
|
#define SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS 5
|
|
#define SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES 8
|
|
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET 16
|
|
#define SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS 17
|
|
#define SCHEDULER_DATA_PARAMETER_PARENT_EVENT 22
|
|
#define SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE 28
|
|
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_WIDTH ( 9 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_HEIGHT ( 10 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_DEPTH ( 11 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE ( 12 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_ORDER ( 13 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_ARRAY_SIZE ( 18 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_NUM_SAMPLES ( 20 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS ( 27 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_OBJECT_ID ( 35 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER ( 39 + SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
|
|
#define DATA_PARAMETER_SAMPLER_ADDRESS_MODE ( 14 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
|
|
#define DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS ( 15 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
|
|
#define DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED ( 21 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
|
|
#define SCHEDULER_DATA_PARAMETER_SAMPLER_OBJECT_ID ( 35 + SCHEDULER_DATA_PARAMETER_SAMPLER_ADDED_VALUE )
|
|
|
|
//CURBE STUFF, only entries that really needs to be patched
|
|
#define SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK ( 1 << 1 )
|
|
#define SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK ( 1 << 2 )
|
|
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK ( 1 << 3 )
|
|
#define SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK ( 1 << 4 )
|
|
#define SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK ( 1 << 5 )
|
|
#define SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK ( 1 << 8 )
|
|
#define SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK ( 1 << 16 )
|
|
#define SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK ( 1 << 17 )
|
|
#define SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK ( 1 << 22 )
|
|
#define SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK ( 1 << 28 )
|
|
#define SCHEDULER_DATA_PARAMETER_IMAGE_CURBE_ENTRIES ( ( ulong ) 1 << SCHEDULER_DATA_PARAMETER_IMAGES_CURBE_SHIFT )
|
|
#define SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER ( ( ( ulong ) 1 ) << SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER_SHIFT )
|
|
|
|
|
|
#define SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ( ( ( ulong ) 1 ) << SCHEDULER_DATA_PARAMETER_SAMPLER_SHIFT )
|
|
//Error codes
|
|
#define SCHEDULER_CURBE_TOKEN_MISSED 10
|
|
#define SCHEDULER_CURBE_ARGUMENTS_SIZE_MISMATCH 11
|
|
|
|
#define CAN_BE_RECLAIMED 123456
|
|
|
|
#define SCHEDULER_MSF_INITIAL 1
|
|
#define SCHEDULER_MSF_SECOND 2
|
|
|
|
//Uncomment to enable logging debug data
|
|
//#define ENABLE_DEBUG_BUFFER 1
|
|
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
//Update DebugDataInfo types in device_enqueue.h and PrintDebugDataBuffer() in cmd_queue_device.cpp
|
|
|
|
//Flags
|
|
#define DDB_HAS_DATA_INFO ( 1 << 0 )
|
|
#define DDB_SCHEDULER_PROFILING ( 1 << 1 )
|
|
|
|
#define DDB_ALL ( 0xffffffff )
|
|
|
|
#endif
|
|
|
|
//Turn this to 1 to turn on debug calls, notice that it will cause up to 10 x longer time to build scheduler
|
|
//#define SCHEDULER_DEBUG_MODE 1
|
|
|
|
//#define DISABLE_RESOURCE_RECLAMATION 1
|
|
/*
|
|
Resource reclamation procedure
|
|
1. Move all new command packets from queue_t to qstorage
|
|
2. In case there is place in storage for whole queue, reclaim space on queue
|
|
3. Construct stack basing on new commands added in the qstorage
|
|
4. Browse stack to find next item for execution
|
|
5. When you take item from the stack and schedule it , reclaim place on qstorage buffer
|
|
*/
|
|
|
|
typedef struct
|
|
{
|
|
uint3 ActualLocalSize;
|
|
uint3 WalkerDimSize;
|
|
uint3 WalkerStartPoint;
|
|
} IGIL_WalkerData;
|
|
|
|
typedef struct
|
|
{
|
|
uint3 LocalWorkSize;
|
|
uint3 TotalDimSize;
|
|
IGIL_WalkerData WalkerArray[ 8 ];
|
|
} IGIL_WalkerEnumeration;
|
|
|
|
inline void patchDword( __global uint* patchedDword, uint startOffset, uint endOffset, uint value )
|
|
{
|
|
uint LeftMask = ALL_BITS_SET_DWORD_MASK >> ( DWORD_SIZE_IN_BITS - endOffset - 1 );
|
|
uint RightMask = ALL_BITS_SET_DWORD_MASK << ( startOffset );
|
|
uint CleanMask = ~( RightMask & LeftMask );
|
|
*patchedDword &= CleanMask;
|
|
*patchedDword |= ( value << startOffset );
|
|
}
|
|
|
|
inline __global IGIL_KernelAddressData* IGIL_GetKernelAddressData( __global IGIL_KernelDataHeader* pKernelReflection, uint blockId )
|
|
{
|
|
return ( __global IGIL_KernelAddressData* ) ( &pKernelReflection->m_data[ blockId ] );
|
|
}
|
|
__global IGIL_KernelData* IGIL_GetKernelData( __global IGIL_KernelDataHeader* pKernelReflection, uint blockId )
|
|
{
|
|
__global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
|
|
uint Offset = pKernelAddressData->m_KernelDataOffset;
|
|
__global char* pKernelReflectionRaw = ( __global char * ) pKernelReflection;
|
|
return ( __global IGIL_KernelData* ) ( &pKernelReflectionRaw[ Offset ] );
|
|
}
|
|
|
|
|
|
inline __global IGIL_CommandHeader* TEMP_IGIL_GetCommandHeader( __global IGIL_CommandQueue* q, uint offset )
|
|
{
|
|
__global uchar *pQueueRaw = (__global uchar *) q;
|
|
|
|
__global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + offset );
|
|
|
|
return pCommand;
|
|
}
|
|
//Make sure enough command packets are in command queue before calling this function.
|
|
__global IGIL_CommandHeader* TEMP_IGIL_GetNthCommandHeader( __global IGIL_CommandQueue* q, uint initialOffset, uint number )
|
|
{
|
|
__global uchar *pQueueRaw = (__global uchar *) q;
|
|
|
|
__global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + initialOffset );
|
|
uint Offset = initialOffset;
|
|
//Traverse queue_t unless nth command packet is found
|
|
while( number > 0 )
|
|
{
|
|
Offset += pCommand->m_commandSize;
|
|
pCommand = TEMP_IGIL_GetCommandHeader( q, Offset );
|
|
number--;
|
|
}
|
|
|
|
return pCommand;
|
|
}
|
|
|
|
//Make sure enough command packets are in command queue before calling this function.
|
|
uint TEMP_IGIL_GetNthCommandHeaderOffset( __global IGIL_CommandQueue* q, uint initialOffset, uint number )
|
|
{
|
|
__global uchar *pQueueRaw = (__global uchar *) q;
|
|
|
|
__global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + initialOffset );
|
|
uint Offset = initialOffset;
|
|
//Traverse queue_t unless nth command packet is found
|
|
while( number > 0 )
|
|
{
|
|
Offset += pCommand->m_commandSize;
|
|
pCommand = TEMP_IGIL_GetCommandHeader( q, Offset );
|
|
number--;
|
|
}
|
|
|
|
return Offset;
|
|
}
|
|
|
|
__global IGIL_CommandHeader* GetCommandHeaderFromStorage( __global uint* queueStorage, uint offset )
|
|
{
|
|
__global uchar *pQueueRaw = ( __global uchar * ) queueStorage;
|
|
|
|
__global IGIL_CommandHeader* pCommand = ( __global IGIL_CommandHeader* )( pQueueRaw + offset );
|
|
|
|
return pCommand;
|
|
}
|
|
|
|
inline queue_t TEMP_IGIL_GetQueueT( IGIL_CommandHeader * queuePtr )
|
|
{
|
|
return __builtin_astype( queuePtr, queue_t );
|
|
}
|
|
|
|
inline __global IGIL_DeviceEvent* TEMP_IGIL_GetDeviceEvents( __global IGIL_EventPool *pool )
|
|
{
|
|
return ( __global IGIL_DeviceEvent * )( pool + 1 );
|
|
}
|
|
|
|
inline __global IGIL_DeviceEvent* TEMP_IGIL_GetDeviceEvent( __global IGIL_EventPool *pool, uint eventId )
|
|
{
|
|
__global IGIL_DeviceEvent * pEvent = ( __global IGIL_DeviceEvent * )( pool + 1 );
|
|
|
|
return ( __global IGIL_DeviceEvent * )( pEvent + eventId );
|
|
}
|
|
bool SetEventState( __global IGIL_EventPool *pool, uint eventId, int state )
|
|
{
|
|
__global IGIL_DeviceEvent* pDeviceEvent = TEMP_IGIL_GetDeviceEvent( pool, eventId );
|
|
pDeviceEvent->m_state = state;
|
|
return true;
|
|
}
|
|
|
|
void TEMP_IGIL_FreeEvent( clk_event_t event, __global IGIL_EventPool *pool )
|
|
{
|
|
//Offset into the event data in the pool
|
|
__global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( pool );
|
|
|
|
atomic_xchg( &events[ ( uint )(size_t)__builtin_astype( event, void* ) ].m_state, IGIL_EVENT_UNUSED );
|
|
}
|
|
|
|
void IGILLOCAL_MEMCPY_GTOG( __global void* pDst, __global void* pSrc, int numBytes )
|
|
{
|
|
numBytes = numBytes >> 2;
|
|
for( int i = 0; i < numBytes; i++ )
|
|
{
|
|
( ( __global uint* )pDst ) [ i ] = ( ( __global int* )pSrc )[ i ];
|
|
}
|
|
}
|
|
|
|
//Global memcpy running on all witems possible, make sure it's run from all hw threads.
|
|
void GLOBAL_MEMCPYUINT( __global void* pDst, __global void* pSrc, int numBytes )
|
|
{
|
|
uint total_local_size = get_local_size( 0 );
|
|
uint LoopCtr = numBytes / ( total_local_size * DWORD_SIZE_IN_BYTES );
|
|
uint LeftOver = numBytes % ( total_local_size * DWORD_SIZE_IN_BYTES );
|
|
uint Lid = get_local_id( 0 );
|
|
uint i = 0;
|
|
|
|
//Main copy
|
|
for( i = 0; i < LoopCtr; i++ )
|
|
{
|
|
( ( __global uint* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint* )pSrc )[ Lid + total_local_size * i ];
|
|
}
|
|
//Copy what's left
|
|
if( LeftOver != 0 )
|
|
{
|
|
if( Lid * DWORD_SIZE_IN_BYTES < LeftOver )
|
|
{
|
|
( ( __global uint* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint* )pSrc )[ Lid + total_local_size * i ];
|
|
}
|
|
}
|
|
}
|
|
|
|
//In SIMD8 to fully use cachelines copy in portions of uint2 , 8 bytes x 8 witems = 64 bytes cacheline size.
|
|
//Global memcpy running on all witems possible, make sure it's run from all hw threads.
|
|
void GLOBAL_MEMCPY( __global void* pDst, __global void* pSrc, int numBytes )
|
|
{
|
|
//In case I need dword copy use uint version of this function.
|
|
if( ( numBytes % ( DWORD_SIZE_IN_BYTES * 2 ) ) != 0 )
|
|
{
|
|
GLOBAL_MEMCPYUINT( pDst, pSrc, numBytes );
|
|
}
|
|
else
|
|
{
|
|
uint total_local_size = get_local_size( 0 );
|
|
uint LoopCtr = numBytes / ( total_local_size * DWORD_SIZE_IN_BYTES * 2 );
|
|
uint LeftOver = numBytes % ( total_local_size * DWORD_SIZE_IN_BYTES * 2 );
|
|
uint Lid = get_local_id( 0 );
|
|
uint i = 0;
|
|
|
|
//Main copy
|
|
for( i = 0; i < LoopCtr; i++ )
|
|
{
|
|
( ( __global uint2* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint2* )pSrc )[ Lid + total_local_size * i ];
|
|
}
|
|
//Copy what's left
|
|
if( LeftOver != 0 )
|
|
{
|
|
if( Lid * DWORD_SIZE_IN_BYTES * 2 < LeftOver )
|
|
{
|
|
( ( __global uint2* )pDst ) [ Lid + total_local_size * i ] = ( ( __global uint2* )pSrc )[ Lid + total_local_size * i ];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//This works only for 32 bit types
|
|
uint GetNextPowerof2( uint number )
|
|
{
|
|
number --;
|
|
number |= number >> 1;
|
|
number |= number >> 2;
|
|
number |= number >> 4;
|
|
number |= number >> 8;
|
|
number |= number >> 16;
|
|
number ++;
|
|
return number;
|
|
}
|
|
|
|
#define OCLRT_ALIGN( a, b ) ( ( ( ( a ) % ( b ) ) != 0 ) ? ( ( a ) - ( ( a ) % ( b ) ) + ( b ) ) : ( a ) )
|
|
#define OCLRT_MAX( a, b ) ( ( ( a ) > ( b ) ) ? ( a ) : ( b ) )
|
|
|
|
#ifndef SCHEDULER_EMULATION
|
|
#include "scheduler_definitions.h"
|
|
#endif
|
|
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
//Adds private uint* data to debug buffer
|
|
// ddb - global buffer to keep data
|
|
// src - source data
|
|
// numberOfElements
|
|
// localID - WI ID , when 0xffffffff all WI copy data
|
|
// returns 0 when data was copied, else -1
|
|
int AddToDebugBufferParallel( __global DebugDataBuffer* ddb, uint* pSrc, uint numberOfElements, uint localID )
|
|
{
|
|
if( ddb->m_flags == 0 )
|
|
{
|
|
//All work items in local group copies data
|
|
if( localID == 0xffffffff )
|
|
{
|
|
//Check if there is enough place for new data in ddb for every workitem in local group
|
|
if( ( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) * get_local_size(0) ) >= numberOfElements )
|
|
{
|
|
uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
|
|
uint i;
|
|
int srcPos;
|
|
for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
|
|
{
|
|
ddb->m_data[ i ] = pSrc[ srcPos ];
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//Check if there is enough place for new data in ddb for one workitem in local group
|
|
if( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) >= numberOfElements )
|
|
{
|
|
if( get_local_id(0) == localID )
|
|
{
|
|
uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
|
|
uint i;
|
|
int srcPos;
|
|
for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
|
|
{
|
|
ddb->m_data[ i ] = pSrc[ srcPos ];
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
//Adds private uint to debug buffer
|
|
int AddToDBParallel(__global DebugDataBuffer* ddb, uint pSrc, uint localID)
|
|
{
|
|
return AddToDebugBufferParallel(ddb, &pSrc, 1, localID);
|
|
}
|
|
|
|
//Adds global uint* data to debug buffer
|
|
// ddb - global buffer to keep data
|
|
// src - source data
|
|
// numberOfElements
|
|
// localID - WI ID , when 0xffffffff all WI copy data
|
|
// returns 0 when data was copied, else -1
|
|
int AddGlobalToDebugBufferParallel( __global DebugDataBuffer* ddb, __global uint* pSrc, uint numberOfElements, uint localID )
|
|
{
|
|
if( ddb->m_flags == 0 )
|
|
{
|
|
//All work items in local group copies data
|
|
if( localID == 0xffffffff )
|
|
{
|
|
//Check if there is enough place for new data in ddb for every workitem in local group
|
|
if( ( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) * get_local_size(0) ) >= numberOfElements )
|
|
{
|
|
uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
|
|
uint i;
|
|
int srcPos;
|
|
for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
|
|
{
|
|
ddb->m_data[ i ] = pSrc[ srcPos ];
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//Check if there is enough place for new data in ddb for one workitem in local group
|
|
if( ( ddb->m_size / sizeof( uint ) - ddb->m_offset ) >= numberOfElements )
|
|
{
|
|
if( get_local_id(0) == localID )
|
|
{
|
|
uint startIndex = atomic_add( &ddb->m_offset, numberOfElements );
|
|
uint i;
|
|
int srcPos;
|
|
for( i = startIndex, srcPos = 0; i < startIndex + numberOfElements; i++, srcPos++ )
|
|
{
|
|
ddb->m_data[ i ] = pSrc[ srcPos ];
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
|
|
|
|
//Adds private uint data to debug buffer
|
|
// ddb - global buffer to keep data
|
|
// src - source data
|
|
// dataType - enum defining added data type
|
|
// returns 0 when data was copied, else -1
|
|
int AddToDebugBuffer( __global DebugDataBuffer* ddb, __private ulong src, uint dataType, uint localID )
|
|
{
|
|
if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
|
|
{
|
|
//All work items in local group copies data
|
|
if( localID == 0xffffffff )
|
|
{
|
|
//Check if there is enough place for new data in ddb
|
|
if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= 4 * get_local_size( 0 ) ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
|
|
{
|
|
//Check flags
|
|
if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
|
|
{
|
|
uint startIndex = atomic_add( &ddb->m_offset, 1 );
|
|
uint dataIndex = atomic_sub( &ddb->m_dataInfoTop, ddb->m_dataInfoSize );
|
|
uint stackTop = atomic_add( &ddb->m_stackTop, 8 );
|
|
|
|
__global uchar* pCharDebugQueue = ( __global uchar * )ddb;
|
|
__global uint* dest = ( __global uint* )&pCharDebugQueue[ stackTop ];
|
|
__global DebugDataInfo* debugInfo;
|
|
|
|
dest[ 0 ] = ( uint )src & 0xffffffff;
|
|
dest[ 1 ] = ( src & 0xffffffff00000000 ) >> 32;
|
|
debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ dataIndex ] );
|
|
debugInfo->m_dataType = ( DebugDataTypes )dataType;
|
|
debugInfo->m_dataSize = 8;
|
|
ddb->m_flags |= DDB_HAS_DATA_INFO;
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//Check if there is enough place for new data in ddb
|
|
if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= 8 ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
|
|
{
|
|
if( get_local_id( 0 ) == localID )
|
|
{
|
|
uint startIndex = atomic_add( &ddb->m_offset, 1 );
|
|
uint dataIndex = atomic_sub( &ddb->m_dataInfoTop, ddb->m_dataInfoSize );
|
|
uint stackTop = atomic_add( &ddb->m_stackTop, 8 );
|
|
|
|
__global uchar* pCharDebugQueue = ( __global uchar * )ddb;
|
|
__global uint* dest = ( __global uint* )&pCharDebugQueue[ stackTop ];
|
|
__global DebugDataInfo* debugInfo;
|
|
|
|
dest[ 0 ] = ( uint )src & 0xffffffff;
|
|
dest[ 1 ] = ( src & 0xffffffff00000000 ) >> 32;
|
|
debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ dataIndex ] );
|
|
debugInfo->m_dataType = ( DebugDataTypes )dataType;
|
|
debugInfo->m_dataSize = 8;
|
|
ddb->m_flags |= DDB_HAS_DATA_INFO;
|
|
return 0;
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
//Adds data to debug buffer
|
|
// ddb - global buffer to keep data
|
|
// src - source
|
|
// bytes - number of bytes from src to put into ddb
|
|
// dataType - enum defining added data type
|
|
// returns 0 when data was copied, else -1
|
|
int AddGlobalToDebugBuffer( __global DebugDataBuffer* ddb, __global uchar* src, uint bytes, uint dataType )
|
|
{
|
|
if( get_local_id( 0 ) )
|
|
{
|
|
//Check if there is enough place for new data in ddb
|
|
if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= bytes ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
|
|
{
|
|
//Check flags
|
|
if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
|
|
{
|
|
__global uchar* pCharDebugQueue = ( __global uchar * )ddb;
|
|
__global DebugDataInfo* debugInfo;
|
|
IGILLOCAL_MEMCPY_GTOG( ( &pCharDebugQueue[ ddb->m_stackTop ] ), ( src ), ( int )bytes );
|
|
debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ ddb->m_dataInfoTop ] );
|
|
debugInfo->m_dataType = ( DebugDataTypes )dataType;
|
|
debugInfo->m_dataSize = bytes;
|
|
|
|
ddb->m_dataInfoTop = ddb->m_dataInfoTop - ddb->m_dataInfoSize;
|
|
ddb->m_stackTop = ddb->m_stackTop + bytes;
|
|
ddb->m_offset = ddb->m_offset + ( bytes / 4 );
|
|
ddb->m_flags |= DDB_HAS_DATA_INFO;
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
int AddGlobalToDebugBufferAllIds( __global DebugDataBuffer* ddb, __global uchar* src, uint bytes, uint dataType, uint localID )
|
|
{
|
|
if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
|
|
{
|
|
//Check if there is enough place for new data in ddb
|
|
if( ( ( ddb->m_stackTop - ddb->m_dataInfoTop ) >= bytes ) && ( ddb->m_dataInfoTop > ddb->m_stackTop ) )
|
|
{
|
|
//Check flags
|
|
if( ddb->m_flags == 0 || ddb->m_flags == DDB_HAS_DATA_INFO )
|
|
{
|
|
__global uchar* pCharDebugQueue = ( __global uchar * )ddb;
|
|
__global DebugDataInfo* debugInfo;
|
|
IGILLOCAL_MEMCPY_GTOG( ( &pCharDebugQueue[ ddb->m_stackTop ] ), ( src ), ( int )bytes );
|
|
debugInfo = ( __global DebugDataInfo* )( &pCharDebugQueue[ ddb->m_dataInfoTop ] );
|
|
debugInfo->m_dataType = ( DebugDataTypes )dataType;
|
|
debugInfo->m_dataSize = bytes;
|
|
|
|
ddb->m_dataInfoTop = ddb->m_dataInfoTop - ddb->m_dataInfoSize;
|
|
ddb->m_stackTop = ddb->m_stackTop + bytes;
|
|
ddb->m_offset = ddb->m_offset + ( bytes / 4 );
|
|
ddb->m_flags |= DDB_HAS_DATA_INFO;
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
#endif
|
|
|
|
#define MAX_SLB_OFFSET ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * SECOND_LEVEL_BUFFER_NUMBER_OF_ENQUEUES )
|
|
|
|
#ifndef SCHEDULER_EMULATION
|
|
#include "scheduler_igdrcl_built_in.inl"
|
|
#endif
|
|
|
|
//SOME COMMON CODE FUNCTIONS
|
|
//COMMON CODE STARTS HERE
|
|
//Not thread safe - make sure it's called in thread safe fashion.
|
|
|
|
void patchIDData( __global char* dsh,
|
|
uint blockId,
|
|
uint numberOfHwThreads,
|
|
uint slmSize )
|
|
{
|
|
__global char* DSHIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + 1 ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
|
|
__global uint* DSHIntData = ( __global uint* )( DSHIData );
|
|
//Barrier enable is pre-patched on the host.
|
|
patchDword( ( &DSHIntData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads );
|
|
|
|
//Patch SLM.
|
|
uint SLMPatchValue = GetPatchValueForSLMSize( slmSize );
|
|
patchDword( ( &DSHIntData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, SLMPatchValue );
|
|
}
|
|
/*
|
|
this is how it works :
|
|
When constructing primary batch, first IDT table is also constructed, for all blocks, it is constructed as follows:
|
|
[0] - parent id
|
|
[1 .. x ] block id
|
|
[last aligned ] scheduler
|
|
|
|
now when we enter SLB, we forgot about first IDT, and we point all interface descriptor loads to point at scheduler which was last in the first IDT, to be first in the new IDT.
|
|
|
|
This way we can copy Interface Descriptors for blocks from the first IDT and assign Interface Descriptors dynamically in scheduler.
|
|
*/
|
|
|
|
void CopyAndPatchIDData( __global char* dsh,
|
|
uint blockId,
|
|
uint numberOfHwThreads,
|
|
uint slmSize,
|
|
uint interfaceDescriptorOffset,
|
|
uint blockStartId )
|
|
{
|
|
__global char* DSHIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + blockStartId ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
|
|
__global uint* DSHIntData = ( __global uint* )( DSHIData );
|
|
|
|
//Copy to ID InterfaceDescriptorOffset
|
|
__global char* DSHDestIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( IDT_BREAKDOWN + interfaceDescriptorOffset ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
|
|
__global uint* DSHDestIntData = ( __global uint* )( DSHDestIData );
|
|
__global uint* DSHDestIntStartData = DSHDestIntData;
|
|
|
|
for( int i = 0; i < ( SIZEOF_INTERFACE_DESCRIPTOR_DATA / 4 ); i++ )
|
|
{
|
|
DSHDestIntData[ i ] = DSHIntData[ i ];
|
|
}
|
|
|
|
//Barrier enable is pre-patched on the host.
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads );
|
|
|
|
//Patch SLM.
|
|
uint SLMPatchValue = GetPatchValueForSLMSize( slmSize );
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, SLMPatchValue );
|
|
}
|
|
|
|
void CopyAndPatchIDData20( __global char* dsh,
|
|
uint blockId,
|
|
uint numberOfHwThreads,
|
|
uint slmSize,
|
|
uint interfaceDescriptorOffset,
|
|
uint blockStartId,
|
|
uint bToffset,
|
|
uint dshOffset,
|
|
uint numOfSamplers
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
, __global DebugDataBuffer* DebugQueue
|
|
#endif
|
|
)
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
__global char* DSHIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( blockId + blockStartId ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
|
|
__global uint* DSHIntData = ( __global uint* )( DSHIData );
|
|
|
|
//Copy to ID InterfaceDescriptorOffset
|
|
__global char* DSHDestIData = ( __global char* )( dsh + SIZEOF_COLOR_CALCULATOR_STATE + ( ( IDT_BREAKDOWN + interfaceDescriptorOffset ) * SIZEOF_INTERFACE_DESCRIPTOR_DATA ) );
|
|
__global uint* DSHDestIntData = ( __global uint* )( DSHDestIData );
|
|
__global uint* DSHDestIntStartData = DSHDestIntData;
|
|
|
|
for( int i = 0; i < ( SIZEOF_INTERFACE_DESCRIPTOR_DATA / 4 ); i++ )
|
|
{
|
|
DSHDestIntData[ i ] = DSHIntData[ i ];
|
|
}
|
|
|
|
//Barrier enable is pre-patched on the host.
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 0, INTERFACE_DESCRIPTOR_HWTHREADS_UPPER_BIT, numberOfHwThreads );
|
|
|
|
//Patch BT offset
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_BINDING_TABLE_POINTER_DWORD ] ), 5, 15, ( bToffset >> 5 ) );
|
|
|
|
//Patch SLM.
|
|
uint PatchValue = GetPatchValueForSLMSize( slmSize );
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_HWTHREADS_NUMBER_DWORD ] ), 16, 20, PatchValue );
|
|
|
|
PatchValue = ( DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] & 0xffffffe0 ) + ( dshOffset );
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] ), 5, 31, ( ( PatchValue ) >> 5 ) );
|
|
|
|
//Samplers in multiple of 4
|
|
numOfSamplers = ( numOfSamplers + 3 ) / 4;
|
|
patchDword( ( &DSHDestIntStartData[ INTERFACE_DESCRIPTOR_SAMPLER_STATE_TABLE_DWORD ] ), 2, 4, numOfSamplers );
|
|
}
|
|
|
|
|
|
void patchGpGpuWalker(
|
|
uint secondLevelBatchOffset,
|
|
__global uint* secondaryBatchBuffer,
|
|
uint interfaceDescriptorOffset,
|
|
uint simdSize,
|
|
uint totalLocalWorkSize,
|
|
uint3 dimSize,
|
|
uint3 startPoint,
|
|
uint numberOfHwThreadsPerWg,
|
|
uint indirectPayloadSize,
|
|
uint ioHoffset )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
//SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
|
|
uint CmdPacketStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
|
|
//INTERFACE_DESCRIPTOR for GPGPU_WALKER
|
|
//INTERFACE DESCRIPTOR is one plus the block id
|
|
uint PatchOffset = CmdPacketStart + GPGPU_WALKER_INTERFACE_DESCRIPTOR_ID_OFFSET;
|
|
//Patch id data
|
|
patchDword( &( secondaryBatchBuffer[ PatchOffset ] ),
|
|
0, 5, ( interfaceDescriptorOffset ) );
|
|
PatchOffset = CmdPacketStart + GPGPU_WALKER_THREAD_WIDTH_DWORD;
|
|
//THREAD_WIDTH for GPGPU_WALKER
|
|
patchDword( &( secondaryBatchBuffer[ PatchOffset ] ),
|
|
0, 5, ( numberOfHwThreadsPerWg - 1 ) );
|
|
|
|
PatchOffset = CmdPacketStart + GPGPU_WALKER_SIMDSIZE_DWORD;
|
|
|
|
//SIMD SIZE for GPGPU_WALKER
|
|
//Double Check the bits for SIMDSize
|
|
if( simdSize == 8 )
|
|
{
|
|
patchDword( &( secondaryBatchBuffer[ PatchOffset ] ),
|
|
30, 31, SIMD8 );
|
|
}
|
|
else if ( simdSize == 16 )
|
|
{
|
|
patchDword( &( secondaryBatchBuffer[ PatchOffset ] ),
|
|
30, 31, SIMD16 );
|
|
}
|
|
else
|
|
{
|
|
patchDword( &( secondaryBatchBuffer[ PatchOffset ] ),
|
|
30, 31, SIMD32 );
|
|
}
|
|
|
|
//XDIM for GPGPU_WALKER
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_XDIM_DWORD ] = dimSize.x;
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_GROUP_ID_START_X ] = startPoint.x;
|
|
//YDIM
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_YDIM_DWORD ] = dimSize.y;
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_GROUP_ID_START_Y ] = startPoint.y;
|
|
//ZDIM for GPGPU_WALKER
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_ZDIM_DWORD ] = dimSize.z;
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_GROUP_ID_START_Z ] = startPoint.z;
|
|
|
|
//XMASK for GPGPU_WALKER
|
|
uint mask = ( 1 << ( totalLocalWorkSize % simdSize ) ) - 1;
|
|
if( mask == 0 )
|
|
mask = ~0;
|
|
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_XMASK_DWORD ] = mask;
|
|
|
|
//YMASK for GPGPU_WALKER
|
|
uint YMask = ~0;
|
|
|
|
secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_YMASK_DWORD ] = YMask;
|
|
|
|
|
|
patchDword( &( secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_INDIRECT_DATA_LENGTH_OFFSET ] ),
|
|
0, 16, indirectPayloadSize );
|
|
|
|
patchDword( &( secondaryBatchBuffer[ CmdPacketStart + GPGPU_WALKER_INDIRECT_START_ADDRESS_OFFSET ] ),
|
|
0, 31, ioHoffset );
|
|
}
|
|
|
|
int PatchMediaStateFlush(
|
|
uint secondLevelBatchOffset,
|
|
__global uint* secondaryBatchBuffer,
|
|
uint interfaceDescriptorOffset,
|
|
uint msfNumber )
|
|
{
|
|
//SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
|
|
uint CmdPacketStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
|
|
uint MsfOffset;
|
|
|
|
if( msfNumber == SCHEDULER_MSF_INITIAL )
|
|
{
|
|
MsfOffset = MEDIA_STATE_FLUSH_INITIAL_INTERFACE_DESCRIPTOR_OFFSET;
|
|
}
|
|
else if ( msfNumber == SCHEDULER_MSF_SECOND )
|
|
{
|
|
MsfOffset = MEDIA_STATE_FLUSH_INTERFACE_DESCRIPTOR_OFFSET;
|
|
}
|
|
else
|
|
{
|
|
return -1;
|
|
}
|
|
patchDword( &( secondaryBatchBuffer[ CmdPacketStart + MsfOffset ] ), 0, 5, interfaceDescriptorOffset );
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if defined WA_LRI_COMMANDS_EXIST
|
|
void PatchMiLoadRegisterImm(
|
|
uint secondLevelBatchOffset,
|
|
__global uint* secondaryBatchBuffer,
|
|
uint enqueueOffset,
|
|
uint registerAddress,
|
|
uint value )
|
|
{
|
|
//SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
|
|
uint CmdPacketStart = secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
|
|
|
|
secondaryBatchBuffer[ CmdPacketStart + enqueueOffset ] = OCLRT_LOAD_REGISTER_IMM_CMD;
|
|
patchDword( &( secondaryBatchBuffer[ CmdPacketStart + enqueueOffset + IMM_LOAD_REGISTER_ADDRESS_DWORD_OFFSET ] ), 2, 22, registerAddress >> 2 );
|
|
secondaryBatchBuffer[ CmdPacketStart + enqueueOffset + IMM_LOAD_REGISTER_VALUE_DWORD_OFFSET ] = value;
|
|
}
|
|
|
|
void AddMiLoadRegisterImm(
|
|
__global uint* secondaryBatchBuffer,
|
|
__private uint* dwordOffset,
|
|
uint value )
|
|
{
|
|
secondaryBatchBuffer[ *dwordOffset ] = OCLRT_LOAD_REGISTER_IMM_CMD;
|
|
( *dwordOffset )++;
|
|
secondaryBatchBuffer[ *dwordOffset ] = 0;
|
|
patchDword( &( secondaryBatchBuffer[ *dwordOffset ] ), 2, 22, CTXT_PREMP_DBG_ADDRESS_VALUE >> 2 );
|
|
( *dwordOffset )++;
|
|
secondaryBatchBuffer[ *dwordOffset ] = value; //CTXT_PREMP_ON_MI_ARB_CHECK_ONLY or CTXT_PREMP_DEFAULT_VALUE
|
|
( *dwordOffset )++;
|
|
}
|
|
|
|
void SetDisablePreemptionRegister(
|
|
uint secondLevelBatchOffset,
|
|
__global uint* secondaryBatchBuffer )
|
|
{
|
|
PatchMiLoadRegisterImm( secondLevelBatchOffset,
|
|
secondaryBatchBuffer,
|
|
IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET,
|
|
CTXT_PREMP_DBG_ADDRESS_VALUE,
|
|
CTXT_PREMP_ON_MI_ARB_CHECK_ONLY );
|
|
}
|
|
|
|
void SetEnablePreemptionRegister(
|
|
uint secondLevelBatchOffset,
|
|
__global uint* secondaryBatchBuffer )
|
|
{
|
|
PatchMiLoadRegisterImm( secondLevelBatchOffset,
|
|
secondaryBatchBuffer,
|
|
IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET,
|
|
CTXT_PREMP_DBG_ADDRESS_VALUE,
|
|
CTXT_PREMP_DEFAULT_VALUE );
|
|
}
|
|
|
|
void NoopPreemptionCommand(
|
|
uint secondLevelBatchOffset,
|
|
uint cmdOffset,
|
|
__global uint* secondaryBatchBuffer )
|
|
{
|
|
uint CmdPacketStart = cmdOffset + secondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
|
|
for( int i = 0; i < OCLRT_IMM_LOAD_REGISTER_CMD_DEVICE_CMD_DWORD_OFFSET; i++ )
|
|
{
|
|
secondaryBatchBuffer[ CmdPacketStart + i ] = 0;
|
|
}
|
|
}
|
|
#endif //WA_LRI_COMMANDS_EXIST
|
|
|
|
|
|
//PQueue is needed for SLBOffset
|
|
void AddCmdsInSLBforScheduler20Parallel( uint slbOffset,
|
|
__global IGIL_CommandQueue* pQueue,
|
|
__global uint * secondaryBatchBuffer,
|
|
__global char * dsh )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
#ifdef SCHEDULER_EMULATION
|
|
uint3 StartPoint = { 0, 0, 0 };
|
|
uint3 DimSize = { get_num_groups( 0 ), 1, 1 };
|
|
#else
|
|
uint3 StartPoint = ( uint3 )( 0 );
|
|
uint3 DimSize = ( uint3 )( get_num_groups( 0 ), 1, 1 );
|
|
#endif
|
|
patchGpGpuWalker( slbOffset,
|
|
secondaryBatchBuffer,
|
|
0,
|
|
PARALLEL_SCHEDULER_COMPILATION_SIZE_20,
|
|
get_local_size(0),
|
|
DimSize,
|
|
StartPoint,
|
|
PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20,
|
|
SIZEOF_3GRFS * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 + pQueue->m_controls.m_SchedulerConstantBufferSize,
|
|
pQueue->m_controls.m_SchedulerDSHOffset );
|
|
|
|
PatchMediaStateFlush( slbOffset, secondaryBatchBuffer, 0, SCHEDULER_MSF_INITIAL );
|
|
PatchMediaStateFlush( slbOffset, secondaryBatchBuffer, 0, SCHEDULER_MSF_SECOND );
|
|
|
|
//When commands exists and scheduler does not require preemption off, noop the commands space
|
|
#if defined WA_LRI_COMMANDS_EXIST
|
|
#if defined WA_SCHEDULER_PREEMPTION
|
|
if( pQueue->m_controls.m_EventTimestampAddress == 0u )
|
|
{
|
|
SetEnablePreemptionRegister( slbOffset, secondaryBatchBuffer );
|
|
SetDisablePreemptionRegister( slbOffset, secondaryBatchBuffer );
|
|
}
|
|
else
|
|
{
|
|
NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
|
|
NoopPreemptionCommand( slbOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
|
|
}
|
|
#else
|
|
//This is case, where LRI preemption is not required around scheduler WALKERs, but space for LRI commands exists, make sure they are nooped then
|
|
NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
|
|
NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
|
|
#endif //WA_SCHEDULER_PREEMPTION
|
|
#endif //WA_LRI_COMMANDS_EXIST
|
|
}
|
|
|
|
int generateLocalIDSParallel20(
|
|
__global char* dsh,
|
|
uint3 localSize,
|
|
uint hwThreads,
|
|
uint simdSize )
|
|
{
|
|
uint it, currX, currY, currZ, FlattendID;
|
|
|
|
uint Max = 1;
|
|
if( simdSize == 32 )
|
|
{
|
|
Max = 2;
|
|
}
|
|
|
|
//Update full GRFs, each WI generate ID for one work item in x,y and z
|
|
//in case we generate SIMD8 payload using 16 wi , idle half of them
|
|
if( get_local_id( 0 ) < simdSize )
|
|
{
|
|
for( it = 0; it < hwThreads; it++ )
|
|
{
|
|
for( uint multip = 0; multip < Max; multip++ )
|
|
{
|
|
//We are in simd 8, each wi process generation for 1 wi
|
|
FlattendID = get_local_id( 0 ) + it * simdSize + 16 * ( multip );
|
|
|
|
currX = FlattendID % localSize.x;
|
|
currY = ( FlattendID / localSize.x ) % localSize.y;
|
|
currZ = ( FlattendID / ( localSize.x * localSize.y ) );//not needed % localSize.z;
|
|
|
|
*( __global ushort * )( dsh + get_local_id( 0 ) * 2 + it * GRF_SIZE * 3 * Max + multip * GRF_SIZE ) = ( ushort )currX;
|
|
*( __global ushort * )( dsh + get_local_id( 0 ) * 2 + it * GRF_SIZE * 3 * Max + GRF_SIZE * Max + multip * GRF_SIZE ) = ( ushort )currY;
|
|
*( __global ushort * )( dsh + get_local_id( 0 ) * 2 + it * GRF_SIZE * 3 * Max + GRF_SIZE * Max + GRF_SIZE * Max + multip * GRF_SIZE ) = ( ushort )currZ;
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
//Function generate local ids.
|
|
//SIMD16 version
|
|
int generateLocalIDSsimd16(
|
|
__global char* dsh,
|
|
uint3 localSize,
|
|
uint hwThreads)
|
|
{
|
|
typedef union
|
|
{
|
|
ushort16 vectors;
|
|
ushort varray[ 16 ];
|
|
}vectorUnion;
|
|
|
|
__private vectorUnion LidX;
|
|
__private vectorUnion LidY;
|
|
__private vectorUnion LidZ;
|
|
|
|
__private ushort currX = 0;
|
|
__private ushort currY = 0;
|
|
__private ushort currZ = 0;
|
|
|
|
//Assuming full load of hw thread , remainder done separately
|
|
for(uint it = 0; it < hwThreads; it++ )
|
|
{
|
|
//This will be unrolled by compiler
|
|
for(uint x = 0; x < 16; x++ )
|
|
{
|
|
LidX.varray[ x ] = currX++;
|
|
LidY.varray[ x ] = currY;
|
|
LidZ.varray[ x ] = currZ;
|
|
|
|
if( currX == localSize.x )
|
|
{
|
|
currX = 0;
|
|
currY++;
|
|
}
|
|
|
|
if( currY == localSize.y )
|
|
{
|
|
currY = 0;
|
|
currZ++;
|
|
}
|
|
}
|
|
|
|
*( __global ushort16 * )( dsh + it * GRF_SIZE * 3 ) = LidX.vectors;
|
|
*( __global ushort16 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE ) = LidY.vectors;
|
|
*( __global ushort16 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE + GRF_SIZE ) = LidZ.vectors;
|
|
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
//Function generate local ids.
|
|
//SIMD8 version
|
|
int generateLocalIDSsimd8(
|
|
__global char* dsh,
|
|
uint3 localSize,
|
|
uint hwThreads)
|
|
{
|
|
typedef union
|
|
{
|
|
ushort8 vectors;
|
|
ushort varray[ 8 ];
|
|
}vectorUnion;
|
|
|
|
__private vectorUnion LidX;
|
|
__private vectorUnion LidY;
|
|
__private vectorUnion LidZ;
|
|
|
|
__private ushort currX = 0;
|
|
__private ushort currY = 0;
|
|
__private ushort currZ = 0;
|
|
|
|
//Assuming full load of hw thread , remainder done separately
|
|
for(uint it = 0; it < hwThreads; it++ )
|
|
{
|
|
//This will be unrolled by compiler
|
|
for(uint x = 0; x < 8; x++ )
|
|
{
|
|
LidX.varray[ x ] = currX++;
|
|
LidY.varray[ x ] = currY;
|
|
LidZ.varray[ x ] = currZ;
|
|
|
|
if( currX == localSize.x )
|
|
{
|
|
currX = 0;
|
|
currY++;
|
|
}
|
|
|
|
if( currY == localSize.y )
|
|
{
|
|
currY = 0;
|
|
currZ++;
|
|
}
|
|
}
|
|
|
|
*( __global ushort8 * )( dsh + it * GRF_SIZE * 3 ) = LidX.vectors;
|
|
*( __global ushort8 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE ) = LidY.vectors;
|
|
*( __global ushort8 * )( dsh + it * GRF_SIZE * 3 + GRF_SIZE + GRF_SIZE ) = LidZ.vectors;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
//Function patches a curbe parametr , this version of function supports only these curbe tokens that may appear only once
|
|
int PatchDSH1Token( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
uint value )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
uint PatchOffset;
|
|
#if SCHEDULER_DEBUG_MODE
|
|
//If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_parameterType != TokenType )
|
|
{
|
|
return -1;
|
|
}
|
|
#endif
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value;
|
|
|
|
currentIndex++;
|
|
return currentIndex;
|
|
}
|
|
|
|
int PatchLocalMemEntities( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
__global IGIL_CommandHeader* pCommand )
|
|
{
|
|
uint PatchOffset;
|
|
#if SCHEDULER_DEBUG_MODE
|
|
//If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_parameterType != TokenType )
|
|
{
|
|
return -1;
|
|
}
|
|
#endif
|
|
//First patch is with 0
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
|
|
//SUM_OF_LOCAL_MEMORY_KERNEL_ARGS can be a 4 or 8 byte patch
|
|
if( pKernelCurbeParams[currentIndex].m_parameterSize == sizeof( ulong ) )
|
|
{
|
|
*( __global ulong * )( &pDsh[PatchOffset] ) = 0;
|
|
}
|
|
else
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = 0;
|
|
}
|
|
|
|
|
|
currentIndex++;
|
|
uint Alignement;
|
|
uint iter = 0;
|
|
uint CurrentSum = 0;
|
|
uint CurrentValue;
|
|
//For each global captured there will be uint with index and ulong with address.
|
|
uint GlobalPointersSize = ( pCommand->m_numGlobalCapturedBuffer * ( sizeof( ulong ) + sizeof( uint ) ) ) / sizeof( uint );
|
|
|
|
__global uint* pLocalMemSizes = &pCommand->m_data[ pCommand->m_numDependencies + pCommand->m_numScalarArguments + GlobalPointersSize ];
|
|
//Check if there is second surface
|
|
while( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
|
|
//Value needs to be aligned to the value stored in sourceoffset
|
|
Alignement = OCLRT_MAX( DWORD_SIZE_IN_BYTES, pKernelCurbeParams[ currentIndex ].m_sourceOffset );
|
|
|
|
CurrentValue = pLocalMemSizes[ iter ];
|
|
CurrentValue = OCLRT_ALIGN( CurrentValue, Alignement );
|
|
CurrentSum += CurrentValue;
|
|
|
|
//SUM_OF_LOCAL_MEMORY_KERNEL_ARGS can be a 4 or 8 byte patch
|
|
if( pKernelCurbeParams[currentIndex].m_parameterSize == sizeof( ulong ) )
|
|
{
|
|
*( __global ulong * )( &pDsh[PatchOffset] ) = ( ulong )CurrentSum;
|
|
}
|
|
else
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = CurrentSum;
|
|
}
|
|
|
|
currentIndex++;
|
|
iter++;
|
|
}
|
|
return currentIndex;
|
|
}
|
|
|
|
//Function patches a curbe parametr , this version of function supports only these curbe tokens that may appear only once
|
|
int PatchDSH1TokenParallel20( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
uint value )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
uint PatchOffset;
|
|
if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
*( __global uint * ) ( &pDsh[ PatchOffset ] ) = value;
|
|
}
|
|
currentIndex++;
|
|
return currentIndex;
|
|
}
|
|
|
|
//Function patches a curbe parametr, this version of function works on 3d curbe tokens
|
|
//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed
|
|
int PatchDSH6TokensParallel20( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
uint value1, uint value2, uint value3 )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
uint PatchOffset, SourceOffset;
|
|
uint WorkingOffset;
|
|
uint ShiftSize;
|
|
|
|
//Check if we patch 3 or 6 curbe tokens
|
|
if( pKernelCurbeParams[ currentIndex + 3 ].m_parameterType == tokenType )
|
|
{
|
|
ShiftSize = 6;
|
|
}
|
|
else
|
|
{
|
|
ShiftSize = 3;
|
|
}
|
|
|
|
if( get_local_id( 0 ) < PARALLEL_SCHEDULER_COMPILATION_SIZE_20 + ShiftSize )
|
|
{
|
|
WorkingOffset = currentIndex + get_local_id( 0 ) - PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
|
|
PatchOffset = pKernelCurbeParams[ WorkingOffset ].m_patchOffset;
|
|
SourceOffset = pKernelCurbeParams[ WorkingOffset ].m_sourceOffset;
|
|
|
|
if( SourceOffset == 0 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
|
|
}
|
|
else if( SourceOffset == 4 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
|
|
}
|
|
else if( SourceOffset == 8 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
|
|
}
|
|
}
|
|
|
|
currentIndex += ShiftSize;
|
|
return currentIndex;
|
|
}
|
|
|
|
int PatchLocalWorkSizes( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
uint enqLocalX, uint enqLocalY, uint enqLocalZ, uint cutLocalX, uint cutLocalY, uint cutLocalZ )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
uint PatchOffset, SourceOffset;
|
|
|
|
//Tokens are sorted by m_sourceOffset, it means that first 3 keys are always used to compute global_id and are always present
|
|
for( uint it = 0; it < 3; it++ )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;
|
|
|
|
if( SourceOffset == 0 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalX;
|
|
}
|
|
else if( SourceOffset == 4 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalY;
|
|
}
|
|
else if( SourceOffset == 8 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = enqLocalZ;
|
|
}
|
|
currentIndex++;
|
|
}
|
|
//If there are 3 more tokens, it means that get_local_size is used within a kernel, to deal with it patch with the second set of variables
|
|
if( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
|
|
{
|
|
for( uint it = 0; it < 3; it++ )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;
|
|
|
|
if( SourceOffset == 0 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalX;
|
|
}
|
|
else if( SourceOffset == 4 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalY;
|
|
}
|
|
else if( SourceOffset == 8 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = cutLocalZ;
|
|
}
|
|
currentIndex++;
|
|
}
|
|
}
|
|
return currentIndex;
|
|
}
|
|
|
|
//Function patches a curbe parametr, this version of function works on 3d curbe tokens
|
|
//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed
|
|
int PatchLocalWorkSizesParallel( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
uint enqLocalX, uint enqLocalY, uint enqLocalZ, uint cutLocalX, uint cutLocalY, uint cutLocalZ )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
uint ShiftSize;
|
|
|
|
//Check if we patch 3 or 6 curbe tokens
|
|
if( pKernelCurbeParams[ currentIndex + 3 ].m_parameterType == tokenType )
|
|
{
|
|
ShiftSize = 6;
|
|
}
|
|
else
|
|
{
|
|
ShiftSize = 3;
|
|
}
|
|
|
|
//Use single threaded version
|
|
if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
|
|
{
|
|
PatchLocalWorkSizes( currentIndex, SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE, pKernelCurbeParams, pDsh, enqLocalX, enqLocalY, enqLocalZ, cutLocalX, cutLocalY, cutLocalZ );
|
|
}
|
|
|
|
currentIndex += ShiftSize;
|
|
return currentIndex;
|
|
}
|
|
|
|
//Function patches a curbe parametr, this version of function works on 3d curbe tokens
|
|
//It assumes that at least 3 tokens exists, then checks if 3 additional patches are needed
|
|
int PatchDSH6Tokens( int currentIndex, uint tokenType, __global IGIL_KernelCurbeParams* pKernelCurbeParams, __global char* pDsh,
|
|
uint value1, uint value2, uint value3 )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
uint PatchOffset, SourceOffset;
|
|
#if SCHEDULER_DEBUG_MODE
|
|
//If we are here it means that mask is ok and there are at least 3 curbe tokens that needs to be patched, do it right away
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_parameterType != TokenType )
|
|
{
|
|
return -1;
|
|
}
|
|
#endif
|
|
for( uint it = 0; it < 3; it++ )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;
|
|
|
|
if( SourceOffset == 0 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
|
|
}
|
|
else if( SourceOffset == 4 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
|
|
}
|
|
else if( SourceOffset == 8 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
|
|
}
|
|
currentIndex++;
|
|
}
|
|
//Check if there are 3 more.
|
|
if( pKernelCurbeParams[ currentIndex ].m_parameterType == tokenType )
|
|
{
|
|
for( uint it = 0; it < 3; it++ )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ currentIndex ].m_patchOffset;
|
|
SourceOffset = pKernelCurbeParams[ currentIndex ].m_sourceOffset;
|
|
|
|
if( SourceOffset == 0 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value1;
|
|
}
|
|
else if( SourceOffset == 4 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value2;
|
|
}
|
|
else if( SourceOffset == 8 )
|
|
{
|
|
*( __global uint * )( &pDsh[ PatchOffset ] ) = value3;
|
|
}
|
|
currentIndex++;
|
|
}
|
|
}
|
|
|
|
return currentIndex;
|
|
}
|
|
//Common code
|
|
|
|
inline __global char* GetPtrToCurbeData( uint offset, __global IGIL_KernelDataHeader * pKernelReflection )
|
|
{
|
|
__global char * pRawKernelReflection = ( __global char * )pKernelReflection;
|
|
return ( pRawKernelReflection + offset );
|
|
}
|
|
|
|
__global char* GetPtrToKernelReflectionOffset( uint offset, __global IGIL_KernelDataHeader * pKernelReflection )
|
|
{
|
|
__global char * pRawKernelReflection = ( __global char * )pKernelReflection;
|
|
return ( pRawKernelReflection + offset );
|
|
}
|
|
|
|
void InitWalkerDataParallel( __local IGIL_WalkerEnumeration* pWalkerEnumData,
|
|
uint workDim,
|
|
uint* pWalkerCount,
|
|
uint3 edgeArray,
|
|
uint3 globalDim,
|
|
uint3 globalSizes,
|
|
uint3 localSizes )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
pWalkerEnumData->TotalDimSize.x = globalDim.x;
|
|
pWalkerEnumData->TotalDimSize.y = globalDim.y;
|
|
pWalkerEnumData->TotalDimSize.z = globalDim.z;
|
|
|
|
pWalkerEnumData->WalkerArray[ 0 ].ActualLocalSize.x = localSizes.x;
|
|
pWalkerEnumData->WalkerArray[ 0 ].WalkerStartPoint.x = 0;
|
|
pWalkerEnumData->WalkerArray[ 0 ].WalkerDimSize.x = globalDim.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ 0 ].ActualLocalSize.y = localSizes.y;
|
|
pWalkerEnumData->WalkerArray[ 0 ].WalkerStartPoint.y = 0;
|
|
pWalkerEnumData->WalkerArray[ 0 ].WalkerDimSize.y = globalDim.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ 0 ].ActualLocalSize.z = localSizes.z;
|
|
pWalkerEnumData->WalkerArray[ 0 ].WalkerStartPoint.z = 0;
|
|
pWalkerEnumData->WalkerArray[ 0 ].WalkerDimSize.z = globalDim.z;
|
|
|
|
uint WalkerCount = 1;
|
|
|
|
if( edgeArray.x != 0 )
|
|
{
|
|
pWalkerEnumData->TotalDimSize.x++;
|
|
|
|
pWalkerEnumData->WalkerArray[ 1 ].ActualLocalSize.x = edgeArray.x;
|
|
pWalkerEnumData->WalkerArray[ 1 ].WalkerStartPoint.x = globalDim.x;
|
|
pWalkerEnumData->WalkerArray[ 1 ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ 1 ].ActualLocalSize.y = localSizes.y;
|
|
pWalkerEnumData->WalkerArray[ 1 ].WalkerStartPoint.y = 0;
|
|
pWalkerEnumData->WalkerArray[ 1 ].WalkerDimSize.y = globalDim.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ 1 ].ActualLocalSize.z = localSizes.z;
|
|
pWalkerEnumData->WalkerArray[ 1 ].WalkerStartPoint.z = 0;
|
|
pWalkerEnumData->WalkerArray[ 1 ].WalkerDimSize.z = globalDim.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
|
|
if( workDim > 1 )
|
|
{
|
|
if( edgeArray.y != 0 )
|
|
{
|
|
pWalkerEnumData->TotalDimSize.y++;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = localSizes.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = globalDim.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = localSizes.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = globalDim.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
|
|
if( ( edgeArray.x != 0 ) & ( edgeArray.y != 0 ) )
|
|
{
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = edgeArray.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = globalDim.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = localSizes.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = globalDim.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
if( workDim > 2 )
|
|
{
|
|
if( edgeArray.z != 0 )
|
|
{
|
|
pWalkerEnumData->TotalDimSize.z++;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = localSizes.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = globalDim.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = localSizes.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = globalDim.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
if( ( edgeArray.x != 0 ) & ( edgeArray.z != 0 ) )
|
|
{
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = edgeArray.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = globalDim.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = localSizes.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = globalDim.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
|
|
if( ( edgeArray.y != 0 ) & ( edgeArray.z != 0 ) )
|
|
{
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = localSizes.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = 0;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = globalDim.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
if( ( edgeArray.x != 0 ) & ( edgeArray.y != 0 ) & ( edgeArray.z != 0 ) )
|
|
{
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.x = edgeArray.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.x = globalDim.x;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.x = pWalkerEnumData->TotalDimSize.x;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.y = edgeArray.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.y = globalDim.y;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.y = pWalkerEnumData->TotalDimSize.y;
|
|
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].ActualLocalSize.z = edgeArray.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerStartPoint.z = globalDim.z;
|
|
pWalkerEnumData->WalkerArray[ WalkerCount ].WalkerDimSize.z = pWalkerEnumData->TotalDimSize.z;
|
|
|
|
WalkerCount++;
|
|
}
|
|
}
|
|
}
|
|
|
|
*pWalkerCount = WalkerCount;
|
|
}
|
|
|
|
//Compute number of Walkers needed for this command packet, this function assumes that command packet is initialized
|
|
inline int GetWalkerCount( __global IGIL_CommandHeader* pCommand )
|
|
{
|
|
int WalkerCount = 1;
|
|
for( uint dim = 0; ( dim < pCommand->m_range.m_dispatchDimensions ); dim++ )
|
|
{
|
|
if( ( pCommand->m_range.m_globalWorkSize[ dim ] % pCommand->m_range.m_localWorkSize[ dim ] ) != 0 )
|
|
{
|
|
WalkerCount *= 2;
|
|
}
|
|
}
|
|
return WalkerCount;
|
|
}
|
|
|
|
//This function intializes command packet, checks for null case, sets proper LWS sizes and return WalkerCount needed for this packet
|
|
inline void InitializeCommandPacket( __global IGIL_CommandHeader* pCommand )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
//Check for NULL case
|
|
if( pCommand->m_range.m_localWorkSize[ 0 ] == 0 )
|
|
{
|
|
//If null case detected use 16 x 1 x 1 lws
|
|
if( pCommand->m_range.m_globalWorkSize[ 0 ] >= 16 )
|
|
{
|
|
pCommand->m_range.m_localWorkSize[ 0 ] = 16;
|
|
}
|
|
else
|
|
{
|
|
pCommand->m_range.m_localWorkSize[ 0 ] = pCommand->m_range.m_globalWorkSize[ 0 ];
|
|
}
|
|
|
|
pCommand->m_range.m_localWorkSize[ 1 ] = 1;
|
|
pCommand->m_range.m_localWorkSize[ 2 ] = 1;
|
|
}
|
|
|
|
}
|
|
|
|
//Patches the address for pipe control
|
|
void PatchPipeControlProfilingAddres( __global uint* secondaryBatchBuffer, uint slBoffset, ulong address, uint pipeControlOffset )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
//SlbOffset is expressed in bytes and for cmd it is needed to convert it to dwords
|
|
uint PostSyncDwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_POST_SYNC_DWORD;
|
|
uint DwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_ADDRESS_FIELD_DWORD;
|
|
//Patch P_C event timestamp address in SLB in 3rd and 4th dword
|
|
secondaryBatchBuffer[ DwordOffset ] = 0;
|
|
patchDword( &secondaryBatchBuffer[ DwordOffset ], PIPE_CONTROL_GRAPHICS_ADDRESS_START_BIT, PIPE_CONTROL_GRAPHICS_ADDRESS_END_BIT, ( uint )( address >> PIPE_CONTROL_GRAPHICS_ADDRESS_START_BIT ) );
|
|
DwordOffset++;
|
|
secondaryBatchBuffer[ DwordOffset ] = 0;
|
|
patchDword( &secondaryBatchBuffer[ DwordOffset ], PIPE_CONTROL_GRAPHICS_ADDRESS_HIGH_START_BIT, PIPE_CONTROL_GRAPHICS_ADDRESS_HIGH_END_BIT, ( address >> 32 ) );
|
|
|
|
//Patch Timestamp bit
|
|
patchDword( &secondaryBatchBuffer[ PostSyncDwordOffset ], PIPE_CONTROL_POST_SYNC_START_BIT, PIPE_CONTROL_POST_SYNC_END_BIT, PIPE_CONTROL_GENERATE_TIME_STAMP );
|
|
}
|
|
|
|
void DisablePostSyncBitInPipeControl( __global uint* secondaryBatchBuffer, uint slBoffset, uint pipeControlOffset )
|
|
{
|
|
//SlbOffset is expressed in bytes; the command buffer is indexed in dwords, so convert it.
|
|
uint PostSyncDwordOffset = ( slBoffset / DWORD_SIZE_IN_BYTES ) + pipeControlOffset + PIPE_CONTROL_POST_SYNC_DWORD;
|
|
//Clear the post-sync operation field of the PIPE_CONTROL so that no timestamp is written.
|
|
patchDword( &secondaryBatchBuffer[ PostSyncDwordOffset ], PIPE_CONTROL_POST_SYNC_START_BIT, PIPE_CONTROL_POST_SYNC_END_BIT, PIPE_CONTROL_NO_POSTSYNC_OPERATION );
|
|
}
|
|
|
|
|
|
int PatchDSH( __global IGIL_CommandQueue* pQueue,
|
|
__global IGIL_KernelDataHeader * pKernelReflection,
|
|
__global char* dsh,
|
|
uint blockId,
|
|
__global IGIL_CommandHeader* pCommandHeader,
|
|
__global uint* secondaryBatchBuffer,
|
|
uint dshOffset,
|
|
uint intefaceDescriptorOffset,
|
|
__local IGIL_WalkerEnumeration* pWalkerMain,
|
|
uint walkerPos )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
__global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
|
|
__global IGIL_KernelData* pBlockData = IGIL_GetKernelData( pKernelReflection, blockId );
|
|
ulong PatchMask = pBlockData->m_PatchTokensMask;
|
|
uint CurrentIndex = 0;
|
|
|
|
__global char* pDsh = ( __global char* )&dsh[ dshOffset ];
|
|
__global IGIL_KernelCurbeParams* pKernelCurbeParams = ( __global IGIL_KernelCurbeParams* )&pBlockData->m_data;
|
|
|
|
uint NumberOfDepencies = pCommandHeader->m_numDependencies;
|
|
uint PatchOffset;
|
|
__global char* pScalarData = ( __global char* )( &pCommandHeader->m_data[ NumberOfDepencies ] );
|
|
__global char *pDshOnKRS = GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerHeapOffset, pKernelReflection );
|
|
|
|
uint SizeOfScalarsFromCurbe = 0;
|
|
uint CurbeSize;
|
|
|
|
uint TotalLocalSize;
|
|
uint ThreadPayloadSize;
|
|
uint NumberOfHWThreads;
|
|
uint WorkDim;
|
|
uint3 GlobalOffset;
|
|
uint3 GlobalSizes;
|
|
uint3 ActualLocalSize;
|
|
|
|
GlobalOffset.x = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 0 ];
|
|
GlobalOffset.y = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 1 ];
|
|
GlobalOffset.z = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 2 ];
|
|
|
|
GlobalSizes.x = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 0 ];
|
|
GlobalSizes.y = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 1 ];
|
|
GlobalSizes.z = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 2 ];
|
|
|
|
ActualLocalSize.x = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.x;
|
|
ActualLocalSize.y = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.y;
|
|
ActualLocalSize.z = pWalkerMain->WalkerArray[ walkerPos ].ActualLocalSize.z;
|
|
|
|
WorkDim = pCommandHeader->m_range.m_dispatchDimensions;
|
|
TotalLocalSize = ActualLocalSize.x * ActualLocalSize.y * ActualLocalSize.z;
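//The number of HW threads is the total local size divided by the SIMD size, rounded up ( e.g. 50 work items at SIMD16 need 4 threads ).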
|
|
|
|
NumberOfHWThreads = TotalLocalSize / pBlockData->m_SIMDSize;
|
|
if( TotalLocalSize % pBlockData->m_SIMDSize )
|
|
{
|
|
NumberOfHWThreads++;
|
|
}
|
|
|
|
ThreadPayloadSize = NumberOfHWThreads * 3 * GRF_SIZE;
|
|
//Copy constant buffer to designated area on DSH.
|
|
|
|
//pDshOnKRS seems to be in the global address space, not the private address space
|
|
//Copy SamplerState and Constant Buffer at once
|
|
IGILLOCAL_MEMCPY_GTOG( pDsh, pDshOnKRS, pBlockData->m_sizeOfConstantBuffer + pBlockData->m_SizeOfSamplerHeap );
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK )
|
|
{
|
|
while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT )
|
|
{
|
|
CurbeSize = pKernelCurbeParams[ CurrentIndex ].m_parameterSize;
|
|
SizeOfScalarsFromCurbe += CurbeSize;
|
|
PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
|
|
|
|
//pScalarData is in the global address space, not the private address space
|
|
IGILLOCAL_MEMCPY_GTOG( &pDsh[ PatchOffset ], pScalarData, CurbeSize );
|
|
pScalarData += CurbeSize;
|
|
CurrentIndex++;
|
|
}
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( pCommandHeader->m_sizeOfScalarArguments != SizeOfScalarsFromCurbe )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_ARGUMENTS_SIZE_MISMATCH;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK )
|
|
{
|
|
CurrentIndex = PatchLocalWorkSizes( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pWalkerMain->LocalWorkSize.x,
|
|
pWalkerMain->LocalWorkSize.y,
|
|
pWalkerMain->LocalWorkSize.z,
|
|
ActualLocalSize.x,
|
|
ActualLocalSize.y,
|
|
ActualLocalSize.z );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6Tokens( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
GlobalSizes.x,
|
|
GlobalSizes.y,
|
|
GlobalSizes.z );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6Tokens( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pWalkerMain->TotalDimSize.x,
|
|
pWalkerMain->TotalDimSize.y,
|
|
pWalkerMain->TotalDimSize.z );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH1Token( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
WorkDim );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK )
|
|
{
|
|
CurrentIndex = PatchLocalMemEntities( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pCommandHeader );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6Tokens( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
GlobalOffset.x,
|
|
GlobalOffset.y,
|
|
GlobalOffset.z );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH1Token( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
NumberOfHWThreads );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH1Token( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_PARENT_EVENT,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pCommandHeader->m_event );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6Tokens( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pWalkerMain->LocalWorkSize.x,
|
|
pWalkerMain->LocalWorkSize.y,
|
|
pWalkerMain->LocalWorkSize.z );
|
|
#if SCHEDULER_DEBUG_MODE
|
|
if( ( CurrentIndex == -1 ) || ( CurrentIndex >= pBlockData->m_numberOfCurbeParams ) )
|
|
{
|
|
pCommandHeader->m_commandState = SCHEDULER_CURBE_TOKEN_MISSED;
|
|
return -1;
|
|
}
|
|
#endif
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER )
|
|
{
|
|
if( pCommandHeader->m_numGlobalCapturedBuffer > 0 )
|
|
{
|
|
//Handle global pointer patching in stateless mode; layout info is in the declaration of IGIL_CommandHeader.
|
|
__global uint* pGlobalIndexes = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments ] );
|
|
__global uint* pGlobalPtrs = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numGlobalCapturedBuffer ] );
|
|
uint StartIndex = CurrentIndex;
|
|
|
|
//Arguments in the command header are not in order, so the matching key has to be located for each one.
|
|
for( uint glIdx = 0 ; glIdx < pCommandHeader->m_numGlobalCapturedBuffer; glIdx++)
|
|
{
|
|
//Reset CurrentIndex as we need to start from the beginning.
|
|
CurrentIndex = StartIndex;
|
|
while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
|
|
{
|
|
//Patch only if exact match occurs
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_sourceOffset == *pGlobalIndexes )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
|
|
//64 bit patching
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_parameterSize == 8 )
|
|
{
|
|
__global uint* pDst = (__global uint *) &pDsh[PatchOffset];
|
|
pDst[ 0 ] = pGlobalPtrs[ 0 ];
|
|
pDst[ 1 ] = pGlobalPtrs[ 1 ];
|
|
}
|
|
else
|
|
{
|
|
__global uint* pDst = ( __global uint* ) &pDsh[ PatchOffset ];
|
|
*pDst = ( uint ) *pGlobalPtrs;
|
|
}
|
|
}
|
|
CurrentIndex++;
|
|
}
|
|
pGlobalPtrs += 2;
|
|
pGlobalIndexes++;
|
|
}
|
|
}
|
|
}
|
|
|
|
//Now generate local IDS
|
|
if( pBlockData->m_SIMDSize == 8 )
|
|
{
|
|
generateLocalIDSsimd8( &pDsh[ pBlockData->m_sizeOfConstantBuffer ], ActualLocalSize, NumberOfHWThreads );
|
|
}
|
|
else
|
|
{
|
|
generateLocalIDSsimd16( &pDsh[ pBlockData->m_sizeOfConstantBuffer ], ActualLocalSize, NumberOfHWThreads );
|
|
}
|
|
|
|
uint TotalSLMSize = pCommandHeader->m_totalLocalSize + pBlockData->m_InilineSLMSize;
|
|
|
|
//Update Interface Descriptor Data with SLM size / number of HW threads.
|
|
CopyAndPatchIDData( dsh, blockId, NumberOfHWThreads, TotalSLMSize, intefaceDescriptorOffset, pQueue->m_controls.m_StartBlockID );
|
|
|
|
//Add WalkerStartSize
|
|
patchGpGpuWalker( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, pBlockData->m_SIMDSize,
|
|
TotalLocalSize, pWalkerMain->WalkerArray[ walkerPos ].WalkerDimSize, pWalkerMain->WalkerArray[ walkerPos ].WalkerStartPoint,
|
|
NumberOfHWThreads, pBlockData->m_sizeOfConstantBuffer + ThreadPayloadSize, dshOffset );
|
|
|
|
PatchMediaStateFlush( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, SCHEDULER_MSF_INITIAL );
|
|
PatchMediaStateFlush( pQueue->m_controls.m_SecondLevelBatchOffset, secondaryBatchBuffer, intefaceDescriptorOffset, SCHEDULER_MSF_SECOND );
|
|
|
|
return 0;
|
|
}
|
|
|
|
//Returns: isSRGB(ChannelOrder) ? ChannelOrder : 0;
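//Relies on the sRGB channel orders being contiguous CL enum values, so a single unsigned subtract-and-compare classifies them.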
|
|
inline uint GetSRGBChannelOrder( uint channelOrder )
|
|
{
|
|
const uint AsSrgb = channelOrder - CL_sRGB;
|
|
const uint NumSrgbFormats = CL_sBGRA - CL_sRGB;
|
|
if( AsSrgb < NumSrgbFormats )
|
|
return channelOrder;
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
void PatchDSHParallelWithDynamicDSH20( uint slbOffsetBase,
|
|
uint dshOffsetBase,
|
|
uint intefaceDescriptorOffsetBase,
|
|
__global IGIL_KernelDataHeader * pKernelReflection,
|
|
__global char* dsh,
|
|
uint blockId,
|
|
__global IGIL_CommandHeader* pCommandHeader,
|
|
__global uint* secondaryBatchBuffer,
|
|
__global IGIL_CommandQueue* pQueue,
|
|
__global IGIL_EventPool* eventsPool,
|
|
__global char* ssh,
|
|
uint btOffset,
|
|
__local IGIL_WalkerEnumeration* pWalkerEnum,
|
|
__local uint* objectIds
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
, __global DebugDataBuffer* DebugQueue
|
|
#endif
|
|
)
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
|
|
__global IGIL_KernelAddressData* pKernelAddressData = IGIL_GetKernelAddressData( pKernelReflection, blockId );
|
|
__global IGIL_KernelData* pBlockData = IGIL_GetKernelData( pKernelReflection, blockId );
|
|
ulong PatchMask = pBlockData->m_PatchTokensMask;
|
|
uint CurrentIndex = 0;
|
|
|
|
__global IGIL_KernelCurbeParams* pKernelCurbeParams = ( __global IGIL_KernelCurbeParams* )&pBlockData->m_data;
|
|
|
|
uint NumberOfDepencies = pCommandHeader->m_numDependencies;
|
|
uint PatchOffset;
|
|
|
|
uint CurbeSize;
|
|
|
|
//Get pointer to the Sampler State
|
|
__global char *pDshOnKRS = GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerHeapOffset, pKernelReflection );
|
|
|
|
uint WalkerCount = GetWalkerCount( pCommandHeader );
|
|
__global char *pKernelReflectionChar = ( __global char * ) pKernelReflection;
|
|
__global IGIL_KernelCurbeParams* pSSHdata = ( __global IGIL_KernelCurbeParams* )&pKernelReflectionChar[ pKernelAddressData->m_SSHTokensOffset ];
|
|
|
|
//WALKER variables that will be propagated to SLB
|
|
uint3 LocalSizes;
|
|
uint3 GlobalSizes;
|
|
uint3 GlobalOffset;
|
|
uint3 EdgeArray;
|
|
uint3 XYZDim;
|
|
|
|
//X is always there
|
|
GlobalOffset.x = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 0 ];
|
|
GlobalSizes.x = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 0 ];
|
|
LocalSizes.x = ( uint )pCommandHeader->m_range.m_localWorkSize[ 0 ];
|
|
EdgeArray.x = GlobalSizes.x % LocalSizes.x;
|
|
uint WorkDim = pCommandHeader->m_range.m_dispatchDimensions;
|
|
XYZDim.x = GlobalSizes.x / LocalSizes.x;
|
|
|
|
if( WorkDim > 1 )
|
|
{
|
|
GlobalOffset.y = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 1 ];
|
|
GlobalSizes.y = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 1 ];
|
|
LocalSizes.y = ( uint )pCommandHeader->m_range.m_localWorkSize[ 1 ];
|
|
EdgeArray.y = GlobalSizes.y % LocalSizes.y;
|
|
XYZDim.y = GlobalSizes.y / LocalSizes.y;
|
|
|
|
if( WorkDim > 2 )
|
|
{
|
|
GlobalOffset.z = ( uint )pCommandHeader->m_range.m_globalWorkOffset[ 2 ];
|
|
GlobalSizes.z = ( uint )pCommandHeader->m_range.m_globalWorkSize[ 2 ];
|
|
LocalSizes.z = ( uint )pCommandHeader->m_range.m_localWorkSize[ 2 ];
|
|
XYZDim.z = GlobalSizes.z / LocalSizes.z;
|
|
EdgeArray.z = GlobalSizes.z % LocalSizes.z;
|
|
}
|
|
else
|
|
{
|
|
GlobalOffset.z = 0;
|
|
GlobalSizes.z = 1;
|
|
LocalSizes.z = 1;
|
|
EdgeArray.z = 0;
|
|
XYZDim.z = 1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
GlobalOffset.y = 0;
|
|
GlobalOffset.z = 0;
|
|
GlobalSizes.y = 1;
|
|
GlobalSizes.z = 1;
|
|
LocalSizes.y = 1;
|
|
LocalSizes.z = 1;
|
|
EdgeArray.z = 0;
|
|
EdgeArray.y = 0;
|
|
XYZDim.z = 1;
|
|
XYZDim.y = 1;
|
|
}
|
|
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
InitWalkerDataParallel( pWalkerEnum, WorkDim, &WalkerCount, EdgeArray, XYZDim, GlobalSizes, LocalSizes );
|
|
}
|
|
|
|
uint SLBOffset = slbOffsetBase;
|
|
uint DshOffset = dshOffsetBase;
|
|
uint IntefaceDescriptorOffset = intefaceDescriptorOffsetBase;
|
|
__global uint* pArgumentIds = NULL;
|
|
__global uint* pObjectIds = NULL;
|
|
__global char* pLocalIdsOnDSH = NULL;
|
|
|
|
uint SamplerHeapSize = pBlockData->m_SizeOfSamplerHeap;
|
|
|
|
//For surfaces that use the SSH, the object ID is in fact the parent's surface state offset; copy the SSH from parent to child.
//Copy the binding table state of this kernel to the allocated place on the SSH.
|
|
GLOBAL_MEMCPY( &ssh[ btOffset ], &ssh[ pKernelAddressData->m_BTSoffset ] , pKernelAddressData->m_BTSize );
|
|
|
|
for( uint WalkerID = 0; WalkerID < WalkerCount; WalkerID++ )
|
|
{
|
|
//Update the offsets
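//The SLB behaves as a ring buffer: each additional walker advances by one enqueue-sized slot and wraps at MAX_SLB_OFFSET,
//while the interface descriptor index and DSH offset simply advance.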
|
|
if( WalkerID > 0 )
|
|
{
|
|
SLBOffset += SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
|
|
SLBOffset %= MAX_SLB_OFFSET;
|
|
IntefaceDescriptorOffset++;
|
|
DshOffset += MAX_DSH_SIZE_PER_ENQUEUE;
|
|
}
|
|
|
|
__global char* pDsh = ( __global char* )&dsh[ DshOffset ];
|
|
pLocalIdsOnDSH = &pDsh[ pBlockData->m_sizeOfConstantBuffer + SamplerHeapSize ];
|
|
|
|
//Copy Sampler State and constant buffer on all threads
|
|
GLOBAL_MEMCPY( pDsh, pDshOnKRS, pBlockData->m_sizeOfConstantBuffer + SamplerHeapSize );
|
|
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
|
|
//Update BorderColorPointer on all threads
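//The BorderColorPointer is made to point at the start of this enqueue's DSH slot; the field stores a 32-byte aligned
//address, so DshOffset is shifted right by 5 and patched into bits 5..31 of each sampler state.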
|
|
if( pBlockData->m_numberOfSamplerStates )
|
|
{
|
|
uint SamplerId = get_local_id( 0 );
|
|
__global uint* pSamplerState;
|
|
while( SamplerId < pBlockData->m_numberOfSamplerStates )
|
|
{
|
|
pSamplerState = ( __global uint* )&dsh[ DshOffset + pBlockData->m_SamplerStateArrayOffsetOnDSH + SamplerId * OCLRT_SIZEOF_SAMPLER_STATE ];
|
|
uint PatchValue = DshOffset >> 5;
|
|
patchDword( &pSamplerState[ SAMPLER_STATE_DESCRIPTOR_BORDER_COLOR_POINTER_DWORD ], 5, 31, PatchValue );
|
|
SamplerId += PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20;
|
|
}
|
|
}
|
|
|
|
//Setup SSH if needed, do it only for first Walker as all Walkers will re-use the same binding table layout.
|
|
if( ( pCommandHeader->m_numGlobalArguments > 0 ) & ( WalkerID == 0 ) )
|
|
{
|
|
//Global arguments are placed after the scalars, global pointers, SLM sizes and events.
|
|
uint offset = pCommandHeader->m_numDependencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numOfLocalPtrSizes + ( pCommandHeader->m_numGlobalCapturedBuffer * ( sizeof( ulong ) + sizeof( uint ) ) / sizeof( uint ) );
|
|
|
|
pArgumentIds = &pCommandHeader->m_data[ offset ];
|
|
//Object IDS are located after Argument IDs
|
|
pObjectIds = &pCommandHeader->m_data[ offset + pCommandHeader->m_numGlobalArguments ];
|
|
|
|
//Setup local memory for fast access for Curbe patching
|
|
uint ArgId = get_local_id( 0 );
|
|
|
|
//Only the third HW group updates ObjectIDS; this is synchronized with the condition below.
|
|
if( ( ArgId >> HW_GROUP_ID_SHIFT( PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) ) == 2 )
|
|
{
|
|
ArgId = ArgId - ( PARALLEL_SCHEDULER_COMPILATION_SIZE_20 << 1 );
|
|
while( ArgId < pCommandHeader->m_numGlobalArguments )
|
|
{
|
|
objectIds[ pArgumentIds[ ArgId ] ] = pObjectIds[ ArgId ];
|
|
ArgId += PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
|
|
}
|
|
}
|
|
#ifdef SCHEDULER_EMULATION
|
|
//Synchronization needed for emulation: ObjectIDS must be fully set by the whole HW group; on the GPU the HW group is implicitly synchronized.
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
#endif
|
|
if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * 2 )
|
|
{
|
|
__global uint* pBindingTable = ( __global uint* ) &ssh[ btOffset ];
|
|
|
|
|
|
//To properly set up the binding table, point to the parent's surface state heap.
|
|
for( uint ArgumentID = 0 ; ArgumentID < pCommandHeader->m_numGlobalArguments; ArgumentID++ )
|
|
{
|
|
uint ArgId = pArgumentIds[ ArgumentID ];
|
|
|
|
//Locate proper Arg ID
|
|
//Get ssh offset, lookup table already provided
|
|
if( objectIds[ ArgId ] < MAX_SSH_PER_KERNEL_SIZE )
|
|
{
|
|
if( pSSHdata[ ArgId ].m_sourceOffset == ArgId )
|
|
{
|
|
pBindingTable[ pSSHdata[ ArgId ].m_patchOffset ] = objectIds[ ArgId ];
|
|
}
|
|
else
|
|
{
|
|
pQueue->m_controls.m_ErrorCode += 10;
|
|
uint CurrentArg = 0;
|
|
while( CurrentArg < pKernelAddressData->m_BTSize / 4 )
|
|
{
|
|
if( pSSHdata[ CurrentArg ].m_sourceOffset == ArgId )
|
|
{
|
|
pBindingTable[ pSSHdata[ CurrentArg ].m_patchOffset ] = objectIds[ ArgId ];
|
|
break;
|
|
}
|
|
CurrentArg++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if( ( PatchMask & SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ) )
|
|
{
|
|
if( get_local_id( 0 ) == 2 * PARALLEL_SCHEDULER_COMPILATION_SIZE_20 )
|
|
{
|
|
for( uint ArgumentID = 0; ArgumentID < pCommandHeader->m_numGlobalArguments; ArgumentID++ )
|
|
{
|
|
uint ArgId = pArgumentIds[ ArgumentID ];
|
|
if( ( objectIds[ ArgId ] >= MAX_SSH_PER_KERNEL_SIZE ) )
|
|
{
|
|
uint SamplerCount = 0;
|
|
//Get pointer to Parent's samplers ( arguments ) data stored on KRS
|
|
__global IGIL_SamplerParams* pSamplerParamsOnKRS = ( __global IGIL_SamplerParams* )GetPtrToKernelReflectionOffset( pKernelAddressData->m_SamplerParamsOffset, pKernelReflection );
|
|
|
|
//Iterate through all samplers passed from parent and copy state to proper SSA offset
|
|
while( pKernelReflection->m_ParentSamplerCount > SamplerCount )
|
|
{
|
|
//Get the offset into the parent's SSA from the ObjectID; the offset to the beginning of the SSA is already included ( the BorderColorPointer precedes the SSA ), so this is relative to the parent's DSH heap.
|
|
PatchOffset = objectIds[ ArgId ] - MAX_SSH_PER_KERNEL_SIZE;
|
|
|
|
if( pSamplerParamsOnKRS->m_ArgID == ArgId )
|
|
{
|
|
IGILLOCAL_MEMCPY_GTOG( &pDsh[ pSamplerParamsOnKRS->m_SamplerStateOffset ], &dsh[ pQueue->m_controls.m_ParentDSHOffset + PatchOffset ], OCLRT_SIZEOF_SAMPLER_STATE );
|
|
break;
|
|
}
|
|
pSamplerParamsOnKRS = pSamplerParamsOnKRS + 1;
|
|
SamplerCount = SamplerCount + 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
__global char* pScalarData = ( __global char* ) ( &pCommandHeader->m_data[ NumberOfDepencies ] );
|
|
|
|
CurrentIndex = 0;
|
|
uint TotalLocalSize = pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.x *
|
|
pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.y *
|
|
pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.z;
|
|
|
|
uint NumberOfHWThreads = TotalLocalSize / pBlockData->m_SIMDSize;
|
|
|
|
if( TotalLocalSize % pBlockData->m_SIMDSize != 0 )
|
|
{
|
|
NumberOfHWThreads++;
|
|
}
|
|
|
|
uint ThreadPayloadSize = NumberOfHWThreads * pBlockData->m_PayloadSize;
|
|
|
|
//Move pointer to Constant Buffer Offset
|
|
pDsh = ( __global char* )&dsh[ DshOffset + SamplerHeapSize ];
|
|
|
|
if( ( get_local_id( 0 ) >= PARALLEL_SCHEDULER_COMPILATION_SIZE_20 ) & ( get_local_id( 0 ) < PARALLEL_SCHEDULER_COMPILATION_SIZE_20 + 6 ) )
|
|
{
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT_MASK )
|
|
{
|
|
while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == SCHEDULER_DATA_PARAMETER_KERNEL_ARGUMENT )
|
|
{
|
|
CurbeSize = pKernelCurbeParams[ CurrentIndex ].m_parameterSize;
|
|
PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
|
|
IGILLOCAL_MEMCPY_GTOG( &pDsh[ PatchOffset ], pScalarData, CurbeSize );
|
|
pScalarData += CurbeSize;
|
|
CurrentIndex++;
|
|
}
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE_MASK )
|
|
{
|
|
CurrentIndex = PatchLocalWorkSizesParallel( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_LOCAL_WORK_SIZE,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
LocalSizes.x,
|
|
LocalSizes.y,
|
|
LocalSizes.z,
|
|
pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.x,
|
|
pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.y,
|
|
pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize.z );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_SIZE,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
GlobalSizes.x,
|
|
GlobalSizes.y,
|
|
GlobalSizes.z );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_NUM_WORK_GROUPS,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pWalkerEnum->TotalDimSize.x,
|
|
pWalkerEnum->TotalDimSize.y,
|
|
pWalkerEnum->TotalDimSize.z );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH1TokenParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_WORK_DIMENSIONS,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
WorkDim );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES_MASK )
|
|
{
|
|
CurrentIndex = PatchLocalMemEntities( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pCommandHeader );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_GLOBAL_WORK_OFFSET,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
GlobalOffset.x,
|
|
GlobalOffset.y,
|
|
GlobalOffset.z );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH1TokenParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_NUM_HARDWARE_THREADS,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
NumberOfHWThreads );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_PARENT_EVENT_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH1TokenParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_PARENT_EVENT,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
pCommandHeader->m_event );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE_MASK )
|
|
{
|
|
CurrentIndex = PatchDSH6TokensParallel20( CurrentIndex,
|
|
SCHEDULER_DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE,
|
|
pKernelCurbeParams,
|
|
pDsh,
|
|
LocalSizes.x,
|
|
LocalSizes.y,
|
|
LocalSizes.z );
|
|
}
|
|
if( PatchMask & SCHEDULER_DATA_PARAMETER_GLOBAL_POINTER )
|
|
{
|
|
if( pCommandHeader->m_numGlobalCapturedBuffer > 0 )
|
|
{
|
|
//Handle global pointer patching in stateless mode; layout info is in the declaration of IGIL_CommandHeader.
|
|
__global uint* pGlobalIndexes = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments ] );
|
|
__global uint* pGlobalPtrs = ( __global uint* ) ( &pCommandHeader->m_data[ NumberOfDepencies + pCommandHeader->m_numScalarArguments + pCommandHeader->m_numGlobalCapturedBuffer ] );
|
|
uint StartIndex = CurrentIndex;
|
|
|
|
//Arguments in the command header are not in order, so the matching key has to be located for each one.
|
|
for( uint glIdx = 0 ; glIdx < pCommandHeader->m_numGlobalCapturedBuffer; glIdx++)
|
|
{
|
|
//Reset CurrentIndex as we need to start from the beginning.
|
|
CurrentIndex = StartIndex;
|
|
while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
|
|
{
|
|
//Patch only if exact match occurs
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_sourceOffset == *pGlobalIndexes )
|
|
{
|
|
PatchOffset = pKernelCurbeParams[ CurrentIndex ].m_patchOffset;
|
|
//64 bit patching
|
|
if( pKernelCurbeParams[ CurrentIndex ].m_parameterSize == 8 )
|
|
{
|
|
__global uint* pDst = (__global uint *) &pDsh[PatchOffset];
|
|
pDst[0] = pGlobalPtrs[0];
|
|
pDst[1] = pGlobalPtrs[1];
|
|
}
|
|
else
|
|
{
|
|
__global uint* pDst = ( __global uint* ) &pDsh[ PatchOffset ];
|
|
*pDst = ( uint ) *pGlobalPtrs;
|
|
}
|
|
}
|
|
CurrentIndex++;
|
|
}
|
|
pGlobalPtrs += 2;
|
|
pGlobalIndexes++;
|
|
}
|
|
}
|
|
while( pKernelCurbeParams[ CurrentIndex ].m_parameterType == COMPILER_DATA_PARAMETER_GLOBAL_SURFACE )
|
|
{
|
|
CurrentIndex++;
|
|
}
|
|
}
|
|
|
|
//Patch image and sampler curbe entries
|
|
if( ( PatchMask & SCHEDULER_DATA_PARAMETER_IMAGE_CURBE_ENTRIES ) | ( PatchMask & SCHEDULER_DATA_PARAMETER_SAMPLER_MASK ) )
|
|
{
|
|
if( ( pArgumentIds != NULL ) & ( pObjectIds != NULL ) )
|
|
{
|
|
//pKernelReflectionChar is a global address pointer
|
|
__global IGIL_ImageParamters *pImageParams = ( __global IGIL_ImageParamters * ) &pKernelReflectionChar[ pKernelReflection->m_ParentImageDataOffset ];
|
|
__global IGIL_ParentSamplerParams *pParentSamplerParams = ( __global IGIL_ParentSamplerParams* ) &pKernelReflectionChar[ pKernelReflection->m_ParentSamplerParamsOffset ];
|
|
//First obtain argument ID
|
|
uint WorkID = get_local_id( 0 ) - PARALLEL_SCHEDULER_COMPILATION_SIZE_20;
|
|
while( WorkID + CurrentIndex < pBlockData->m_numberOfCurbeTokens )
|
|
{
|
|
uint ArgId = pKernelCurbeParams[ CurrentIndex + WorkID ].m_sourceOffset;
|
|
uint ObjectID = objectIds[ ArgId ];
|
|
uint CurrentImage = 0;
|
|
uint CurrentSampler = 0;
|
|
|
|
uint PatchValue = 0;
|
|
uint TokenType = pKernelCurbeParams[ CurrentIndex + WorkID ].m_parameterType;
|
|
uint PatchOffset = pKernelCurbeParams[ CurrentIndex + WorkID ].m_patchOffset;
|
|
uint PatchValueInvalid = 0;
|
|
|
|
//If Images
|
|
if( ObjectID < OCLRT_IMAGE_MAX_OBJECT_ID )
|
|
{
|
|
//Locate proper parent Image
|
|
while( ( pImageParams[ CurrentImage ].m_ObjectID != ObjectID ) & ( CurrentImage < pKernelReflection->m_ParentKernelImageCount ) )
|
|
{
|
|
CurrentImage++;
|
|
}
|
|
//The proper image is now located at CurrentImage; patch the token.
|
|
|
|
if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_WIDTH )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_Width;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_HEIGHT )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_Height;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_DEPTH )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_Depth;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_ChannelDataType;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_CHANNEL_ORDER )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_ChannelOrder;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER )
|
|
{
|
|
PatchValue = GetSRGBChannelOrder( pImageParams[ CurrentImage ].m_ChannelOrder );
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_ARRAY_SIZE )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_ArraySize;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_NUM_SAMPLES )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_NumSamples;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS )
|
|
{
|
|
PatchValue = pImageParams[ CurrentImage ].m_NumMipLevels;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_IMAGE_OBJECT_ID )
|
|
{
|
|
PatchValue = ObjectID;
|
|
}
|
|
else
|
|
{
|
|
PatchValueInvalid = 1;
|
|
}
|
|
}
|
|
//If Sampler
|
|
else if( ObjectID >= OCLRT_SAMPLER_MIN_OBJECT_ID )
|
|
{
|
|
//Mark PatchValue invalid in case the sampler params are not found
|
|
PatchValueInvalid = 1;
|
|
//Locate the proper parent sampler
|
|
while( CurrentSampler < pKernelReflection->m_ParentSamplerCount )
|
|
{
|
|
if( pParentSamplerParams[ CurrentSampler ].m_ObjectID == ObjectID )
|
|
{
|
|
PatchValueInvalid = 0;
|
|
if( TokenType == DATA_PARAMETER_SAMPLER_ADDRESS_MODE )
|
|
{
|
|
PatchValue = pParentSamplerParams[ CurrentSampler ].m_AddressingMode;
|
|
}
|
|
else if( TokenType == DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS )
|
|
{
|
|
PatchValue = pParentSamplerParams[ CurrentSampler ].NormalizedCoords;
|
|
}
|
|
else if( TokenType == DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED )
|
|
{
|
|
PatchValue = pParentSamplerParams[ CurrentSampler ].CoordinateSnapRequired;
|
|
}
|
|
else if( TokenType == SCHEDULER_DATA_PARAMETER_SAMPLER_OBJECT_ID )
|
|
{
|
|
PatchValue = ObjectID;
|
|
}
|
|
else
|
|
{
|
|
PatchValueInvalid = 1;
|
|
}
|
|
CurrentSampler = pKernelReflection->m_ParentSamplerCount;
|
|
}
|
|
CurrentSampler++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
PatchValueInvalid = 1;
|
|
}
|
|
|
|
if( PatchValueInvalid == 0 )
|
|
{
|
|
*( __global uint * ) ( &pDsh[ PatchOffset ] ) = PatchValue;
|
|
}
|
|
CurrentIndex += 6;
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
pQueue->m_controls.m_ErrorCode += 7;
|
|
}
|
|
}
|
|
}
|
|
#ifdef SCHEDULER_EMULATION
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
#endif
|
|
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
#if defined WA_LRI_COMMANDS_EXIST
|
|
bool ShouldDisablePreemption = false;
|
|
#endif
|
|
//Profiling support
|
|
if( pQueue->m_controls.m_IsProfilingEnabled != 0 )
|
|
{
|
|
bool DisableTimeStampStart = true;
|
|
bool DisableTimeStampEnd = true;
|
|
if( ( ( uint )pCommandHeader->m_event != IGIL_EVENT_INVALID_HANDLE ) & ( ( WalkerID == 0 ) | ( WalkerID == WalkerCount - 1 ) ) )
|
|
{
|
|
//The event is propagated to children as the "parent event"; to avoid overwriting the same start value, only generate the timestamp write
//for the first command using this event, i.e. when the event has no children yet ( so compare to 1 ).
|
|
clk_event_t EventID = __builtin_astype( ( void* ) ( ( ulong ) pCommandHeader->m_event ), clk_event_t );
|
|
__global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( eventsPool );
|
|
|
|
if( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_numChildren == 1 )
|
|
{
|
|
#if defined WA_LRI_COMMANDS_EXIST && defined WA_PROFILING_PREEMPTION
|
|
//This is the case where block kernels are profiled - an event is present in the EM workload.
//In that case, disable preemption around all WALKERs for that block kernel and event.
|
|
ShouldDisablePreemption = true;
|
|
#endif
|
|
if( WalkerID == 0 )
|
|
{
|
|
//Emit pipecontrol with timestamp write
|
|
ulong Address = ( ulong )&( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_profilingCmdStart );
|
|
//Timestamp start
|
|
PatchPipeControlProfilingAddres( secondaryBatchBuffer,
|
|
SLBOffset,
|
|
Address,
|
|
PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
|
|
DisableTimeStampStart = false;
|
|
}
|
|
if( WalkerID == WalkerCount - 1 )
|
|
{
|
|
ulong Address = ( ulong )&( events[ ( uint )(size_t)__builtin_astype( EventID, void* ) ].m_profilingCmdEnd );
|
|
//Timestamp end
|
|
PatchPipeControlProfilingAddres( secondaryBatchBuffer,
|
|
SLBOffset,
|
|
Address,
|
|
PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
|
|
DisableTimeStampEnd = false;
|
|
}
|
|
}
|
|
}
|
|
if( DisableTimeStampStart )
|
|
{
|
|
DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
|
|
SLBOffset,
|
|
PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
|
|
}
|
|
if( DisableTimeStampEnd )
|
|
{
|
|
DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
|
|
SLBOffset,
|
|
PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//Optimized path: if the block can run concurrently, NOOP the pipe control after it.
|
|
uint DwordOffset = SLBOffset / DWORD_SIZE_IN_BYTES;
|
|
|
|
if( pBlockData->m_CanRunConcurently != 0 )
|
|
{
|
|
NOOPCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
|
|
}
|
|
else
|
|
{
|
|
PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
|
|
}
|
|
}
|
|
|
|
#if defined WA_LRI_COMMANDS_EXIST
|
|
bool NoopPreemptionDisabling = true;
|
|
bool NoopPreemptionEnabling = true;
|
|
|
|
#if defined WA_KERNEL_PREEMPTION
|
|
//This is the case where the block kernel needs preemption disabled around all of its WALKERs because of its sampler usage.
//Preemption should be disabled when EM event profiling is used OR the kernel data indicates such behavior.
|
|
ShouldDisablePreemption |= ( pBlockData->m_DisablePreemption != 0 );
|
|
#endif
|
|
|
|
#if defined WA_PROFILING_PREEMPTION
|
|
//m_EventTimestampAddress != NULL means profiling of the whole workload is enabled (preemption around whole chained BB is disabled)
|
|
//So disabling preemption should be permitted only when workload profiling is off, in other cases noop all LRI commands
|
|
//For m_EventTimestampAddress != NULL preemption is enabled before BB_END
|
|
ShouldDisablePreemption &= ( pQueue->m_controls.m_EventTimestampAddress == 0 );
|
|
#endif
|
|
|
|
if( ShouldDisablePreemption != false )
|
|
{
|
|
if( WalkerID == 0 )
|
|
{
|
|
SetDisablePreemptionRegister( SLBOffset, secondaryBatchBuffer );
|
|
NoopPreemptionDisabling = false;
|
|
}
|
|
|
|
if( WalkerID == WalkerCount - 1 )
|
|
{
|
|
SetEnablePreemptionRegister( SLBOffset, secondaryBatchBuffer );
|
|
NoopPreemptionEnabling = false;
|
|
}
|
|
}
|
|
|
|
if( NoopPreemptionDisabling )
|
|
{
|
|
NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_DISABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
|
|
}
|
|
|
|
if( NoopPreemptionEnabling )
|
|
{
|
|
NoopPreemptionCommand( SLBOffset, IMM_LOAD_REGISTER_FOR_ENABLE_PREEMPTION_OFFSET, secondaryBatchBuffer );
|
|
}
|
|
#endif //WA_LRI_COMMANDS_EXIST
|
|
}
|
|
|
|
//Work items 0 to 15 are responsible for local ID generation.
|
|
if( ( get_local_id( 0 ) < 16 ) & ( pBlockData->m_NeedLocalIDS != 0 ) )
|
|
{
|
|
//Now generate local IDS
|
|
generateLocalIDSParallel20( pLocalIdsOnDSH, pWalkerEnum->WalkerArray[ WalkerID ].ActualLocalSize, NumberOfHWThreads, pBlockData->m_SIMDSize );
|
|
}
|
|
//The 3rd HW thread takes care of patching the media curbe load and the GPGPU_WALKER command.
|
|
if( get_local_id( 0 ) == PARALLEL_SCHEDULER_COMPILATION_SIZE_20 * 2 )
|
|
{
|
|
|
|
uint TotalSLMSize = pCommandHeader->m_totalLocalSize + pBlockData->m_InilineSLMSize;
|
|
//Update Interface Descriptor Data with SLM size / number of HW threads.
|
|
CopyAndPatchIDData20(dsh, blockId, NumberOfHWThreads, TotalSLMSize, IntefaceDescriptorOffset,
|
|
pQueue->m_controls.m_StartBlockID, btOffset, DshOffset,
|
|
pBlockData->m_numberOfSamplerStates
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
,
|
|
DebugQueue
|
|
#endif
|
|
);
|
|
|
|
patchGpGpuWalker( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, pBlockData->m_SIMDSize,
|
|
TotalLocalSize, pWalkerEnum->WalkerArray[ WalkerID ].WalkerDimSize, pWalkerEnum->WalkerArray[ WalkerID ].WalkerStartPoint,
|
|
NumberOfHWThreads, pBlockData->m_sizeOfConstantBuffer + ThreadPayloadSize, SamplerHeapSize + DshOffset );
|
|
|
|
PatchMediaStateFlush( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, SCHEDULER_MSF_INITIAL );
|
|
PatchMediaStateFlush( SLBOffset, secondaryBatchBuffer, IntefaceDescriptorOffset, SCHEDULER_MSF_SECOND );
|
|
}
|
|
}
|
|
}
|
|
|
|
uint CheckEventStatus( __global IGIL_CommandHeader* pCommand,
|
|
__global IGIL_EventPool* eventsPool )
|
|
{
|
|
if( pCommand->m_numDependencies == 0 )
|
|
{
|
|
return 0;
|
|
}
|
|
else
|
|
{
|
|
__global IGIL_DeviceEvent* pDeviceEvent;
|
|
//Events are stored at the beginning of the command packet's dynamic payload
|
|
for( uint i = 0; i < pCommand->m_numDependencies; i++ )
|
|
{
|
|
pDeviceEvent = TEMP_IGIL_GetDeviceEvent( eventsPool, pCommand->m_data[ i ] );
|
|
if( pDeviceEvent->m_state != CL_COMPLETE )
|
|
{
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
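//Decrements the dependent count of every event this command waited on; an event is returned to the pool once its
//reference count, dependent count and child count have all dropped to zero.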
|
|
|
|
void DecreaseEventDependenciesParallel( __global IGIL_CommandHeader* pCommand,
|
|
__global IGIL_EventPool* eventsPool )
|
|
{
|
|
__global IGIL_DeviceEvent* pDeviceEvent;
|
|
|
|
//Events are stored at the beginning of the command packet's dynamic payload
|
|
for( uint i = 0; i < pCommand->m_numDependencies; i++ )
|
|
{
|
|
pDeviceEvent = TEMP_IGIL_GetDeviceEvent( eventsPool, pCommand->m_data[ i ] );
|
|
int OldDependants = atomic_dec( &pDeviceEvent->m_numDependents );
|
|
|
|
if( ( pDeviceEvent->m_refCount <= 0 ) &
|
|
( ( OldDependants - 1 ) <= 0 ) &
|
|
( pDeviceEvent->m_numChildren <= 0 ) )
|
|
{
|
|
TEMP_IGIL_FreeEvent( __builtin_astype( ( void* )( ( ulong )pCommand->m_data[ i ] ), clk_event_t ), eventsPool );
|
|
}
|
|
}
|
|
}
|
|
|
|
//Update the status of the event and of all events that depend on it
|
|
void UpdateEventsTreeStatusParallel( clk_event_t eventId, __global IGIL_EventPool* eventsPool, bool isProfilingEnabled )
|
|
{
|
|
__global IGIL_DeviceEvent *events = TEMP_IGIL_GetDeviceEvents( eventsPool );
|
|
__global IGIL_DeviceEvent *pEvent;
|
|
do
|
|
{
|
|
pEvent = &events[ (uint) (size_t)__builtin_astype( eventId, void* ) ];
|
|
|
|
int OldNumChild = atomic_dec( &pEvent->m_numChildren );
|
|
if( ( OldNumChild - 1 ) <= 0 )
|
|
{
|
|
pEvent->m_state = CL_COMPLETE;
|
|
|
|
if( ( pEvent->m_refCount <= 0 ) &
|
|
( pEvent->m_numDependents <= 0 ) &
|
|
( pEvent->m_numChildren <= 0 ) )
|
|
{
|
|
TEMP_IGIL_FreeEvent( eventId, eventsPool );
|
|
}
|
|
//This event transitions to the CL_COMPLETE state; update its profiling information.
|
|
if( isProfilingEnabled != 0 )
|
|
{
|
|
//CL COMPLETE time is before this scheduler starts
|
|
pEvent->m_profilingCmdComplete = eventsPool->m_CLcompleteTimestamp;
|
|
|
|
//Check whether this event has a profiling pointer; if so, update the profiling data - all timestamps should be available by now.
|
|
if( pEvent->m_pProfiling != 0 )
|
|
{
|
|
__global ulong* retValues = ( __global ulong * )pEvent->m_pProfiling;
|
|
|
|
ulong StartTime = pEvent->m_profilingCmdStart;
|
|
ulong EndTime = pEvent->m_profilingCmdEnd;
|
|
ulong CompleteTime = pEvent->m_profilingCmdComplete;
|
|
ulong CLEndTransitionTime = 0;
|
|
ulong CLCompleteTransitionTime = 0;
|
|
//Check whether the timer wrapped around by hitting its max value
|
|
if( CompleteTime > StartTime )
|
|
{
|
|
CLEndTransitionTime = EndTime - StartTime;
|
|
CLCompleteTransitionTime = CompleteTime - StartTime;
|
|
}
|
|
//If we hit this else branch, the GPU timer wrapped to 0; compute the proper delta.
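//E.g. StartTime = PROFILING_MAX_TIMER_VALUE - 100 and CompleteTime = 50 gives a delta of 150 ticks.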
|
|
else
|
|
{
|
|
if( EndTime < StartTime )
|
|
{
|
|
CLEndTransitionTime = PROFILING_MAX_TIMER_VALUE - StartTime + EndTime;
|
|
}
|
|
else
|
|
{
|
|
CLEndTransitionTime = EndTime - StartTime;
|
|
}
|
|
CLCompleteTransitionTime = PROFILING_MAX_TIMER_VALUE - StartTime + CompleteTime;
|
|
}
|
|
//First value is END - START timestamp
|
|
retValues[ 0 ] = ( ulong )( ( float )CLEndTransitionTime * __intel__getProfilingTimerResolution() );
|
|
//Second value is COMPLETE - START timestamp
|
|
retValues[ 1 ] = ( ulong )( ( float )CLCompleteTransitionTime * __intel__getProfilingTimerResolution() );
|
|
}
|
|
}
|
|
//Signal parent because we completed
|
|
eventId = __builtin_astype( ( void* )( ( ulong )pEvent->m_parentEvent ), clk_event_t );
|
|
}
|
|
}
|
|
while ( ( ( uint )(size_t)__builtin_astype( eventId, void* ) != IGIL_EVENT_INVALID_HANDLE ) & ( pEvent->m_numChildren <= 0 ) );
|
|
}
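//Software global barrier across all work groups: every group raises its flag on the sync surface, the last group
//waits until all flags are set and then clears them, and each group spins until its own flag is cleared again.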
|
|
|
|
void GlobalBarrier( __global volatile uint* syncSurface )
|
|
{
|
|
//Make sure every work item in the WKG hits the barrier.
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
|
|
//Now first thread of each wkg writes to designated place on SyncSurface
|
|
if ( get_local_id( 0 ) == 0 )
|
|
{
|
|
syncSurface[ get_group_id( 0 ) ] = 1;
|
|
}
|
|
//Higher WKG IDs tend not to have work to do in all cases, therefore the last WKG waits for the others, as it will most likely reach this code soonest.
|
|
if( get_group_id( 0 ) == ( get_num_groups( 0 ) - 1 ) )
|
|
{
|
|
//24 -48 case
|
|
uint Value;
|
|
do
|
|
{
|
|
Value = 1;
|
|
for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
|
|
{
|
|
Value = Value & syncSurface[ i ];
|
|
}
|
|
|
|
}
|
|
while( Value == 0 );
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
|
|
for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
|
|
{
|
|
syncSurface[ i ] = 0;
|
|
}
|
|
}
|
|
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
while( syncSurface[ get_group_id( 0 ) ] != 0 );
|
|
}
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
}
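//Same global barrier protocol as above, but the last work group additionally publishes m_IDTAfterFirstPhase
//( copying m_CurrentIDToffset reached after the first scheduling phase ) while all other groups are still waiting.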
|
|
|
|
void GlobalBarrierUpdateQueue( __global volatile uint* syncSurface, __global IGIL_CommandQueue* pQueue )
|
|
{
|
|
//Make sure every work item in the WKG hits the barrier.
|
|
barrier( CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
|
|
//Now first thread of each wkg writes to designated place on SyncSurface
|
|
if ( get_local_id(0) == 0 )
|
|
{
|
|
syncSurface[get_group_id(0)] = 1;
|
|
}
|
|
//Higher WKG IDs tend not to have work to do in all cases, therefore the last WKG waits for the others, as it will most likely reach this code soonest.
|
|
if( get_group_id(0) == ( get_num_groups( 0 ) - 1 ) )
|
|
{
|
|
uint Value;
|
|
do
|
|
{
|
|
Value = 1;
|
|
for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
|
|
{
|
|
Value = Value & syncSurface[ i ];
|
|
}
|
|
}
|
|
while( Value == 0 );
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
pQueue->m_controls.m_IDTAfterFirstPhase = pQueue->m_controls.m_CurrentIDToffset;
|
|
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
|
|
for( uint i = get_local_id( 0 ); i < get_num_groups( 0 ); i += get_local_size( 0 ) )
|
|
{
|
|
syncSurface[ i ] = 0;
|
|
}
|
|
}
|
|
|
|
if( get_local_id(0) == 0 )
|
|
{
|
|
while( syncSurface[ get_group_id(0) ] != 0 );
|
|
}
|
|
barrier( CLK_GLOBAL_MEM_FENCE );
|
|
}
|
|
|
|
|
|
#ifdef SCHEDULER_EMULATION
|
|
__local int IDTOffset;
|
|
__local int DSHOffset;
|
|
__local int SLBOffset;
|
|
__local int StackOffset;
|
|
__local int QStorageOffset;
|
|
__local int MarkerOffset;
|
|
__local int BTSoffset;
|
|
__local IGIL_WalkerEnumeration WalkerEnum;
|
|
__local uint ObjectIDS[ MAX_GLOBAL_ARGS ];
|
|
#endif
|
|
|
|
#define WA_INT_DESC_MAX 62
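//WA_INT_DESC_MAX caps the number of interface descriptor ( IDT ) slots that a single scheduler pass may claim.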
|
|
__kernel __attribute__((intel_reqd_sub_group_size(PARALLEL_SCHEDULER_COMPILATION_SIZE_20)))
|
|
void SchedulerParallel20(
|
|
__global IGIL_CommandQueue* pQueue,
|
|
__global uint* commandsStack,
|
|
__global IGIL_EventPool* eventsPool,
|
|
__global uint* secondaryBatchBuffer, //SLB that will be used to put commands in.
|
|
__global char* dsh, //Pointer to the start of Dynamic State Heap
|
|
__global IGIL_KernelDataHeader* kernelData, //This is kernel reflection surface
|
|
__global volatile uint* queueStorageBuffer,
|
|
__global char* ssh, //Pointer to Surface state heap with BT and SS
|
|
__global DebugDataBuffer* debugQueue )
|
|
{
|
|
EMULATION_ENTER_FUNCTION( );
|
|
#ifdef WA_DISABLE_SCHEDULERS
|
|
return;
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
|
//Early return is enabled when m_SchedulerEarlyReturn is > 0.
|
|
if( pQueue->m_controls.m_SchedulerEarlyReturn > 0 )
|
|
{
|
|
if( pQueue->m_controls.m_SchedulerEarlyReturn == 1 )
|
|
{
|
|
return;
|
|
}
|
|
|
|
if( get_global_id( 0 ) == 0 )
|
|
{
|
|
pQueue->m_controls.m_SchedulerEarlyReturnCounter++;
|
|
}
|
|
GlobalBarrier( queueStorageBuffer );
|
|
|
|
if( pQueue->m_controls.m_SchedulerEarlyReturnCounter == pQueue->m_controls.m_SchedulerEarlyReturn )
|
|
{
|
|
if( ( ( get_group_id( 0 ) == 1) == ( get_num_groups( 0 ) > 1 ) ) & ( get_local_id( 0 ) == 0 ) )
|
|
{
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
//Set START time of current (last) scheduler
|
|
if( ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) & ( debugQueue != 0 ) & ( debugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
{
*( ( __global ulong * ) ( &debugQueue->m_data[ atomic_add( &debugQueue->m_offset, 2 ) ] ) ) = eventsPool->m_CLcompleteTimestamp;
|
|
}
|
|
#endif
|
|
pQueue->m_controls.Temporary[ 2 ]++;
|
|
|
|
//SlbOffset is expressed in bytes; the command buffer is indexed in dwords, so convert it.
|
|
__private uint DwordOffset = ( pQueue->m_controls.m_SecondLevelBatchOffset % MAX_SLB_OFFSET ) / DWORD_SIZE_IN_BYTES;
|
|
//BB_START 1st DWORD
|
|
secondaryBatchBuffer[ DwordOffset ] = OCLRT_BATCH_BUFFER_BEGIN_CMD_DWORD0;
|
|
DwordOffset++;
|
|
//BB_START 2nd DWORD - Address, 3rd DWORD Address high
|
|
secondaryBatchBuffer[ DwordOffset++ ] = (uint)(pQueue->m_controls.m_CleanupSectionAddress & 0xFFFFFFFF);
|
|
secondaryBatchBuffer[ DwordOffset ] = (uint)((pQueue->m_controls.m_CleanupSectionAddress >> 32) & 0xFFFFFFFF);
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
//First check if there are any new command packets on queue_t
|
|
__global IGIL_CommandHeader* pCommand = 0;
|
|
uint GroupID = get_group_id( 0 );
|
|
|
|
#ifndef SCHEDULER_EMULATION
|
|
__local int IDTOffset;
|
|
__local int DSHOffset;
|
|
__local int SLBOffset;
|
|
__local int StackOffset;
|
|
__local int QStorageOffset;
|
|
__local int MarkerOffset;
|
|
__local int BTSoffset;
|
|
__local IGIL_WalkerEnumeration WalkerEnum;
|
|
__local uint ObjectIDS[ MAX_GLOBAL_ARGS ];
|
|
|
|
#endif
|
|
|
|
if( pQueue->m_controls.m_LastScheduleEventNumber > 0 )
|
|
{
|
|
//Check if there are any events that need updating; each WKG uses all of its HW threads to update events.
|
|
if( GroupID * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 < pQueue->m_controls.m_LastScheduleEventNumber )
|
|
{
|
|
clk_event_t EventID;
|
|
if( get_local_id( 0 ) % PARALLEL_SCHEDULER_COMPILATION_SIZE_20 == 0 )
|
|
{
|
|
uint ID = ( GroupID * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20 ) + ( get_local_id( 0 ) / PARALLEL_SCHEDULER_COMPILATION_SIZE_20 );
|
|
while( ID < pQueue->m_controls.m_LastScheduleEventNumber )
|
|
{
|
|
EventID = __builtin_astype( ( void* )( ( ulong )pQueue->m_controls.m_EventDependencies[ ID ] ), clk_event_t );
|
|
UpdateEventsTreeStatusParallel( EventID, eventsPool, ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) );
|
|
ID += get_num_groups( 0 ) * PARALLEL_SCHEDULER_HWTHREADS_IN_HW_GROUP20;
|
|
}
|
|
}
|
|
}
|
|
GlobalBarrier( queueStorageBuffer );
|
|
}
|
|
//Queue parsing section
|
|
uint NumberOfEnqueues = pQueue->m_controls.m_TotalNumberOfQueues - pQueue->m_controls.m_PreviousNumberOfQueues;
|
|
if( NumberOfEnqueues > 0 )
|
|
{
|
|
uint InitialOffset = pQueue->m_controls.m_PreviousHead;
|
|
bool PacketScheduled = true;
|
|
uint offset = 0;
|
|
|
|
for( uint CurrentPacket = GroupID; CurrentPacket < NumberOfEnqueues; CurrentPacket += get_num_groups( 0 ) )
|
|
{
|
|
if( CurrentPacket == GroupID )
|
|
{
|
|
offset = TEMP_IGIL_GetNthCommandHeaderOffset( pQueue, InitialOffset, CurrentPacket );
|
|
}
|
|
else
|
|
{
|
|
offset = TEMP_IGIL_GetNthCommandHeaderOffset( pQueue, offset, get_num_groups( 0 ) );
|
|
}
|
|
pCommand = TEMP_IGIL_GetCommandHeader( pQueue, offset );
|
|
|
|
//Initialize command packet with proper lws
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
InitializeCommandPacket( pCommand );
|
|
}
|
|
|
|
//Can I run this command ?
|
|
if( CheckEventStatus( pCommand, eventsPool ) == 0 )
|
|
{
|
|
//Is it marker command ?
|
|
if( pCommand->m_kernelId != IGIL_KERNEL_ID_ENQUEUE_MARKER )
|
|
{
|
|
//Is there enough IDT space for me ?
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
int WalkerNeeded = GetWalkerCount( pCommand );
|
|
//Optimization - check if IDT has free space for me
|
|
if( pQueue->m_controls.m_CurrentIDToffset + WalkerNeeded <= WA_INT_DESC_MAX )
|
|
{
|
|
uint Temp = atomic_add( &pQueue->m_controls.m_CurrentIDToffset, WalkerNeeded );
|
|
if( Temp + WalkerNeeded <= WA_INT_DESC_MAX )
|
|
{
|
|
IDTOffset = Temp;
|
|
DSHOffset = atomic_add( &pQueue->m_controls.m_CurrentDSHoffset, ( MAX_DSH_SIZE_PER_ENQUEUE * WalkerNeeded ) );
|
|
SLBOffset = ( ( atomic_add( &pQueue->m_controls.m_SecondLevelBatchOffset, ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * WalkerNeeded ) ) ) % MAX_SLB_OFFSET );
|
|
BTSoffset = atomic_add( &pQueue->m_controls.m_CurrentSSHoffset, pQueue->m_controls.m_BTmaxSize );
|
|
}
|
|
else
|
|
{
|
|
IDTOffset = -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
IDTOffset = -1;
|
|
}
|
|
}
|
|
//Now barrier and check if we can go with scheduling
|
|
barrier( CLK_LOCAL_MEM_FENCE );
|
|
|
|
if( IDTOffset != -1 )
|
|
{
|
|
//This packet is all set; schedule it and we are done with it.
//PatchDSHParallelWithDynamicDSH20 patches the media curbe load and the GPGPU walker inside.
|
|
PatchDSHParallelWithDynamicDSH20( SLBOffset,
|
|
DSHOffset,
|
|
IDTOffset,
|
|
kernelData,
|
|
dsh,
|
|
pCommand->m_kernelId,
|
|
pCommand,
|
|
secondaryBatchBuffer,
|
|
pQueue,
|
|
eventsPool,
|
|
ssh,
|
|
BTSoffset,
|
|
&WalkerEnum,
|
|
ObjectIDS
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
, debugQueue
|
|
#endif
|
|
);
|
|
PacketScheduled = true;
|
|
}
|
|
else
|
|
{
|
|
PacketScheduled = false;
|
|
}
|
|
}
|
|
else //For marker we need to update returned event status
|
|
{
|
|
//Check if there is space to track event
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
uint Temp = atomic_inc( &pQueue->m_controls.m_EnqueueMarkerScheduled );
|
|
if( Temp < MAX_NUMBER_OF_ENQUEUE_MARKER )
|
|
{
|
|
MarkerOffset = Temp;
|
|
}
|
|
else
|
|
{
|
|
MarkerOffset = -1;
|
|
}
|
|
}
|
|
barrier( CLK_LOCAL_MEM_FENCE );
|
|
if( MarkerOffset != -1 )
|
|
{
|
|
PacketScheduled = true;
|
|
}
|
|
else
|
|
{
|
|
PacketScheduled = false;
|
|
}
|
|
}
|
|
//Update event dependencies if any; if there are events waiting for a status change, put them on the list.
|
|
if( PacketScheduled == true )
|
|
{
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
|
|
{
|
|
pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
|
|
}
|
|
//Remove event dependencies setting.
|
|
if( pCommand->m_numDependencies > 0 )
|
|
{
|
|
DecreaseEventDependenciesParallel( pCommand, eventsPool );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//Can't schedule it right now, move to storage.
|
|
else
|
|
{
|
|
if( pQueue->m_controls.m_IsSimulation )
|
|
{
|
|
barrier( CLK_LOCAL_MEM_FENCE );
|
|
}
|
|
PacketScheduled = false;
|
|
}
|
|
|
|
//Allocation failure, move command to stack storage and update stack pointers
|
|
if( PacketScheduled == false )
|
|
{
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
StackOffset = atomic_dec( &pQueue->m_controls.m_StackTop ) - 1;
|
|
QStorageOffset = atomic_sub( &pQueue->m_controls.m_QstorageTop, pCommand->m_commandSize ) - pCommand->m_commandSize;
|
|
commandsStack[ StackOffset ] = QStorageOffset;
|
|
}
|
|
barrier( CLK_LOCAL_MEM_FENCE );
|
|
__global char* ptrQueue = ( __global char* )pQueue;
|
|
GLOBAL_MEMCPY( ( __global void* )&queueStorageBuffer[ QStorageOffset / 4 ], (__global void * )&ptrQueue[ offset ] , pCommand->m_commandSize );
|
|
}
|
|
else if( pQueue->m_controls.m_IsSimulation )
|
|
{
|
|
barrier( CLK_LOCAL_MEM_FENCE );
|
|
}
|
|
}
|
|
|
|
//In case there were new enqueues on queue_t, all work items must hit the global barrier before they can start taking items from the stack.
|
|
GlobalBarrierUpdateQueue( queueStorageBuffer, pQueue );
|
|
}
|
|
|
|
//Check the stack only when there are free IDT slots
|
|
if( ( pQueue->m_controls.m_IDTAfterFirstPhase < WA_INT_DESC_MAX ) &
|
|
( pQueue->m_controls.m_PreviousStackTop != pQueue->m_controls.m_StackSize ) )
|
|
{
|
|
//Start stack browsing
|
|
uint MyID = get_group_id( 0 );
|
|
//Start browsing from the previous stack top
|
|
uint CurrentOffset = pQueue->m_controls.m_PreviousStackTop + MyID;
|
|
uint CommandOffset = 0;
|
|
|
|
while( CurrentOffset < pQueue->m_controls.m_StackSize )
|
|
{
|
|
CommandOffset = commandsStack[ CurrentOffset ];
|
|
|
|
if( CommandOffset != 0 )
|
|
{
|
|
pCommand = GetCommandHeaderFromStorage( ( __global uint* )queueStorageBuffer, CommandOffset );
|
|
|
|
//Can I run this command ?
|
|
if( CheckEventStatus( pCommand, eventsPool ) == 0 )
|
|
{
|
|
//Is it marker command ?
|
|
if( pCommand->m_kernelId != IGIL_KERNEL_ID_ENQUEUE_MARKER )
|
|
{
|
|
//Is there enough IDT space for me ?
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
int WalkerNeeded = GetWalkerCount( pCommand );
|
|
//Optimization - check if IDT has free space for me
|
|
if( pQueue->m_controls.m_CurrentIDToffset + WalkerNeeded <= WA_INT_DESC_MAX )
|
|
{
|
|
uint Temp = atomic_add( &pQueue->m_controls.m_CurrentIDToffset, WalkerNeeded );
|
|
if( Temp + WalkerNeeded <= WA_INT_DESC_MAX )
|
|
{
|
|
IDTOffset = Temp;
|
|
DSHOffset = atomic_add( &pQueue->m_controls.m_CurrentDSHoffset, ( MAX_DSH_SIZE_PER_ENQUEUE * WalkerNeeded ) );
|
|
SLBOffset = ( ( atomic_add( &pQueue->m_controls.m_SecondLevelBatchOffset, ( SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE * WalkerNeeded ) ) ) % MAX_SLB_OFFSET );
|
|
BTSoffset = atomic_add( &pQueue->m_controls.m_CurrentSSHoffset, pQueue->m_controls.m_BTmaxSize );
|
|
}
|
|
else
|
|
{
|
|
IDTOffset = -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
IDTOffset = -1;
|
|
}
|
|
}
|
|
//Now barrier and check if we can go with scheduling
|
|
barrier( CLK_LOCAL_MEM_FENCE );
|
|
if( IDTOffset != -1 )
|
|
{
|
|
//This packet is all set; schedule it and we are done with it.
//PatchDSHParallelWithDynamicDSH20 patches the media curbe load and the GPGPU walker inside.
|
|
PatchDSHParallelWithDynamicDSH20( SLBOffset,
|
|
DSHOffset,
|
|
IDTOffset,
|
|
kernelData,
|
|
dsh,
|
|
pCommand->m_kernelId,
|
|
pCommand,
|
|
secondaryBatchBuffer,
|
|
pQueue,
|
|
eventsPool,
|
|
ssh,
|
|
BTSoffset,
|
|
&WalkerEnum,
|
|
ObjectIDS
|
|
#ifdef ENABLE_DEBUG_BUFFER
|
|
, debugQueue
|
|
#endif
|
|
);
|
|
pCommand->m_commandState = CAN_BE_RECLAIMED;
|
|
//Reset stack offset
|
|
commandsStack[ CurrentOffset ] = 0;
|
|
|
|
//Update event status
|
|
if( get_local_id( 0 ) == 0 )
|
|
{
|
|
//Add events dependent on this command to the list of events that need to be updated.
|
|
if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
|
|
{
|
|
pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
|
|
}
|
|
|
|
//Remove event dependencies setting.
|
|
if( pCommand->m_numDependencies > 0 )
|
|
{
|
|
DecreaseEventDependenciesParallel( pCommand, eventsPool );
|
|
}
|
|
}
|
|
}
|
|
}
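                    //Marker commands dispatch no walker; they only update their return event and
                    //release their dependencies, then they are reclaimed.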
                    else // For marker we need to update the returned event status
                    {
                        barrier( CLK_GLOBAL_MEM_FENCE );
                        //Check if there is space to track the event
                        if( get_local_id( 0 ) == 0 )
                        {
                            uint Temp = atomic_inc( &pQueue->m_controls.m_EnqueueMarkerScheduled );
                            if( Temp < MAX_NUMBER_OF_ENQUEUE_MARKER )
                            {
                                pCommand->m_commandState = CAN_BE_RECLAIMED;
                                commandsStack[ CurrentOffset ] = 0;
                                //Add events dependent on this command to the list of events that need to be updated.
                                if( ( uint )pCommand->m_event != IGIL_EVENT_INVALID_HANDLE )
                                {
                                    pQueue->m_controls.m_EventDependencies[ atomic_inc( &pQueue->m_controls.m_CurrentScheduleEventNumber ) ] = pCommand->m_event;
                                }

                                //Remove this command's event dependencies.
                                if( pCommand->m_numDependencies > 0 )
                                {
                                    DecreaseEventDependenciesParallel( pCommand, eventsPool );
                                }
                            }
                        }
                    }
                }
            }
            CurrentOffset += get_num_groups( 0 );
            if( pQueue->m_controls.m_IsSimulation )
            {
                barrier( CLK_LOCAL_MEM_FENCE );
            }
        }
    }

    //Finish execution and check end conditions
    //Execute this global barrier only when needed, i.e. when stack browsing was executed or new items were added to the stack
    if( ( pQueue->m_controls.m_PreviousStackTop != pQueue->m_controls.m_StackSize ) |
        ( pQueue->m_controls.m_StackTop != pQueue->m_controls.m_PreviousStackTop ) )
    {
        GlobalBarrier( queueStorageBuffer );
    }
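
    //The post-barrier cleanup work is split across work-groups: group 0 resets heap and event
    //bookkeeping, group 1 emits the commands for the next scheduler run (or the jump back to the
    //host), group 2 compacts the command stack and group 3 compacts queue storage. The
    //( get_group_id( 0 ) == N ) == ( get_num_groups( 0 ) > N ) guards ensure the work is still
    //picked up when fewer work-groups are launched.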

    //Cleanup & resource reclamation section
    //We are past the global sync section, so we can safely modify globals right now.
    if( ( get_local_id( 0 ) == 0 ) & ( get_group_id( 0 ) == 0 ) )
    {
        {
            pQueue->m_controls.m_CurrentDSHoffset = pQueue->m_controls.m_DynamicHeapStart;
            pQueue->m_controls.m_IDTAfterFirstPhase = 1;
            pQueue->m_controls.m_PreviousNumberOfQueues = pQueue->m_controls.m_TotalNumberOfQueues;
            pQueue->m_controls.m_LastScheduleEventNumber = pQueue->m_controls.m_CurrentScheduleEventNumber;
            pQueue->m_controls.m_CurrentScheduleEventNumber = 0;
        }
    }
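
    //If any block or marker was scheduled this pass, the scheduler enqueues itself again right
    //behind the scheduled work; otherwise the second-level batch is terminated with a jump to the
    //cleanup section so control returns to the host.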

    //Schedule scheduler
    if( ( ( get_group_id( 0 ) == 1 ) == ( get_num_groups( 0 ) > 1 ) ) & ( get_local_id( 0 ) == 0 ) )
    {
        pQueue->m_controls.m_SecondLevelBatchOffset = ( pQueue->m_controls.m_SecondLevelBatchOffset % MAX_SLB_OFFSET );
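        //The second-level batch buffer is used as a ring; offsets always wrap modulo MAX_SLB_OFFSET.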

        //If we scheduled any blocks, put the scheduler right after them
        if( ( pQueue->m_controls.m_CurrentIDToffset > 1 ) | ( pQueue->m_controls.m_EnqueueMarkerScheduled > 0 ) )
        {
            AddCmdsInSLBforScheduler20Parallel( pQueue->m_controls.m_SecondLevelBatchOffset,
                                                pQueue,
                                                secondaryBatchBuffer,
                                                dsh );

            //If we have profiling enabled, we need the CL_COMPLETE time, which is taken before the next scheduler starts
            if( pQueue->m_controls.m_IsProfilingEnabled != 0 )
            {
                ulong Address = ( ulong )&( eventsPool->m_CLcompleteTimestamp );
                //Emit a pipe control with a timestamp write
                PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                 pQueue->m_controls.m_SecondLevelBatchOffset,
                                                 Address,
                                                 PIPE_CONTROL_FOR_TIMESTAMP_START_OFFSET );
                //The bit after the scheduler may be set by some other command, reset it to 0
                DisablePostSyncBitInPipeControl( secondaryBatchBuffer,
                                                 pQueue->m_controls.m_SecondLevelBatchOffset,
                                                 PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET_TO_PATCH );
#ifdef ENABLE_DEBUG_BUFFER
                if( ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
                {
                    //Store the current scheduler's START time
                    *( ( __global ulong* )( &DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] ) ) = eventsPool->m_CLcompleteTimestamp;

                    //Set the address used to store the next scheduler's END time
                    PatchPipeControlProfilingAddres( secondaryBatchBuffer,
                                                     pQueue->m_controls.m_SecondLevelBatchOffset,
                                                     ( ulong )( &DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] ),
                                                     PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
                }
#endif
            }

            //Program pipe controls around the scheduler to make sure it is not executed concurrently with blocks
            else
            {
                //Locate the previous pipe control
                int PreviousOffset = pQueue->m_controls.m_SecondLevelBatchOffset - SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
                //If the offset is negative it means we are the first command after chaining
                if( PreviousOffset < 0 )
                {
                    PreviousOffset += MAX_SLB_OFFSET;
                }
                //Tighten the previous pipe control
                uint DwordOffset = PreviousOffset / DWORD_SIZE_IN_BYTES;

                PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );

                //Now put a pipe control after the scheduler
                DwordOffset = pQueue->m_controls.m_SecondLevelBatchOffset / DWORD_SIZE_IN_BYTES;

                PutCSStallPipeControl( secondaryBatchBuffer, DwordOffset, PIPE_CONTROL_FOR_TIMESTAMP_END_OFFSET );
            }

            pQueue->m_controls.m_SecondLevelBatchOffset += SECOND_LEVEL_BUFFER_SPACE_FOR_EACH_ENQUEUE;
            pQueue->m_controls.m_CurrentIDToffset = 1;
            pQueue->m_controls.m_EnqueueMarkerScheduled = 0;
            pQueue->m_controls.Temporary[1]++;
            pQueue->m_controls.m_CurrentSSHoffset = pQueue->m_controls.m_BTbaseOffset;
        }
        //Nothing to schedule, return to the host
        else
        {
#ifdef ENABLE_DEBUG_BUFFER
            //Set the START time of the current (last) scheduler
            if( ( pQueue->m_controls.m_IsProfilingEnabled != 0 ) & ( DebugQueue != 0 ) & ( DebugQueue->m_flags == DDB_SCHEDULER_PROFILING ) )
            {
                *( ( __global ulong* )( &DebugQueue->m_data[ atomic_add( &DebugQueue->m_offset, 2 ) ] ) ) = eventsPool->m_CLcompleteTimestamp;
            }
#endif
            pQueue->m_controls.Temporary[2]++;
            pQueue->m_controls.m_SLBENDoffsetInBytes = ( int )pQueue->m_controls.m_SecondLevelBatchOffset;
            //SlbOffset is expressed in bytes; for the command it needs to be converted to dwords
            __private uint DwordOffset = pQueue->m_controls.m_SecondLevelBatchOffset / DWORD_SIZE_IN_BYTES;
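            //Terminate the second-level batch with a batch-buffer-start that jumps to the cleanup
            //section, so execution leaves the scheduler chain and control returns to the host.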
            //BB_START 1st DWORD
            secondaryBatchBuffer[ DwordOffset ] = OCLRT_BATCH_BUFFER_BEGIN_CMD_DWORD0;
            DwordOffset++;
            //BB_START 2nd DWORD - Address, 3rd DWORD - Address high
            secondaryBatchBuffer[ DwordOffset++ ] = ( uint )( pQueue->m_controls.m_CleanupSectionAddress & 0xFFFFFFFF );
            secondaryBatchBuffer[ DwordOffset++ ] = ( uint )( ( pQueue->m_controls.m_CleanupSectionAddress >> 32 ) & 0xFFFFFFFF );
        }
    }

    //Parallel stack compaction
    if( ( ( get_group_id( 0 ) == 2 ) == ( get_num_groups( 0 ) > 2 ) ) & ( get_local_id( 0 ) == 0 ) )
    {
        uint Current = pQueue->m_controls.m_StackTop + get_local_id( 0 );
        uint StackSize = pQueue->m_controls.m_StackSize;
        uint Found = 0;
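
        //Walk up from the current stack top, skipping slots that were zeroed when their commands
        //were scheduled, and stop at the first slot that still holds a pending command.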
        while( ( Current < StackSize ) && ( Found == 0 ) )
        {
            __global uint* pCmdStackBlock = ( __global uint* )( commandsStack + Current );
            //We have found an element
            if( *pCmdStackBlock != 0 )
            {
                Found = 1;
            }
            else
            {
                Current += get_local_size( 0 );
            }
        }

        if( Found == 1 )
        {
            atomic_min( &pQueue->m_controls.m_StackTop, Current );
            atomic_min( &pQueue->m_controls.m_PreviousStackTop, Current );
        }
    }

    //Qstorage compaction
    if( ( ( get_group_id( 0 ) == 3 ) == ( get_num_groups( 0 ) > 3 ) ) & ( get_local_id( 0 ) == 0 ) )
    {
        uint ReclaimFurhter = 1;
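        //Commands are stored contiguously in queue storage; pop every command already marked
        //CAN_BE_RECLAIMED from the top and stop at the first one that is still pending.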
        while( ( pQueue->m_controls.m_QstorageTop < pQueue->m_controls.m_QstorageSize ) & ( ReclaimFurhter == 1 ) )
        {
            pCommand = GetCommandHeaderFromStorage( ( __global uint* )queueStorageBuffer, pQueue->m_controls.m_QstorageTop );
            if( pCommand->m_commandState == CAN_BE_RECLAIMED )
            {
                pQueue->m_controls.m_QstorageTop += pCommand->m_commandSize;
            }
            else
            {
                ReclaimFurhter = 0;
            }
        }

        pQueue->m_controls.m_PreviousStorageTop = pQueue->m_controls.m_QstorageTop;

#ifndef DISABLE_RESOURCE_RECLAMATION
        //Reclaim space on queue_t, but only if there is enough of it
        //1 KB is used for the global barrier, make sure this space is never touched.
        if( pQueue->m_controls.m_QstorageTop - 1024 > pQueue->m_size )
        {
            //In this case we can take the full queue_t next time we enter the scheduler, so reclaim the full space on queue_t
            pQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT;
        }
#endif
        pQueue->m_controls.m_PreviousHead = pQueue->m_head;
    }
}