Enable ARM build

Some IGC code uses SSE4.1 intrinsics. The code is changed to avoid using
these intrinsics on host platforms that do not support them.
This allows ARM builds and cross-builds.
This commit is contained in:
Kirill Zhavoronkov
2022-01-18 23:56:10 +00:00
committed by igcbot
parent d790a8f857
commit eda4042be9
11 changed files with 212 additions and 232 deletions

View File

@ -10,8 +10,11 @@ SPDX-License-Identifier: MIT
#include "types.h"
#include "utility.h"
#include "UFO/portable_cpuid.h"
#if defined(_M_IX86) || defined(_M_AMD64) || \
defined(__i386__) || defined(__x86_64_)
#include "UFO/portable_cpuid.h"
#endif
namespace iSTD
{
@ -50,7 +53,8 @@ inline CPU_INSTRUCTION_LEVEL GetCpuInstructionLevel( void )
{
#if defined(ANDROID) && defined(__SSE4_1__)
return CPU_INSTRUCTION_LEVEL_SSE4_1;
#else
// x86 hosts: detect the instruction level at run time through CPUID.
// __x86_64__ (two trailing underscores) is the macro GCC/Clang predefine;
// the former "__x86_64_" never matched, which compiled this branch out on
// 64-bit x86 Linux and made the function fall through to the UNKNOWN result.
#elif defined(_M_IX86) || defined(_M_AMD64) || \
    defined(__i386__) || defined(__x86_64__)
int CPUInfo[4] = { 0, 0, 0, 0 };
__cpuid(CPUInfo, 1);
@ -78,6 +82,8 @@ inline CPU_INSTRUCTION_LEVEL GetCpuInstructionLevel( void )
}
return CpuInstructionLevel;
#else
return CPU_INSTRUCTION_LEVEL_UNKNOWN;
#endif
}

View File

@ -214,7 +214,7 @@ Input:
ValueType touch_value - the associated value stored with the key.
Output:
bool - true if the item was evicted from the set.
bool - true if the item was evicted from the set.
\*****************************************************************************/
template<LruHashTableTemplateList>
bool LruHashTableType::TouchItem(
@ -247,7 +247,7 @@ Input:
value which was evicted to add the touch_value to the set.
Output:
bool - true if the item was evicted from the set.
bool - true if the item was evicted from the set.
\*****************************************************************************/
template<LruHashTableTemplateList>
bool LruHashTableType::TouchItem(
@ -277,10 +277,11 @@ Output:
template<LruHashTableTemplateList>
DWORD LruHashTableType::MakeHashValue(const KeyType& in_key)
{
const __m128i* keyPointer = (__m128i*)&in_key;
DWORD hashCode = 0;
#if defined(_WIN32) && defined(_MSC_VER)
const __m128i* keyPointer = (__m128i*)&in_key;
ASSERT( HashingFunctions::HashingFunctionCount == 1 );
switch(m_hashingFunction)

View File

@ -28,13 +28,24 @@ SPDX-License-Identifier: MIT
// warning C4985: 'ceil': attributes not present on previous declaration.
#include <math.h>
#endif
#include <intrin.h>
#define USE_X86
#define USE_SSE4_1
#else
#elif defined(__i386__) || defined(__x86_64__)
#include <x86intrin.h>
#define USE_X86
#if defined(__SSE4_1__)
#define USE_SSE4_1
#endif // defined(__SSE4_1__)
#endif
#if defined(USE_SSE4_1)
typedef __m128 DQWORD; // 128-bits, 16-bytes
#endif
typedef DWORD PREFETCH[8]; // 32-bytes
typedef DWORD CACHELINE[8]; // 32-bytes
typedef WORD DHWORD[32]; // 512-bits, 64-bytes
@ -81,16 +92,16 @@ inline void MemCopy( void*, const void* );
inline void MemCopy( void*, const void*, const size_t );
inline void MemCopyWC( void*, const void*, const size_t );
inline void MemCopySwapBytes( void*, const void*, const size_t, const unsigned int);
inline void ScalarSwapBytes( __m128i**, const __m128i**, const size_t, const unsigned int);
inline void ScalarSwapBytes( void**, const void**, const size_t, const unsigned int);
inline void SafeMemSet( void*, const int, const size_t );
inline int SafeMemCompare( const void*, const void*, const size_t );
inline void SafeMemMove( void*, const void*, const size_t );
#ifndef _WIN64
#if defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
inline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa(void* dst, const void* src );
inline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void* dst, const void* src );
#endif
#endif // defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, CPU_INSTRUCTION_LEVEL cpuInstructionLevel);
inline void FastCpuBlt( BYTE*, const DWORD, BYTE*, const DWORD, const DWORD, DWORD );
@ -114,7 +125,11 @@ Description:
\*****************************************************************************/
inline void Prefetch( const void* ptr )
{
    // Pure performance hint: nothing is returned and no program-visible state
    // changes. Compiles to an empty body when neither branch applies.
#if defined(USE_X86)
    const char* const address = static_cast<const char*>( ptr );
    _mm_prefetch( address, _MM_HINT_NTA );
#elif defined(__GNUC__) || defined(__clang__)
    // rw = 0 (prefetch for read), locality = 0 (no temporal locality),
    // the closest builtin counterpart of _MM_HINT_NTA.
    __builtin_prefetch( ptr, 0, 0 );
#endif
}
/*****************************************************************************\
@ -130,8 +145,12 @@ inline void PrefetchBuffer( const void* pBuffer, const size_t bytes )
for( size_t i = 0; i <= cachelines; i++ )
{
#if defined(USE_X86)
_mm_prefetch( (const char*)pBuffer + i * sizeof(PREFETCH),
_MM_HINT_NTA );
#elif defined(__GNUC__) || defined(__clang__)
__builtin_prefetch((const char*)pBuffer + i * sizeof(PREFETCH), 0, 0);
#endif
}
}
@ -144,7 +163,9 @@ Description:
\*****************************************************************************/
inline void CachelineFlush( const void* ptr )
{
    // Only x86 builds issue a flush; on every other host this is an empty body.
#if defined(USE_X86)
    // _mm_clflush accepts a pointer to const void, so no cast is required.
    _mm_clflush( ptr );
#endif
}
/*****************************************************************************\
@ -192,6 +213,7 @@ inline void MemCopy<8>( void* dst, const void* src )
*pDst = *pSrc;
}
#if defined(USE_SSE4_1)
template <>
inline void MemCopy<16>( void* dst, const void* src )
{
@ -223,6 +245,7 @@ inline void MemCopy<28>( void* dst, const void* src )
UINT32* pDst32 = reinterpret_cast<UINT32*>( pDst64 );
*pDst32 = *pSrc32;
}
#endif // defined(USE_SSE4_1)
/*****************************************************************************\
Inline Function:
@ -233,7 +256,7 @@ Description:
\*****************************************************************************/
inline void MemCopy( void* dst, const void* src, const size_t bytes )
{
#if defined ( _MSC_VER )
#if defined ( USE_SSE4_1 )
UINT8* pDst8 = reinterpret_cast<UINT8*>( dst );
const UINT8* pSrc8 = reinterpret_cast<const UINT8*>( src );
size_t bytesRemaining = bytes;
@ -619,7 +642,7 @@ inline void MemCopy( void* dst, const void* src, const size_t bytes )
}
}
}
#else // #if defined ( _MSC_VER )
#else // !defined ( USE_SSE4_1 )
// Linux projects do not support standard types or memcpy_s
::memcpy_s(dst, bytes, src, bytes);
#endif
@ -639,7 +662,7 @@ Input:
\*****************************************************************************/
inline void MemCopyWC( void* dst, const void* src, const size_t bytes )
{
#if defined ( _MSC_VER )
#if defined ( USE_SSE4_1 )
const __m128i s_SSE2CmpMask = _mm_setr_epi8( 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 );
const __m128i* pMMSrc = reinterpret_cast<const __m128i*>(src);
__m128i* pMMDest = reinterpret_cast<__m128i*>(dst);
@ -797,8 +820,8 @@ Description:
Helper function for MemCopySwapBytes
\*****************************************************************************/
inline void ScalarSwapBytes(
__m128i** dst,
const __m128i** src,
void** dst,
const void** src,
const size_t byteCount,
const unsigned int swapbytes)
{
@ -817,8 +840,8 @@ inline void ScalarSwapBytes(
wSrc += 1;
}
*src = reinterpret_cast<const __m128i*>(wSrc);
*dst = reinterpret_cast<__m128i*>(wDst);
*src = reinterpret_cast<const void*>(wSrc);
*dst = reinterpret_cast<void*>(wDst);
}
break;
case 4:
@ -836,8 +859,8 @@ inline void ScalarSwapBytes(
dwSrc += 1;
}
*src = reinterpret_cast<const __m128i*>(dwSrc);
*dst = reinterpret_cast<__m128i*>(dwDst);
*src = reinterpret_cast<const void*>(dwSrc);
*dst = reinterpret_cast<void*>(dwDst);
}
break;
default:
@ -847,8 +870,8 @@ inline void ScalarSwapBytes(
::memcpy_s(bDst, byteCount, bSrc, byteCount);
*src = reinterpret_cast<const __m128i*>(bSrc + byteCount);
*dst = reinterpret_cast<__m128i*>(bDst + byteCount);
*src = reinterpret_cast<const void*>(bSrc + byteCount);
*dst = reinterpret_cast<void*>(bDst + byteCount);
}
}
@ -871,6 +894,14 @@ inline void MemCopySwapBytes(
const size_t bytes,
const unsigned int swapbytes)
{
// only handle 2 and 4 bytes swapping
if (swapbytes != 2 && swapbytes != 4)
{
MemCopy(dst, src, bytes);
return;
}
#if defined(USE_SSE4_1)
const __m128i* pMMSrc = reinterpret_cast<const __m128i*>(src);
__m128i* pMMDest = reinterpret_cast<__m128i*>(dst);
size_t count = bytes;
@ -887,27 +918,8 @@ inline void MemCopySwapBytes(
0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04,
0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c);
// SSE3 support required
CPU_INSTRUCTION_LEVEL cpuInstructionLevel = GetCpuInstructionLevel();
if (cpuInstructionLevel < CPU_INSTRUCTION_LEVEL_SSE3)
{
ScalarSwapBytes(&pMMDest, &pMMSrc, count, swapbytes);
return;
}
// only handle 2 and 4 bytes swapping
if (swapbytes != 2 && swapbytes != 4)
{
MemCopy(pMMDest, pMMSrc, count);
return;
}
// when size is < 16 rely, must use scalar swap
if (count < INSTR_WIDTH_128)
{
ScalarSwapBytes(&pMMDest, &pMMSrc, count, swapbytes);
}
else
if (count >= INSTR_WIDTH_128)
{
const __m128i shuffleMask = (swapbytes == 2) ? wordMask : dwordMask;
@ -933,7 +945,7 @@ inline void MemCopySwapBytes(
{
align = INSTR_WIDTH_128 - align;
cnt = align >> DWORD_SHIFT;
ScalarSwapBytes(&pMMDest, &pMMSrc, cnt * sizeof(DWORD), swapbytes);
ScalarSwapBytes((void**)&pMMDest, (const void**)&pMMSrc, cnt * sizeof(DWORD), swapbytes);
cnt = align & BYTE_TAIL;
// only words should remain, not bytes
@ -941,7 +953,7 @@ inline void MemCopySwapBytes(
{
ASSERT(cnt % 2 == 0);
ASSERT(swapbytes == 2);
ScalarSwapBytes(&pMMDest, &pMMSrc, cnt, swapbytes);
ScalarSwapBytes((void**)&pMMDest, (const void**)&pMMSrc, cnt, swapbytes);
}
}
@ -1055,7 +1067,7 @@ inline void MemCopySwapBytes(
if (count != 0)
{
cnt = count >> DWORD_SHIFT;
ScalarSwapBytes(&pMMDest, &pMMSrc, cnt * sizeof(DWORD), swapbytes);
ScalarSwapBytes((void**)&pMMDest, (const void**)&pMMSrc, cnt * sizeof(DWORD), swapbytes);
cnt = count & BYTE_TAIL;
// only words should remain, not bytes
@ -1063,10 +1075,13 @@ inline void MemCopySwapBytes(
{
ASSERT(cnt % 2 == 0);
ASSERT(swapbytes == 2);
ScalarSwapBytes(&pMMDest, &pMMSrc, cnt, swapbytes);
ScalarSwapBytes((void**)&pMMDest, (const void**)&pMMSrc, cnt, swapbytes);
}
}
return;
}
#endif // defined(USE_SSE4_1)
ScalarSwapBytes(&dst, &src, bytes, swapbytes);
}
/*****************************************************************************\
@ -1156,6 +1171,7 @@ inline void SafeMemMove( void *dst, const void *src, const size_t bytes )
}
}
#if defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
/*****************************************************************************\
MACROS:
EMIT_R_MR
@ -1240,7 +1256,6 @@ Input:
dst - 16-byte aligned pointer to (cacheable) destination buffer
src - 16-byte(req)/64-byte(optimal) aligned pointer to (USWC) source buffer
\*****************************************************************************/
#if defined( _MSC_VER ) && !defined (_WIN64)
__forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( void* dst, const void* src )
{
@ -1279,7 +1294,6 @@ __forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( void
}
} // FastMemCopy_SSE4_1_movntdqa_movdqa()
#endif //#if defined( _MSC_VER ) && !defined (_WIN64)
/*****************************************************************************\
Inline Function:
@ -1291,7 +1305,6 @@ Input:
dst - 16-byte (unaligned) pointer to (cacheable) destination buffer
src - 16-byte(req)/64-byte(optimal) aligned pointer to (USWC) source buffer
\*****************************************************************************/
#if defined ( _MSC_VER ) && !defined(_WIN64)
__forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void* dst, const void* src )
{
__asm
@ -1328,19 +1341,20 @@ __forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void
movdqu xmmword ptr [ecx+48], xmm3
}
} // FastMemCopy_SSE4_1_movntdqa_movdqu()
#endif // #if defined( _MSC_VER ) && !defined (_WIN64)
#endif // defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
{
#if defined( _MSC_VER ) && (!defined (_WIN64) || defined ( _In_ ) ) || defined (__GNUC__)
// Cache pointers to memory
BYTE* p_dst = (BYTE*)dst;
BYTE* p_src = (BYTE*)src;
size_t count = bytes;
#if defined(USE_SSE4_1)
if( cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
{
// Cache pointers to memory
BYTE* p_dst = (BYTE*)dst;
BYTE* p_src = (BYTE*)src;
size_t count = bytes;
if( count >= sizeof(DHWORD) )
{
@ -1370,11 +1384,11 @@ inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, C
const bool isDstDoubleQuadWordAligned =
IsAligned( p_dst, sizeof(DQWORD) );
#if defined(_WIN64) || defined(__GNUC__)
#if !defined(USE_INLINE_ASM) || USE_INLINE_ASM == 0
__m128i* pMMSrc = (__m128i*)(p_src);
__m128i* pMMDest = reinterpret_cast<__m128i*>(p_dst);
__m128i xmm0, xmm1, xmm2, xmm3;
#endif
#endif // !defined(USE_INLINE_ASM) || USE_INLINE_ASM == 0
if( isDstDoubleQuadWordAligned )
{
@ -1385,9 +1399,9 @@ inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, C
for( size_t i=0; i<DoubleHexWordsToCopy; i++ )
{
#if !defined(_WIN64) && !defined(__GNUC__)
#if defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( p_dst, p_src );
#else
#else // !(defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1)
xmm0 = _mm_stream_load_si128(pMMSrc);
xmm1 = _mm_stream_load_si128(pMMSrc + 1);
xmm2 = _mm_stream_load_si128(pMMSrc + 2);
@ -1399,7 +1413,7 @@ inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, C
_mm_store_si128(pMMDest + 2, xmm2);
_mm_store_si128(pMMDest + 3, xmm3);
pMMDest += 4;
#endif
#endif // defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
p_dst += sizeof(DHWORD);
p_src += sizeof(DHWORD);
@ -1415,9 +1429,9 @@ inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, C
for( size_t i=0; i<DoubleHexWordsToCopy; i++ )
{
#if !defined(_WIN64) && !defined(__GNUC__)
#if defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu( p_dst, p_src );
#else
#else // !(defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1)
xmm0 = _mm_stream_load_si128(pMMSrc);
xmm1 = _mm_stream_load_si128(pMMSrc + 1);
xmm2 = _mm_stream_load_si128(pMMSrc + 2);
@ -1429,7 +1443,7 @@ inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, C
_mm_storeu_si128(pMMDest + 2, xmm2);
_mm_storeu_si128(pMMDest + 3, xmm3);
pMMDest += 4;
#endif
#endif // defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
p_dst += sizeof(DHWORD);
p_src += sizeof(DHWORD);
@ -1438,17 +1452,12 @@ inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, C
}
}
}
// Copy remaining BYTE(s)
if( count )
{
MemCopy( p_dst, p_src, count );
}
}
else
#endif //!defined ( _WIN64 ) || defined ( _In_ )
#endif // defined(USE_SSE4_1)
// Copy remaining BYTE(s)
if( count )
{
MemCopy( dst, src, bytes );
MemCopy( p_dst, p_src, count );
}
}
@ -1547,8 +1556,7 @@ inline void FastCpuBltFromUSWC(
DWORD count,
CPU_INSTRUCTION_LEVEL level)
{
#ifndef _WIN64
#if defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
//back up the XMM registers just in case
__declspec( align(16) ) BYTE backUpRegisters[16*4];
@ -1559,8 +1567,8 @@ inline void FastCpuBltFromUSWC(
__asm movdqa xmmword ptr [ecx + 16*1], xmm1
__asm movdqa xmmword ptr [ecx + 16*2], xmm2
__asm movdqa xmmword ptr [ecx + 16*3], xmm3
#endif // defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
#endif //_WIN64
do
{
iSTD::FastMemCopyFromWC( dst, src, stride, level );
@ -1569,15 +1577,13 @@ inline void FastCpuBltFromUSWC(
src += srcPitch;
}
while( --count > 0 );
#ifndef _WIN64
#if defined ( _MSC_VER )
#if defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
__asm mov ecx, tempPtr
__asm movdqa xmm0, xmmword ptr [ecx + 16*0]
__asm movdqa xmm1, xmmword ptr [ecx + 16*1]
__asm movdqa xmm2, xmmword ptr [ecx + 16*2]
__asm movdqa xmm3, xmmword ptr [ecx + 16*3]
#endif
#endif //_WIN64
#endif // defined(USE_INLINE_ASM) && USE_INLINE_ASM == 1
}
#endif
@ -1611,6 +1617,8 @@ inline void FindWordBufferMinMax(
WORD wMaxValue = 0x0000;
size_t count = bytes / sizeof(WORD);
#if defined(USE_SSE4_1)
size_t i = 0;
if( IsAligned( pBuffer, sizeof(WORD) ) )
@ -1854,7 +1862,8 @@ inline void FindWordBufferMinMax(
} // if( count >= WordsPerQuadWord )
}
#endif
#endif // _WIN64
#endif // defined(USE_SSE4_1)
// Find min/max per value
while( count > 0 )
@ -1949,26 +1958,12 @@ inline void FindWordBufferMinMaxRestart(
if( count >= WordsPerDoubleQuadWord )
{
__m128i mInput, mRestarts, mMask;
__m128i mAll_ones;
__m128i mMinValue128i, mMaxValue128i;
// This is just used for andnot mInput
mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
// start with really high min and really low max
// What should happen if all values are restart?
mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
__m128i mAll_ones = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMinValue128i = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMaxValue128i = _mm_setzero_si128();
// Initialize register used for testing for restart index.
mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] =
(((UINT64) restart) << 48) |
(((UINT64) restart) << 32) |
(((UINT64) restart) << 16) |
((UINT64) restart);
mRestarts = _mm_setr_epi16(restart, restart, restart, restart, restart, restart, restart, restart);
while( count >= WordsPerPrefetch )
{
@ -2106,6 +2101,8 @@ inline void FindDWordBufferMinMax(
DWORD wMaxValue = 0x00000000;
DWORD count = bytes / sizeof(DWORD);
#if defined(USE_SSE4_1)
DWORD i = 0;
if( IsAligned( pBuffer, sizeof(DWORD) ) )
@ -2240,6 +2237,7 @@ inline void FindDWordBufferMinMax(
} // if( count >= DWordsPerDoubleQuadWord )
} // if( count >= DWordsPerDoubleQuadWord )
}
#endif // defined(USE_SSE4_1)
// Find min/max per value
while( count > 0 )
@ -2334,22 +2332,12 @@ inline void FindDWordBufferMinMaxRestart(
if( count >= DWordsPerPrefetch )
{
__m128i mInput, mRestarts, mMask;
__m128i mAll_ones;
__m128i mMinValue128i, mMaxValue128i;
// This is just used for andnot mInput
mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
// start with really high min and really low max
// What should happen if all values are restart?
mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
__m128i mAll_ones = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMinValue128i = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMaxValue128i = _mm_setzero_si128();
// Initialize register used for testing for restart index.
mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] = (((UINT64) restart) << 32) | ((UINT64) restart);
mRestarts = _mm_setr_epi32(restart, restart, restart, restart);
while( count >= DWordsPerPrefetch )
{
@ -2484,6 +2472,8 @@ inline void FindWordBufferMinMaxCopy(
WORD wMaxValue = 0x0000;
size_t count = bytes / sizeof(WORD);
#if defined(USE_SSE4_1)
size_t i = 0;
if( IsAligned( pBuffer, sizeof(WORD) ) )
@ -2735,7 +2725,8 @@ inline void FindWordBufferMinMaxCopy(
} // if( count >= WordsPerQuadWord )
}
#endif
#endif // _WIN64
#endif // defined(USE_SSE4_1)
// Find min/max per value
while( count > 0 )
@ -2784,6 +2775,8 @@ inline void FindDWordBufferMinMaxCopy(
DWORD wMaxValue = 0x00000000;
DWORD count = bytes / sizeof(DWORD);
#if defined(USE_SSE4_1)
DWORD i = 0;
if( IsAligned( pBuffer, sizeof(DWORD) ) )
@ -2922,6 +2915,7 @@ inline void FindDWordBufferMinMaxCopy(
} // if( count >= DWordsPerDoubleQuadWord )
} // if( count >= DWordsPerDoubleQuadWord )
}
#endif // defined(USE_SSE4_1)
// Find min/max per value
while( count > 0 )
@ -3019,26 +3013,12 @@ inline void FindWordBufferMinMaxRestartCopy(
if( count >= WordsPerDoubleQuadWord )
{
__m128i mInput, mRestarts, mMask;
__m128i mAll_ones;
__m128i mMinValue128i, mMaxValue128i;
// This is just used for andnot mInput
mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
// start with really high min and really low max
// What should happen if all values are restart?
mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
__m128i mAll_ones = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMinValue128i = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMaxValue128i = _mm_setzero_si128();
// Initialize register used for testing for restart index.
mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] =
(((UINT64) restart) << 48) |
(((UINT64) restart) << 32) |
(((UINT64) restart) << 16) |
((UINT64) restart);
mRestarts = _mm_setr_epi16(restart, restart, restart, restart, restart, restart, restart, restart);
while( count >= WordsPerPrefetch )
{
@ -3231,22 +3211,12 @@ inline void FindDWordBufferMinMaxRestartCopy(
if( count >= DWordsPerPrefetch )
{
__m128i mInput, mRestarts, mMask;
__m128i mAll_ones;
__m128i mMinValue128i, mMaxValue128i;
// This is just used for andnot mInput
mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
// start with really high min and really low max
// What should happen if all values are restart?
mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
__m128i mAll_ones = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMinValue128i = _mm_setr_epi32(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
__m128i mMaxValue128i = _mm_setzero_si128();
// Initialize register used for testing for restart index.
mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] = (((UINT64) restart) << 32) | ((UINT64) restart);
mRestarts = _mm_setr_epi32(restart, restart, restart, restart);
while( count >= DWordsPerPrefetch )
{
@ -3356,3 +3326,11 @@ inline void FindDWordBufferMinMaxRestartCopy(
} // iSTD
#if defined(USE_X86)
# undef USE_X86
#endif // defined(USE_X86)
#if defined(USE_SSE4_1)
# undef USE_SSE4_1
#endif // defined(USE_SSE4_1)

View File

@ -16,7 +16,6 @@ SPDX-License-Identifier: MIT
# include <intrin.h>
#else
# include <time.h>
# include <x86intrin.h>
# ifndef NSEC_PER_SEC
# define NSEC_PER_SEC 1000000000L
# endif
@ -33,7 +32,10 @@ Description:
\*****************************************************************************/
__forceinline void Pause( void )
{
    // Emits the x86 PAUSE hint through _mm_pause(); on any other host the
    // function body is empty.
#if !defined(_M_IX86) && !defined(_M_AMD64) && \
    !defined(__i386__) && !defined(__x86_64__)
    // Non-x86 host: no hint is issued.
#else
    _mm_pause();
#endif
}
#if defined _WIN32

View File

@ -14,7 +14,6 @@ SPDX-License-Identifier: MIT
#include "Print.h"
#include "Debug.h"
#include "utility.h"
#include "utilitySSE.h"
#include "CpuUtil.h"
#include "FloatUtil.h"
#include "FloatSafe.h"
@ -47,4 +46,3 @@ SPDX-License-Identifier: MIT
#include "LRUSet.h"
#include "LruHashTable.h"
#endif

View File

@ -1,71 +0,0 @@
/*========================== begin_copyright_notice ============================
Copyright (C) 2019-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#pragma once
#if defined(_WIN32)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include "types.h"
namespace iSTD
{

/*****************************************************************************\
Inline Function:
    FastClampF

Description:
    Clamps four packed floats lane by lane: each lane of the source is first
    raised to the matching lane of inMins, then limited to the matching lane
    of inMaxs. The result is written to oDest with an unaligned store.

    Three overloads differ only in where the source values come from:
      - an __m128 value that is already loaded,
      - four floats read (unaligned) from inSrc,
      - four floats read (unaligned) from oDest itself (in-place clamp).
\*****************************************************************************/
__forceinline void FastClampF( const __m128 &inMins,
                               const __m128 &inMaxs,
                               float* oDest,
                               const __m128 &inSrc )
{
    // Operand order is kept exactly as max(src, mins) followed by
    // min(maxs, ...), because min/max intrinsics are order-sensitive for NaN.
    const __m128 raisedToMins = _mm_max_ps( inSrc, inMins );
    const __m128 clamped      = _mm_min_ps( inMaxs, raisedToMins );

    _mm_storeu_ps( oDest, clamped );
}

__forceinline void FastClampF( const __m128 &inMins,
                               const __m128 &inMaxs,
                               float* oDest,
                               const float* inSrc )
{
    // Load the four source floats, then reuse the register-source overload.
    FastClampF( inMins, inMaxs, oDest, _mm_loadu_ps( inSrc ) );
}

__forceinline void FastClampF( const __m128 &inMins,
                               const __m128 &inMaxs,
                               float* oDest )
{
    // In-place variant: the destination doubles as the source.
    FastClampF( inMins, inMaxs, oDest, _mm_loadu_ps( oDest ) );
}

} // iSTD

View File

@ -19,7 +19,11 @@ SPDX-License-Identifier: MIT
namespace iOpenCL
{
#if __ARM_ARCH
void DebugMessageStr(std::string& output, unsigned int ulDebugLevel, const char* fmt, ...)
#else
void __cdecl DebugMessageStr(std::string& output, unsigned int ulDebugLevel, const char* fmt, ...)
#endif
{
if(IGC_IS_FLAG_ENABLED(ShaderDumpEnable) && IGC_IS_FLAG_ENABLED(EnableCosDump))
//if( str && ( ( g_DebugControl.MsgLevel & ulDebugLevel ) != GFXDBG_OFF ) )

View File

@ -871,6 +871,40 @@ foreach(_compilerFlagName IN ITEMS "CMAKE_CXX_FLAGS" "CMAKE_C_FLAGS")
endif()
elseif(_igc_compiler_is_gnu_or_clang)
if("${IGC_OPTION__ARCHITECTURE_TARGET}" STREQUAL "LinuxARM")
igc_config_flag_apply_settings(
CompilerOptions
"${_compilerFlagName}"
ALL_PATTERN ""
SET_RAW
-pipe
-fmessage-length=0
#-march=armv8.2-a+fp+simd+sve2
-march=armv8-a+fp+simd
-fms-extensions
-Werror
-Wno-unused-parameter
-Wno-missing-field-initializers
-Wwrite-strings
-Wno-long-long
-Wswitch
-Wno-sign-compare
-Wno-unused-result
-Wno-enum-compare
-Wno-type-limits
-Wno-ignored-qualifiers
-Wno-shadow
-Wformat
-Wformat-security
-Wno-extra
-Wno-write-strings
-finline
-fno-strict-aliasing
-Wno-unknown-pragmas
-fPIC
-Bsymbolic
)
else()
igc_config_flag_apply_settings(
CompilerOptions
"${_compilerFlagName}"
@ -910,6 +944,7 @@ foreach(_compilerFlagName IN ITEMS "CMAKE_CXX_FLAGS" "CMAKE_C_FLAGS")
-fPIC
-Bsymbolic
)
endif()
igc_config_flag_apply_settings(
CompilerOptions
"${_compilerFlagName}"
@ -1195,6 +1230,9 @@ add_compile_definitions(
$<$<STREQUAL:${IGC_OPTION__ARCHITECTURE_TARGET},Linux64>:USE_SSE2>
$<$<STREQUAL:${IGC_OPTION__ARCHITECTURE_TARGET},Linux64>:USE_SSE3>
$<$<STREQUAL:${IGC_OPTION__ARCHITECTURE_TARGET},Linux64>:USE_SSSE3>
$<$<STREQUAL:${IGC_OPTION__ARCHITECTURE_TARGET},LinuxARM>:USE_NEON>
#$<$<STREQUAL:${IGC_OPTION__ARCHITECTURE_TARGET},LinuxARM>:USE_SVE>
#$<$<STREQUAL:${IGC_OPTION__ARCHITECTURE_TARGET},LinuxARM>:USE_SVE2>
$<$<STREQUAL:${CMAKE_SYSTEM_NAME},Windows>:ICBE_LHDM>
$<$<STREQUAL:${CMAKE_SYSTEM_NAME},Windows>:_WIN32>
$<$<STREQUAL:${CMAKE_SYSTEM_NAME},Windows>:"WIN32">
@ -1975,8 +2013,18 @@ if(UNIX)
endif()
endif()
# Architecture-related options.
# Each packaging format has its own name for the same CPU architecture:
#   64-bit ARM : Debian "arm64", RPM "aarch64"
#   64-bit x86 : Debian "amd64", RPM "x86_64"
# CPACK_RPM_PACKAGE_ARCHITECTURE defaults to the output of `uname -m`, which is
# "aarch64" on 64-bit ARM Linux, so the RPM value must be "aarch64" rather than
# the Debian spelling "arm64".
if("${IGC_OPTION__ARCHITECTURE_TARGET}" STREQUAL "LinuxARM")
  set(CPACK_PACKAGE_ARCHITECTURE "arm64")
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "arm64")
  set(CPACK_RPM_PACKAGE_ARCHITECTURE "aarch64")
else()
  set(CPACK_PACKAGE_ARCHITECTURE "x86_64")
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
  set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
endif()
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "intel")
set(CPACK_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_PACKAGE_NAME "intel")
set(CPACK_PACKAGE_VERSION_MAJOR ${IGC_API_MAJOR_VERSION})
set(CPACK_PACKAGE_VERSION_MINOR ${IGC_API_MINOR_VERSION})
@ -1986,9 +2034,7 @@ if(UNIX)
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "postinst;postrm")
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "Intel(R) Graphics Compiler for OpenCL(TM)")
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
set(CPACK_RPM_PACKAGE_RELOCATABLE TRUE)
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_DEBIAN_IGC-CORE_FILE_NAME "intel-igc-core_${CPACK_PACKAGE_VERSION}_${CPACK_DEBIAN_PACKAGE_ARCHITECTURE}.deb")
set(CPACK_DEBIAN_IGC-MEDIA_FILE_NAME "intel-igc-media_${CPACK_PACKAGE_VERSION}_${CPACK_DEBIAN_PACKAGE_ARCHITECTURE}.deb")

View File

@ -103,17 +103,26 @@ elseif(${CCLANG_BUILD_PREBUILDS})
set(CLANG_TOOL_PATH "${CCLANG_BUILD_PREBUILDS_DIR}/clang${CMAKE_EXECUTABLE_SUFFIX}")
set(LLVM_PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}")
# Get clang-tool version
execute_process(
COMMAND ${CLANG_TOOL_PATH} -v
ERROR_VARIABLE CLANG_TOOL_V_CALL)
string(REGEX MATCH "clang version ([0-9]*\\.[0-9]*\\.[0-9]*[a-zA-Z0-9]*)" CLANG_TOOL_VERSION "${CLANG_TOOL_V_CALL}")
set(CLANG_TOOL_VERSION "${CMAKE_MATCH_1}")
if(CMAKE_CROSSCOMPILING)
# When cross-compiling, the prebuilt clang tool cannot be executed on the
# build host, so we trust the version number provided externally or assume
# it is the same as the LLVM version.
if(NOT DEFINED CLANG_TOOL_VERSION)
set(CLANG_TOOL_VERSION "${LLVM_PACKAGE_VERSION}")
endif()
else(CMAKE_CROSSCOMPILING)
# Get clang-tool version
execute_process(
COMMAND ${CLANG_TOOL_PATH} -v
ERROR_VARIABLE CLANG_TOOL_V_CALL)
string(REGEX MATCH "clang version ([0-9]*\\.[0-9]*\\.[0-9]*[a-zA-Z0-9]*)" CLANG_TOOL_VERSION "${CLANG_TOOL_V_CALL}")
set(CLANG_TOOL_VERSION "${CMAKE_MATCH_1}")
# Check if we parse clang tool version correctly
if(NOT CLANG_TOOL_VERSION)
message(FATAL_ERROR "[IGC] : Cannot read version of clang tool, please check the output of execution `clang -v` : ${CLANG_TOOL_V_CALL}")
endif()
# Check if we parse clang tool version correctly
if(NOT CLANG_TOOL_VERSION)
message(FATAL_ERROR "[IGC] : Cannot read version of clang tool, please check the output of execution `clang -v` : ${CLANG_TOOL_V_CALL}")
endif()
endif(CMAKE_CROSSCOMPILING)
# Check if llvm version for IGC is newer or equal with the clang-tool version
if(${LLVM_PACKAGE_VERSION} VERSION_GREATER ${CLANG_TOOL_VERSION} OR

View File

@ -25,6 +25,13 @@ set(CMAKE_MODULE_PATH
)
# Default: refer to llvm-tblgen by its bare name.
set(LLVM_TABLEGEN_EXE "llvm-tblgen")
# Cross-compilation override: use an explicitly supplied LLVM_TABLEGEN if one
# is defined, otherwise search for llvm-tblgen in LLVM_TOOLS_BINARY_DIR only
# (NO_DEFAULT_PATH disables the default search locations).
# Presumably needed because a target-architecture llvm-tblgen cannot run on
# the build host - TODO confirm.
if(CMAKE_CROSSCOMPILING)
if(DEFINED LLVM_TABLEGEN)
set(LLVM_TABLEGEN_EXE ${LLVM_TABLEGEN})
else()
# NOTE(review): LLVM_TABLEGEN_EXE is already set as a normal variable above,
# while find_program stores its result in a cache entry of the same name.
# Confirm which value later code reads (see policy CMP0125).
find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
endif()
endif()
set(LLVM_LINK_EXE "llvm-link" CACHE STRING "")

View File

@ -50,7 +50,7 @@ static void * GetProcAddress(
#if defined(WIN32)
#define CDECLATTRIBUTE __cdecl
#elif __GNUC__
#ifdef __x86_64__
#if defined(__x86_64__) || defined(__ARM_ARCH)
#define CDECLATTRIBUTE
#else
#define CDECLATTRIBUTE __attribute__((__cdecl__))