Add NEON intrinsics for aarch64

Related-To: NEO-6452

Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
Author: Sebastian Luzynski
Authored: 2022-03-28 16:30:45 +00:00
Committed by: Compute-Runtime-Automation
Commit: cf906030ac (parent: c7d8915dd4)
16 changed files with 445 additions and 27 deletions

shared/source/helpers/aarch64/CMakeLists.txt

@@ -1,5 +1,5 @@
#
# Copyright (C) 2019-2021 Intel Corporation
# Copyright (C) 2019-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -10,5 +10,12 @@ if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
)
if(COMPILER_SUPPORTS_NEON)
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_neon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon.h
)
endif()
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()
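Note: the NEON-specific sources are only compiled when COMPILER_SUPPORTS_NEON is set by the build system. As a minimal sketch (assuming the standard ACLE __ARM_NEON predefine; the repository's actual detection may differ), the same guard expressed in C++ would look like:

#if defined(__ARM_NEON)
#include <arm_neon.h> // NEON intrinsic declarations, only present on NEON-capable targets
#endif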

shared/source/helpers/aarch64/local_id_gen.cpp

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -9,10 +9,12 @@
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
#include "shared/source/utilities/cpu_info.h"
namespace NEO {
struct uint16x8_t;
struct uint16x16_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
@@ -27,6 +29,18 @@ void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3>
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsNEON = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureNeon);
if (supportsNEON) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}
LocalIDHelper LocalIDHelper::initializer;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
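Note: the constructor above follows the existing runtime-dispatch pattern: portable defaults are installed statically, and a one-time initializer upgrades the function pointers when the CPU reports NEON support. A simplified, self-contained sketch of that pattern (hypothetical names, not the NEO code):

using GenerateFn = void (*)(void *);

static void generatePortable(void *) { /* scalar fallback */ }
static void generateWithNeon(void *) { /* NEON-accelerated path */ }

static bool cpuHasNeon() {
    // Placeholder for a runtime query such as CpuInfo::isFeatureSupported(CpuInfo::featureNeon).
    return true;
}

static GenerateFn generate = generatePortable; // safe default

namespace {
struct DispatchInit {
    DispatchInit() {
        if (cpuHasNeon()) {
            generate = generateWithNeon; // swap in the wider implementation once, before first use
        }
    }
} dispatchInit; // mirrors LocalIDHelper::initializer above
} // namespace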

shared/source/helpers/aarch64/local_id_gen_neon.cpp

@@ -0,0 +1,17 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/aarch64/uint16_neon.h"
#include "shared/source/helpers/local_id_gen.inl"
#include <array>
namespace NEO {
template void generateLocalIDsSimd<uint16x16_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
} // namespace NEO
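Note: this new translation unit contains only explicit instantiations. The template definition comes from local_id_gen.inl, and compiling just this file with NEON enabled keeps NEON code generation out of the generic sources while still letting local_id_gen.cpp reference the uint16x16_t variants through its function pointers. A minimal illustration of the mechanism, with hypothetical names:

// Shared template definition (normally pulled in from an .inl file).
template <typename T>
T addOne(T value) {
    return value + T{1};
}

// Explicit instantiation: forces addOne<int> to be emitted in this translation unit
// so other files can call it without ever seeing the definition.
template int addOne<int>(int value);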

shared/source/helpers/aarch64/uint16_neon.h

@@ -0,0 +1,173 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include <arm_neon.h>
#include <cstdint>
namespace NEO {
struct uint16x16_t {
enum { numChannels = 16 };
uint16x8x2_t value;
uint16x16_t() {
value.val[0] = vdupq_n_u16(0);
value.val[1] = vdupq_n_u16(0);
}
uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
value.val[0] = lo;
value.val[1] = hi;
}
uint16x16_t(uint16_t a) {
value.val[0] = vdupq_n_u16(a);
value.val[1] = vdupq_n_u16(a);
}
explicit uint16x16_t(const void *alignedPtr) {
load(alignedPtr);
}
inline uint16_t get(unsigned int element) {
DEBUG_BREAK_IF(element >= numChannels);
uint16_t result;
// vgetq_lane_u16 requires the lane index to be a compile-time constant,
// so a runtime index is dispatched through this switch
switch (element) {
case 0:
result = vgetq_lane_u16(value.val[0], 0);
break;
case 1:
result = vgetq_lane_u16(value.val[0], 1);
break;
case 2:
result = vgetq_lane_u16(value.val[0], 2);
break;
case 3:
result = vgetq_lane_u16(value.val[0], 3);
break;
case 4:
result = vgetq_lane_u16(value.val[0], 4);
break;
case 5:
result = vgetq_lane_u16(value.val[0], 5);
break;
case 6:
result = vgetq_lane_u16(value.val[0], 6);
break;
case 7:
result = vgetq_lane_u16(value.val[0], 7);
break;
case 8:
result = vgetq_lane_u16(value.val[1], 0);
break;
case 9:
result = vgetq_lane_u16(value.val[1], 1);
break;
case 10:
result = vgetq_lane_u16(value.val[1], 2);
break;
case 11:
result = vgetq_lane_u16(value.val[1], 3);
break;
case 12:
result = vgetq_lane_u16(value.val[1], 4);
break;
case 13:
result = vgetq_lane_u16(value.val[1], 5);
break;
case 14:
result = vgetq_lane_u16(value.val[1], 6);
break;
case 15:
result = vgetq_lane_u16(value.val[1], 7);
break;
}
return result;
}
static inline uint16x16_t zero() {
return uint16x16_t(static_cast<uint16_t>(0u));
}
static inline uint16x16_t one() {
return uint16x16_t(static_cast<uint16_t>(1u));
}
static inline uint16x16_t mask() {
return uint16x16_t(static_cast<uint16_t>(0xffffu));
}
inline void load(const void *alignedPtr) {
DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
}
inline void store(void *alignedPtr) {
DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
}
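// Reduces all 16 lanes with a 64-bit OR tree; the conversion yields true iff any lane is nonzero.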
inline operator bool() const {
uint64x2_t lo = vreinterpretq_u64_u16(value.val[0]);
uint64x2_t hi = vreinterpretq_u64_u16(value.val[1]);
uint64x2_t tmp = vorrq_u64(lo, hi);
uint64_t result = vget_lane_u64(vorr_u64(vget_high_u64(tmp), vget_low_u64(tmp)), 0);
return result;
}
inline uint16x16_t &operator-=(const uint16x16_t &a) {
value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);
return *this;
}
inline uint16x16_t &operator+=(const uint16x16_t &a) {
value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);
return *this;
}
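// Per-lane a >= b, computed as the complement of (b > a); true lanes are set to 0xffff.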
inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
uint16x16_t result;
result.value.val[0] = veorq_u16(mask().value.val[0],
vcgtq_u16(b.value.val[0], a.value.val[0]));
result.value.val[1] = veorq_u16(mask().value.val[1],
vcgtq_u16(b.value.val[1], a.value.val[1]));
return result;
}
inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
uint16x16_t result;
result.value.val[0] = vandq_u16(a.value.val[0], b.value.val[0]);
result.value.val[1] = vandq_u16(a.value.val[1], b.value.val[1]);
return result;
}
// NOTE: uint16x16_t::blend behaves like mask ? a : b
inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
uint16x16_t result;
result.value.val[0] = vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]);
result.value.val[1] = vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]);
return result;
}
};
} // namespace NEO
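Note: for context, a minimal usage sketch of the new helper (hypothetical test code, assuming an AArch64 build with NEON enabled and the repository include paths):

#include "shared/source/helpers/aarch64/uint16_neon.h"

#include <cstdint>
#include <cstdio>

int main() {
    alignas(32) uint16_t lhs[16];
    alignas(32) uint16_t rhs[16];
    alignas(32) uint16_t out[16];
    for (uint16_t i = 0; i < 16; ++i) {
        lhs[i] = i; // 0..15
        rhs[i] = 7;
    }
    NEO::uint16x16_t a(lhs); // 32-byte-aligned load of 16 lanes
    NEO::uint16x16_t b(rhs);
    a += b;                  // per-lane add: a[i] = i + 7
    auto mask = a >= NEO::uint16x16_t(static_cast<uint16_t>(15)); // 0xffff where a[i] >= 15
    auto sel = blend(a, NEO::uint16x16_t::zero(), mask);          // mask ? a : 0
    sel.store(out);
    printf("%u %u\n", static_cast<unsigned>(out[0]), static_cast<unsigned>(out[15])); // expected: 0 22
    return 0;
}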