mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-10 15:12:56 +08:00
Add NEON intrinsics for aarch64

Related-To: NEO-6452
Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
c7d8915dd4
commit
cf906030ac
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (C) 2019-2021 Intel Corporation
|
||||
# Copyright (C) 2019-2022 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
@@ -10,5 +10,12 @@ if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
|
||||
)
|
||||
|
||||
# Add the NEON-accelerated local-ID generator sources only when the configure
# step detected NEON intrinsics support in the toolchain
# (COMPILER_SUPPORTS_NEON is presumably set elsewhere in the build — confirm).
if(COMPILER_SUPPORTS_NEON)
  list(APPEND NEO_CORE_HELPERS
       ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_neon.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon.h
  )
endif()
|
||||
|
||||
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
|
||||
endif()
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -9,10 +9,12 @@
|
||||
|
||||
#include "shared/source/helpers/aligned_memory.h"
|
||||
#include "shared/source/helpers/local_id_gen_special.inl"
|
||||
#include "shared/source/utilities/cpu_info.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
struct uint16x8_t;
|
||||
struct uint16x16_t;
|
||||
|
||||
// This is the initial value of SIMD for local ID
|
||||
// computation. It correlates to the SIMD lane.
|
||||
@@ -27,6 +29,18 @@ void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3>
|
||||
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
|
||||
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
|
||||
|
||||
// Initialize the lookup table based on CPU capabilities
|
||||
// Runs once during static initialization (via LocalIDHelper::initializer).
// When the CPU reports NEON support, repoints the SIMD local-ID generator
// function pointers at the NEON-backed template instantiations; otherwise the
// defaults assigned at namespace scope remain in effect.
LocalIDHelper::LocalIDHelper() {
    bool supportsNEON = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureNeon);
    if (supportsNEON) {
        // SIMD8 keeps the 8-lane vector type; SIMD16/32 switch to the 16-lane
        // emulated vector defined in uint16_neon.h.
        LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
        LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
        LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
    }
}
|
||||
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
|
||||
17
shared/source/helpers/aarch64/local_id_gen_neon.cpp
Normal file
17
shared/source/helpers/aarch64/local_id_gen_neon.cpp
Normal file
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/helpers/aarch64/uint16_neon.h"
|
||||
#include "shared/source/helpers/local_id_gen.inl"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace NEO {
// Explicit instantiations of the SIMD local-ID generator for the NEON-backed
// 16-lane vector type. The template definition comes from local_id_gen.inl,
// included above; instantiating here keeps the NEON code in this translation
// unit, which is only compiled when COMPILER_SUPPORTS_NEON is set.
// NOTE(review): local_id_gen.cpp assigns generateLocalIDsSimd<uint16x8_t, 8>
// for SIMD8 on NEON-capable CPUs, yet only uint16x16_t variants are
// instantiated here — confirm the <uint16x8_t, 8> instantiation is provided
// elsewhere, or the <uint16x16_t, 8> instantiation below is unused.
template void generateLocalIDsSimd<uint16x16_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
} // namespace NEO
|
||||
173
shared/source/helpers/aarch64/uint16_neon.h
Normal file
173
shared/source/helpers/aarch64/uint16_neon.h
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/helpers/aligned_memory.h"
|
||||
#include "shared/source/helpers/debug_helpers.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <cstdint>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
// 16-lane unsigned 16-bit vector emulated with a pair of native 8-lane NEON
// registers: value.val[0] holds lanes 0-7, value.val[1] holds lanes 8-15.
// Provides the minimal interface required by generateLocalIDsSimd.
struct uint16x16_t {
    enum { numChannels = 16 };

    uint16x8x2_t value;

    // Default-construct with all lanes zero.
    uint16x16_t() {
        value.val[0] = vdupq_n_u16(0);
        value.val[1] = vdupq_n_u16(0);
    }

    // Construct from two native 8-lane halves (lo = lanes 0-7, hi = lanes 8-15).
    uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
        value.val[0] = lo;
        value.val[1] = hi;
    }

    // Broadcast a scalar into all 16 lanes (intentionally implicit — used by
    // zero()/one()/mask() and scalar-to-vector conversions in the generator).
    uint16x16_t(uint16_t a) {
        value.val[0] = vdupq_n_u16(a);
        value.val[1] = vdupq_n_u16(a);
    }

    // Load 16 lanes from 32-byte-aligned memory.
    explicit uint16x16_t(const void *alignedPtr) {
        load(alignedPtr);
    }

    // Returns the lane at 'element' (valid range 0..15).
    // vgetq_lane_u16 requires a compile-time constant lane index, hence the
    // switch. 'result' is zero-initialized so an out-of-range 'element' in a
    // release build (DEBUG_BREAK_IF compiles out) returns a defined value
    // instead of reading an uninitialized variable.
    inline uint16_t get(unsigned int element) {
        DEBUG_BREAK_IF(element >= numChannels);
        uint16_t result = 0;
        switch (element) {
        case 0:
            result = vgetq_lane_u16(value.val[0], 0);
            break;
        case 1:
            result = vgetq_lane_u16(value.val[0], 1);
            break;
        case 2:
            result = vgetq_lane_u16(value.val[0], 2);
            break;
        case 3:
            result = vgetq_lane_u16(value.val[0], 3);
            break;
        case 4:
            result = vgetq_lane_u16(value.val[0], 4);
            break;
        case 5:
            result = vgetq_lane_u16(value.val[0], 5);
            break;
        case 6:
            result = vgetq_lane_u16(value.val[0], 6);
            break;
        case 7:
            result = vgetq_lane_u16(value.val[0], 7);
            break;
        case 8:
            result = vgetq_lane_u16(value.val[1], 0);
            break;
        case 9:
            result = vgetq_lane_u16(value.val[1], 1);
            break;
        case 10:
            result = vgetq_lane_u16(value.val[1], 2);
            break;
        case 11:
            result = vgetq_lane_u16(value.val[1], 3);
            break;
        case 12:
            result = vgetq_lane_u16(value.val[1], 4);
            break;
        case 13:
            result = vgetq_lane_u16(value.val[1], 5);
            break;
        case 14:
            result = vgetq_lane_u16(value.val[1], 6);
            break;
        case 15:
            result = vgetq_lane_u16(value.val[1], 7);
            break;
        default:
            break;
        }

        return result;
    }

    // All lanes 0.
    static inline uint16x16_t zero() {
        return uint16x16_t(static_cast<uint16_t>(0u));
    }

    // All lanes 1.
    static inline uint16x16_t one() {
        return uint16x16_t(static_cast<uint16_t>(1u));
    }

    // All lanes 0xffff (all bits set) — used as the NOT mask in operator>=.
    static inline uint16x16_t mask() {
        return uint16x16_t(static_cast<uint16_t>(0xffffu));
    }

    // Load 16 lanes from memory; pointer must be 32-byte aligned.
    inline void load(const void *alignedPtr) {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
    }

    // Store 16 lanes to memory; pointer must be 32-byte aligned.
    inline void store(void *alignedPtr) {
        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
        vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
    }

    // True when any lane is non-zero: OR both halves as 64-bit chunks, then
    // OR-reduce to a single 64-bit value.
    inline operator bool() const {
        uint64x2_t lo = vreinterpretq_u64_u16(value.val[0]);
        uint64x2_t hi = vreinterpretq_u64_u16(value.val[1]);
        uint64x2_t tmp = vorrq_u64(lo, hi);
        uint64_t result = vget_lane_u64(vorr_u64(vget_high_u64(tmp), vget_low_u64(tmp)), 0);

        return result != 0;
    }

    // Lane-wise subtraction (modular, as for native NEON u16).
    inline uint16x16_t &operator-=(const uint16x16_t &a) {
        value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);

        return *this;
    }

    // Lane-wise addition (modular, as for native NEON u16).
    inline uint16x16_t &operator+=(const uint16x16_t &a) {
        value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
        value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);

        return *this;
    }

    // Lane-wise a >= b, computed as NOT(b > a); each true lane is all-ones
    // (0xffff), each false lane is 0 — matching NEON comparison semantics.
    inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
        uint16x16_t result;

        result.value.val[0] = veorq_u16(mask().value.val[0],
                                        vcgtq_u16(b.value.val[0], a.value.val[0]));
        result.value.val[1] = veorq_u16(mask().value.val[1],
                                        vcgtq_u16(b.value.val[1], a.value.val[1]));
        return result;
    }

    // Lane-wise bitwise AND of two comparison masks (acts as logical AND when
    // both operands hold all-ones/all-zeros lanes).
    inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
        uint16x16_t result;

        result.value.val[0] = vandq_u16(a.value.val[0], b.value.val[0]);
        result.value.val[1] = vandq_u16(a.value.val[1], b.value.val[1]);

        return result;
    }

    // NOTE: uint16x16_t::blend behaves like mask ? a : b — bit-select per lane.
    inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
        uint16x16_t result;

        result.value.val[0] = vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]);
        result.value.val[1] = vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]);

        return result;
    }
};
|
||||
} // namespace NEO
|
||||
Reference in New Issue
Block a user