Add neon intrinsics for aarch64

Related-To: NEO-6452

Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
This commit is contained in:
Sebastian Luzynski
2022-03-28 16:30:45 +00:00
committed by Compute-Runtime-Automation
parent c7d8915dd4
commit cf906030ac
16 changed files with 445 additions and 27 deletions

View File

@@ -618,6 +618,7 @@ else()
endif()
check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
check_cxx_compiler_flag(-march=armv8-a+simd COMPILER_SUPPORTS_NEON)
endif()
if(NOT MSVC)

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2019-2021 Intel Corporation
# Copyright (C) 2019-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -10,5 +10,12 @@ if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
)
if(COMPILER_SUPPORTS_NEON)
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_neon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon.h
)
endif()
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -9,10 +9,12 @@
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
#include "shared/source/utilities/cpu_info.h"
namespace NEO {
struct uint16x8_t;
struct uint16x16_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
@@ -27,6 +29,18 @@ void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3>
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsNEON = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureNeon);
if (supportsNEON) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}
LocalIDHelper LocalIDHelper::initializer;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);

View File

@@ -0,0 +1,17 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/aarch64/uint16_neon.h"
#include "shared/source/helpers/local_id_gen.inl"
#include <array>
namespace NEO {
template void generateLocalIDsSimd<uint16x16_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
} // namespace NEO

View File

@@ -0,0 +1,173 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include <arm_neon.h>
#include <cstdint>
namespace NEO {
struct uint16x16_t {
enum { numChannels = 16 };
uint16x8x2_t value;
uint16x16_t() {
value.val[0] = vdupq_n_u16(0);
value.val[1] = vdupq_n_u16(0);
}
uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
value.val[0] = lo;
value.val[1] = hi;
}
uint16x16_t(uint16_t a) {
value.val[0] = vdupq_n_u16(a);
value.val[1] = vdupq_n_u16(a);
}
explicit uint16x16_t(const void *alignedPtr) {
load(alignedPtr);
}
inline uint16_t get(unsigned int element) {
DEBUG_BREAK_IF(element >= numChannels);
uint16_t result;
// vgetq_lane requires constant immediate
switch (element) {
case 0:
result = vgetq_lane_u16(value.val[0], 0);
break;
case 1:
result = vgetq_lane_u16(value.val[0], 1);
break;
case 2:
result = vgetq_lane_u16(value.val[0], 2);
break;
case 3:
result = vgetq_lane_u16(value.val[0], 3);
break;
case 4:
result = vgetq_lane_u16(value.val[0], 4);
break;
case 5:
result = vgetq_lane_u16(value.val[0], 5);
break;
case 6:
result = vgetq_lane_u16(value.val[0], 6);
break;
case 7:
result = vgetq_lane_u16(value.val[0], 7);
break;
case 8:
result = vgetq_lane_u16(value.val[1], 0);
break;
case 9:
result = vgetq_lane_u16(value.val[1], 1);
break;
case 10:
result = vgetq_lane_u16(value.val[1], 2);
break;
case 11:
result = vgetq_lane_u16(value.val[1], 3);
break;
case 12:
result = vgetq_lane_u16(value.val[1], 4);
break;
case 13:
result = vgetq_lane_u16(value.val[1], 5);
break;
case 14:
result = vgetq_lane_u16(value.val[1], 6);
break;
case 15:
result = vgetq_lane_u16(value.val[1], 7);
break;
}
return result;
}
static inline uint16x16_t zero() {
return uint16x16_t(static_cast<uint16_t>(0u));
}
static inline uint16x16_t one() {
return uint16x16_t(static_cast<uint16_t>(1u));
}
static inline uint16x16_t mask() {
return uint16x16_t(static_cast<uint16_t>(0xffffu));
}
inline void load(const void *alignedPtr) {
DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
}
inline void store(void *alignedPtr) {
DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
}
inline operator bool() const {
uint64x2_t hi = vreinterpretq_u64_u16(value.val[0]);
uint64x2_t lo = vreinterpretq_u64_u16(value.val[1]);
uint64x2_t tmp = vorrq_u64(hi, lo);
uint64_t result = vget_lane_u64(vorr_u64(vget_high_u64(tmp), vget_low_u64(tmp)), 0);
return result;
}
inline uint16x16_t &operator-=(const uint16x16_t &a) {
value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);
return *this;
}
inline uint16x16_t &operator+=(const uint16x16_t &a) {
value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);
return *this;
}
inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
uint16x16_t result;
result.value.val[0] = veorq_u16(mask().value.val[0],
vcgtq_u16(b.value.val[0], a.value.val[0]));
result.value.val[1] = veorq_u16(mask().value.val[1],
vcgtq_u16(b.value.val[1], a.value.val[1]));
return result;
}
inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
uint16x16_t result;
result.value.val[0] = vandq_u16(a.value.val[0], b.value.val[0]);
result.value.val[1] = vandq_u16(a.value.val[1], b.value.val[1]);
return result;
}
// NOTE: uint16x16_t::blend behaves like mask ? a : b
inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
uint16x16_t result;
result.value.val[0] = vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]);
result.value.val[1] = vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]);
return result;
}
};
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,7 +7,12 @@
#include "shared/source/utilities/cpu_info.h"
#include <asm/hwcap.h>
namespace NEO {
void CpuInfo::detect() const {
uint32_t cpuInfo[4] = {};
cpuid(cpuInfo, 0u);
features |= cpuInfo[0] & HWCAP_ASIMD ? featureNeon : featureNone;
}
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -42,6 +42,7 @@ struct CpuInfo {
static const uint64_t featureHle = 0x000200000ULL;
static const uint64_t featureRtm = 0x000400000ULL;
static const uint64_t featureAvX2 = 0x000800000ULL;
static const uint64_t featureNeon = 0x001000000ULL;
static const uint64_t featureKncni = 0x004000000ULL;
static const uint64_t featureAvX512F = 0x008000000ULL;
static const uint64_t featureAdx = 0x010000000ULL;

View File

@@ -11,10 +11,12 @@
#include <cstdint>
#include <fstream>
#include <sys/auxv.h>
namespace NEO {
void cpuid_linux_wrapper(int cpuInfo[4], int functionId) {
cpuInfo[0] = getauxval(AT_HWCAP);
}
void cpuidex_linux_wrapper(int *cpuInfo, int functionId, int subfunctionId) {
@@ -24,7 +26,7 @@ void get_cpu_flags_linux(std::string &cpuFlags) {
std::ifstream cpuinfo(std::string(Os::sysFsProcPathPrefix) + "/cpuinfo");
std::string line;
while (std::getline(cpuinfo, line)) {
if (line.substr(0, 5) == "flags") {
if (line.substr(0, 8) == "Features") {
cpuFlags = line;
break;
}

View File

@@ -25,5 +25,11 @@ set(IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/test_hw_info_config.cpp
)
if(COMPILER_SUPPORTS_NEON)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon_tests.cpp
)
endif()
target_sources(${TARGET_NAME} PRIVATE ${IGDRCL_SRCS_tests_helpers})
add_subdirectories()

View File

@@ -0,0 +1,111 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/aarch64/uint16_neon.h"
#include "shared/source/helpers/aligned_memory.h"
#include "gtest/gtest.h"
using namespace NEO;
TEST(Uint16Neon, GivenNeonAndMaskWhenCastingToBoolThenTrueIsReturned) {
EXPECT_TRUE(static_cast<bool>(NEO::uint16x16_t::mask()));
}
TEST(Uint16Neon, GivenNeonAndZeroWhenCastingToBoolThenFalseIsReturned) {
EXPECT_FALSE(static_cast<bool>(NEO::uint16x16_t::zero()));
}
TEST(Uint16Neon, GivenNeonWhenConjoiningMaskAndZeroThenBooleanResultIsCorrect) {
EXPECT_TRUE(NEO::uint16x16_t::mask() && NEO::uint16x16_t::mask());
EXPECT_FALSE(NEO::uint16x16_t::mask() && NEO::uint16x16_t::zero());
EXPECT_FALSE(NEO::uint16x16_t::zero() && NEO::uint16x16_t::mask());
EXPECT_FALSE(NEO::uint16x16_t::zero() && NEO::uint16x16_t::zero());
}
TEST(Uint16Neon, GivenNeonAndOneWhenCreatingThenInstancesAreSame) {
auto one = NEO::uint16x16_t::one();
NEO::uint16x16_t alsoOne(one);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(NEO::uint16x16_t)));
}
TEST(Uint16Neon, GivenNeonAndValueWhenCreatingThenConstructorIsReplicated) {
NEO::uint16x16_t allSevens(7u);
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(7u, allSevens.get(i));
}
}
ALIGNAS(32)
static const uint16_t laneValues[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
TEST(Uint16Neon, GivenNeonAndArrayWhenCreatingThenConstructorIsReplicated) {
NEO::uint16x16_t lanes(laneValues);
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}
TEST(Uint16Neon, GivenNeonWhenLoadingThenValuesAreSetCorrectly) {
NEO::uint16x16_t lanes;
lanes.load(laneValues);
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}
TEST(Uint16Neon, GivenNeonWhenStoringThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));
NEO::uint16x16_t lanes(laneValues);
lanes.store(alignedMemory);
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), alignedMemory[i]);
}
alignedFree(alignedMemory);
}
TEST(Uint16Neon, GivenNeonWhenDecrementingThenValuesAreSetCorrectly) {
NEO::uint16x16_t result(laneValues);
result -= NEO::uint16x16_t::one();
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i - 1), result.get(i));
}
}
TEST(Uint16Neon, GivenNeonWhenIncrementingThenValuesAreSetCorrectly) {
NEO::uint16x16_t result(laneValues);
result += NEO::uint16x16_t::one();
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), result.get(i));
}
}
TEST(Uint16Sse4, GivenNeonWhenBlendingThenValuesAreSetCorrectly) {
NEO::uint16x16_t a(NEO::uint16x16_t::one());
NEO::uint16x16_t b(NEO::uint16x16_t::zero());
NEO::uint16x16_t c;
// c = mask ? a : b
c = blend(a, b, NEO::uint16x16_t::mask());
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(a.get(i), c.get(i));
}
// c = mask ? a : b
c = blend(a, b, NEO::uint16x16_t::zero());
for (int i = 0; i < NEO::uint16x16_t::numChannels; ++i) {
EXPECT_EQ(b.get(i), c.get(i));
}
}

View File

@@ -0,0 +1,12 @@
#
# Copyright (C) 2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
target_sources(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/cpuinfo_tests_aarch64.cpp
)
endif()

View File

@@ -0,0 +1,38 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/file_io.h"
#include "shared/source/os_interface/linux/os_inc.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/test/common/helpers/variable_backup.h"
#include "gtest/gtest.h"
#include <cstdio>
#include <fstream>
using namespace NEO;
TEST(CpuInfoAarch64, givenProcCpuinfoFileExistsWhenIsCpuFlagPresentIsCalledThenValidValueIsReturned) {
VariableBackup<const char *> pathPrefixBackup(&Os::sysFsProcPathPrefix, "./test_files");
std::string cpuinfoFile = "./test_files/cpuinfo";
EXPECT_FALSE(fileExists(cpuinfoFile));
{
std::ofstream cpuinfo(cpuinfoFile);
cpuinfo << "processor\t\t: 0\nFeatures\t\t: flag1 flag2 flag3\n";
}
EXPECT_TRUE(fileExists(cpuinfoFile));
CpuInfo testCpuInfo;
EXPECT_TRUE(testCpuInfo.isCpuFlagPresent("flag1"));
EXPECT_TRUE(testCpuInfo.isCpuFlagPresent("flag2"));
EXPECT_FALSE(testCpuInfo.isCpuFlagPresent("nonExistingCpuFlag"));
std::remove(cpuinfoFile.c_str());
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -17,26 +17,6 @@
using namespace NEO;
TEST(CpuInfo, givenProcCpuinfoFileExistsWhenIsCpuFlagPresentIsCalledThenValidValueIsReturned) {
VariableBackup<const char *> pathPrefixBackup(&Os::sysFsProcPathPrefix, "./test_files");
std::string cpuinfoFile = "./test_files/cpuinfo";
EXPECT_FALSE(fileExists(cpuinfoFile));
{
std::ofstream cpuinfo(cpuinfoFile);
cpuinfo << "processor\t\t: 0\nflags\t\t: flag1 flag2 flag3\n";
}
EXPECT_TRUE(fileExists(cpuinfoFile));
CpuInfo testCpuInfo;
EXPECT_TRUE(testCpuInfo.isCpuFlagPresent("flag1"));
EXPECT_TRUE(testCpuInfo.isCpuFlagPresent("flag2"));
EXPECT_FALSE(testCpuInfo.isCpuFlagPresent("nonExistingCpuFlag"));
std::remove(cpuinfoFile.c_str());
}
TEST(CpuInfo, givenProcCpuinfoFileIsNotExistsWhenIsCpuFlagPresentIsCalledThenValidValueIsReturned) {
std::string cpuinfoFile = "test_files/linux/proc/cpuinfo";
EXPECT_FALSE(fileExists(cpuinfoFile));

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2021 Intel Corporation
# Copyright (C) 2021-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -9,4 +9,6 @@ if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/cpuinfo_tests_x86_64.cpp
)
add_subdirectories()
endif()

View File

@@ -0,0 +1,11 @@
#
# Copyright (C) 2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(UNIX)
target_sources(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/cpuinfo_tests_x86_64_linux.cpp
)
endif()

View File

@@ -0,0 +1,38 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/file_io.h"
#include "shared/source/os_interface/linux/os_inc.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/test/common/helpers/variable_backup.h"
#include "gtest/gtest.h"
#include <cstdio>
#include <fstream>
using namespace NEO;
TEST(CpuInfo, givenProcCpuinfoFileExistsWhenIsCpuFlagPresentIsCalledThenValidValueIsReturned) {
VariableBackup<const char *> pathPrefixBackup(&Os::sysFsProcPathPrefix, "./test_files");
std::string cpuinfoFile = "./test_files/cpuinfo";
EXPECT_FALSE(fileExists(cpuinfoFile));
{
std::ofstream cpuinfo(cpuinfoFile);
cpuinfo << "processor\t\t: 0\nflags\t\t: flag1 flag2 flag3\n";
}
EXPECT_TRUE(fileExists(cpuinfoFile));
CpuInfo testCpuInfo;
EXPECT_TRUE(testCpuInfo.isCpuFlagPresent("flag1"));
EXPECT_TRUE(testCpuInfo.isCpuFlagPresent("flag2"));
EXPECT_FALSE(testCpuInfo.isCpuFlagPresent("nonExistingCpuFlag"));
std::remove(cpuinfoFile.c_str());
}