Initial NEO enablement on architectures other than x86

Related-To: NEO-6011
Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com>
Artur Harasimiuk
2021-09-02 21:25:03 +00:00
committed by Compute-Runtime-Automation
parent f958b053ab
commit 895e9e5116
28 changed files with 8801 additions and 100 deletions

View File

@ -149,6 +149,20 @@ else()
set(NEO_ARCH "x86")
endif()
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(NEO_TARGET_PROCESSOR "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64")
set(NEO_TARGET_PROCESSOR "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(NEO_TARGET_PROCESSOR "aarch64")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/sse2neon)
endif()
message(STATUS "Target processor: ${NEO_TARGET_PROCESSOR}")
if(NOT DEFINED NEO_TARGET_PROCESSOR)
message(FATAL_ERROR "Unsupported target processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
if(NOT DEFINED BUILD_WITH_L0)
if("${NEO_BITS}" STREQUAL "64")
set(BUILD_WITH_L0 TRUE)
@ -847,6 +861,8 @@ else()
else()
message(WARNING "Spectre mitigation DISABLED")
endif()
check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
endif()
if(NOT MSVC)

View File

@ -485,7 +485,9 @@ if(BUILD_WITH_L0)
)
if(UNIX)
target_link_libraries(${TARGET_NAME_L0} ${GMM_LINK_NAME})
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
target_link_libraries(${TARGET_NAME_L0} ${GMM_LINK_NAME})
endif()
set_property(TARGET ${TARGET_NAME_L0}
APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/core/source/dll/linux/ze.exports"

View File

@ -20,7 +20,7 @@ components:
infra:
branch: master
dest_dir: infra
revision: a6b4272e6e2ebd1965b656a0d247038a1111cc58
revision: 6f8216baa8dbd1c185c7dcd5349a8aa7ae0e5591
type: git
internal:
branch: master

View File

@ -189,7 +189,9 @@ if(${GENERATE_EXECUTABLE})
${NEO_SHARED_DIRECTORY}/os_interface/windows/gmm_interface_win.cpp
)
else()
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${GMM_LINK_NAME})
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${GMM_LINK_NAME})
endif()
target_include_directories(${NEO_DYNAMIC_LIB_NAME} PRIVATE
${NEO_SHARED_DIRECTORY}/dll/devices${BRANCH_DIR_SUFFIX}
)

View File

@ -111,7 +111,7 @@ if(USE_ASAN)
set(GTEST_ENV "LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_SOURCE_DIR}/lsan_suppressions.txt")
endif()
if(NOT MSVC)
if(COMPILER_SUPPORTS_SSE42)
set_source_files_properties(helpers/uint16_sse4_tests.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif()

View File

@ -37,7 +37,6 @@ set(IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet_tests.h
${CMAKE_CURRENT_SOURCE_DIR}/transfer_properties_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ult_limits.h
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validator_tests.cpp
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/aligned_memory_tests.cpp
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/debug_manager_state_restore.h
@ -45,6 +44,12 @@ set(IGDRCL_SRCS_tests_helpers
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/unit_test_helper.inl
)
if(MSVC OR COMPILER_SUPPORTS_SSE42)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4_tests.cpp
)
endif()
if(TESTS_XEHP_AND_LATER)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/aub_helper_hw_tests_xehp_and_later.cpp

View File

@ -13,29 +13,29 @@
using namespace NEO;
TEST(Uint16Sse4, GivenMaskWhenCastingToBoolThenTrueIsReturned) {
EXPECT_TRUE(static_cast<bool>(uint16x8_t::mask()));
EXPECT_TRUE(static_cast<bool>(NEO::uint16x8_t::mask()));
}
TEST(Uint16Sse4, GivenZeroWhenCastingToBoolThenFalseIsReturned) {
EXPECT_FALSE(static_cast<bool>(uint16x8_t::zero()));
EXPECT_FALSE(static_cast<bool>(NEO::uint16x8_t::zero()));
}
TEST(Uint16Sse4, WhenConjoiningMaskAndZeroThenBooleanResultIsCorrect) {
EXPECT_TRUE(uint16x8_t::mask() && uint16x8_t::mask());
EXPECT_FALSE(uint16x8_t::mask() && uint16x8_t::zero());
EXPECT_FALSE(uint16x8_t::zero() && uint16x8_t::mask());
EXPECT_FALSE(uint16x8_t::zero() && uint16x8_t::zero());
EXPECT_TRUE(NEO::uint16x8_t::mask() && NEO::uint16x8_t::mask());
EXPECT_FALSE(NEO::uint16x8_t::mask() && NEO::uint16x8_t::zero());
EXPECT_FALSE(NEO::uint16x8_t::zero() && NEO::uint16x8_t::mask());
EXPECT_FALSE(NEO::uint16x8_t::zero() && NEO::uint16x8_t::zero());
}
TEST(Uint16Sse4, GivenOneWhenCreatingThenInstancesAreSame) {
auto one = uint16x8_t::one();
uint16x8_t alsoOne(one.value);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(uint16x8_t)));
auto one = NEO::uint16x8_t::one();
NEO::uint16x8_t alsoOne(one.value);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(NEO::uint16x8_t)));
}
TEST(Uint16Sse4, GivenValueWhenCreatingThenConstructorIsReplicated) {
uint16x8_t allSevens(7u);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
NEO::uint16x8_t allSevens(7u);
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(7u, allSevens.get(i));
}
}
@ -46,24 +46,24 @@ static const uint16_t laneValues[] = {
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
TEST(Uint16Sse4, GivenArrayWhenCreatingThenConstructorIsReplicated) {
uint16x8_t lanes(laneValues);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
NEO::uint16x8_t lanes(laneValues);
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}
TEST(Uint16Sse4, WhenLoadingThenValuesAreSetCorrectly) {
uint16x8_t lanes;
NEO::uint16x8_t lanes;
lanes.load(laneValues);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}
TEST(Uint16Sse4, WhenLoadingUnalignedThenValuesAreSetCorrectly) {
uint16x8_t lanes;
NEO::uint16x8_t lanes;
lanes.loadUnaligned(laneValues + 1);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), lanes.get(i));
}
}
@ -71,9 +71,9 @@ TEST(Uint16Sse4, WhenLoadingUnalignedThenValuesAreSetCorrectly) {
TEST(Uint16Sse4, WhenStoringThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));
uint16x8_t lanes(laneValues);
NEO::uint16x8_t lanes(laneValues);
lanes.store(alignedMemory);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), alignedMemory[i]);
}
@ -83,9 +83,9 @@ TEST(Uint16Sse4, WhenStoringThenValuesAreSetCorrectly) {
TEST(Uint16Sse4, WhenStoringUnalignedThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));
uint16x8_t lanes(laneValues);
NEO::uint16x8_t lanes(laneValues);
lanes.storeUnaligned(alignedMemory + 1);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), (alignedMemory + 1)[i]);
}
@ -93,39 +93,39 @@ TEST(Uint16Sse4, WhenStoringUnalignedThenValuesAreSetCorrectly) {
}
TEST(Uint16Sse4, WhenDecrementingThenValuesAreSetCorrectly) {
uint16x8_t result(laneValues);
result -= uint16x8_t::one();
NEO::uint16x8_t result(laneValues);
result -= NEO::uint16x8_t::one();
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i - 1), result.get(i));
}
}
TEST(Uint16Sse4, WhenIncrementingThenValuesAreSetCorrectly) {
uint16x8_t result(laneValues);
result += uint16x8_t::one();
NEO::uint16x8_t result(laneValues);
result += NEO::uint16x8_t::one();
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), result.get(i));
}
}
TEST(Uint16Sse4, WhenBlendingThenValuesAreSetCorrectly) {
uint16x8_t a(uint16x8_t::one());
uint16x8_t b(uint16x8_t::zero());
uint16x8_t c;
NEO::uint16x8_t a(NEO::uint16x8_t::one());
NEO::uint16x8_t b(NEO::uint16x8_t::zero());
NEO::uint16x8_t c;
// c = mask ? a : b
c = blend(a, b, uint16x8_t::mask());
c = blend(a, b, NEO::uint16x8_t::mask());
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(a.get(i), c.get(i));
}
// c = mask ? a : b
c = blend(a, b, uint16x8_t::zero());
c = blend(a, b, NEO::uint16x8_t::zero());
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(b.get(i), c.get(i));
}
}

View File

@ -65,6 +65,7 @@ exposing hardware capabilities to applications.
-DRELEASE_WITH_REGKEYS=TRUE \
-DL0_INSTALL_UDEV_RULES=1 \
-DUDEV_RULES_DIR=/etc/udev/rules.d/ \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-Wno-dev
%make_build

View File

@ -56,6 +56,7 @@ Summary: ocloc package for opencl
-DCMAKE_INSTALL_PREFIX=/usr \
-DSKIP_UNIT_TESTS=1 \
-DRELEASE_WITH_REGKEYS=1 \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-Wno-dev
%make_build

View File

@ -61,10 +61,14 @@ function(generate_shared_lib LIB_NAME MOCKABLE)
# Enable SSE4/AVX2 options for files that need them
if(MSVC)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/${NEO_TARGET_PROCESSOR}/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
else()
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
if(COMPILER_SUPPORTS_AVX2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/${NEO_TARGET_PROCESSOR}/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
endif()
if(COMPILER_SUPPORTS_SSE42)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif()
endif()
endfunction()

View File

@ -71,10 +71,9 @@ set(NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h
${CMAKE_CURRENT_SOURCE_DIR}/l3_range.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_avx2.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_special.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_sse4.cpp
${CMAKE_CURRENT_SOURCE_DIR}/non_copyable_or_moveable.h
${CMAKE_CURRENT_SOURCE_DIR}/options.h

View File

@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()

View File

@ -0,0 +1,46 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
namespace NEO {
struct uint16x8_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 8) {
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else {
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
}
}
} // namespace NEO

View File

@ -5,60 +5,12 @@
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/helpers/ptr_math.h"
#include <array>
namespace NEO {
struct uint16x8_t;
struct uint16x16_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsAVX2 = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureAvX2);
if (supportsAVX2) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}
LocalIDHelper LocalIDHelper::initializer;
//traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 8) {
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else {
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
}
}
bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
uint8_t xMask = simd == 8u ? 0b1 : 0b11;
uint8_t yMask = 0b11;

View File

@ -10,7 +10,11 @@
#include "shared/source/helpers/debug_helpers.h"
#include <cstdint>
#if defined(__ARM_ARCH)
#include <sse2neon.h>
#else
#include <immintrin.h>
#endif
namespace NEO {

View File

@ -0,0 +1,15 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_avx2.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()

View File

@ -0,0 +1,63 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
#include "shared/source/utilities/cpu_info.h"
#include <array>
namespace NEO {
struct uint16x8_t;
struct uint16x16_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsAVX2 = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureAvX2);
if (supportsAVX2) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}
LocalIDHelper LocalIDHelper::initializer;
//traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 8) {
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else {
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
}
}
} // namespace NEO

View File

@ -49,12 +49,7 @@ set(NEO_CORE_UTILITIES_WINDOWS
${CMAKE_CURRENT_SOURCE_DIR}/windows/timer_util.cpp
)
set(NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/linux/cpu_info.cpp
${CMAKE_CURRENT_SOURCE_DIR}/linux/directory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/linux/timer_util.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES ${NEO_CORE_UTILITIES})
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_WINDOWS ${NEO_CORE_UTILITIES_WINDOWS})
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
add_subdirectories()

View File

@ -7,7 +7,11 @@
#include "shared/source/utilities/cpuintrinsics.h"
#if defined(__ARM_ARCH)
#include <sse2neon.h>
#else
#include <emmintrin.h>
#endif
namespace NEO {
namespace CpuIntrinsics {

View File

@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
set(NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/directory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/timer_util.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
add_subdirectories()

View File

@ -0,0 +1,13 @@
#
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
list(APPEND NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
endif()

View File

@ -0,0 +1,55 @@
/*
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/os_interface/linux/os_inc.h"
#include <cstdint>
#include <fstream>
namespace NEO {
void cpuid_linux_wrapper(int cpuInfo[4], int functionId) {
// TODO: need aarch64 implementation
}
void cpuidex_linux_wrapper(int *cpuInfo, int functionId, int subfunctionId) {
// TODO: need aarch64 implementation
}
void get_cpu_flags_linux(std::string &cpuFlags) {
std::ifstream cpuinfo(std::string(Os::sysFsProcPathPrefix) + "/cpuinfo");
std::string line;
while (std::getline(cpuinfo, line)) {
if (line.substr(0, 5) == "flags") {
cpuFlags = line;
break;
}
}
}
void (*CpuInfo::cpuidexFunc)(int *, int, int) = cpuidex_linux_wrapper;
void (*CpuInfo::cpuidFunc)(int[4], int) = cpuid_linux_wrapper;
void (*CpuInfo::getCpuFlagsFunc)(std::string &) = get_cpu_flags_linux;
const CpuInfo CpuInfo::instance;
void CpuInfo::cpuid(
uint32_t cpuInfo[4],
uint32_t functionId) const {
cpuidFunc(reinterpret_cast<int *>(cpuInfo), functionId);
}
void CpuInfo::cpuidex(
uint32_t cpuInfo[4],
uint32_t functionId,
uint32_t subfunctionId) const {
cpuidexFunc(reinterpret_cast<int *>(cpuInfo), functionId, subfunctionId);
}
} // namespace NEO

View File

@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
list(APPEND NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
endif()

19
third_party/sse2neon/LICENSE vendored Normal file
View File

@ -0,0 +1,19 @@
MIT License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

154
third_party/sse2neon/README.md vendored Normal file
View File

@ -0,0 +1,154 @@
# sse2neon
![Github Actions](https://github.com/DLTcollab/sse2neon/workflows/Github%20Actions/badge.svg?branch=master)
A C/C++ header file that converts Intel SSE intrinsics to Arm/Aarch64 NEON intrinsics.
## Introduction
`sse2neon` is a translator of Intel SSE (Streaming SIMD Extensions) intrinsics
to [Arm NEON](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon),
shortening the time needed to get a working Arm program that can then be used to
extract profiles and to identify hot paths in the code.
The header file `sse2neon.h` contains several of the functions provided by Intel
intrinsic headers such as `<xmmintrin.h>`, implemented instead with NEON-based counterparts
that reproduce the exact semantics of the intrinsics.
## Mapping and Coverage
Header file | Extension |
---|---|
`<mmintrin.h>` | MMX |
`<xmmintrin.h>` | SSE |
`<emmintrin.h>` | SSE2 |
`<pmmintrin.h>` | SSE3 |
`<tmmintrin.h>` | SSSE3 |
`<smmintrin.h>` | SSE4.1 |
`<nmmintrin.h>` | SSE4.2 |
`<wmmintrin.h>` | AES |
`sse2neon` aims to support the SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AES extensions.
To deliver NEON-equivalent intrinsics for all widely used SSE intrinsics, be aware that some SSE intrinsics have a direct mapping to a concrete NEON intrinsic, while others lack a 1-to-1 mapping and their equivalents are built from several NEON intrinsics.
For example, the SSE intrinsic `_mm_loadu_si128` has a direct NEON mapping (`vld1q_s32`), but the SSE intrinsic `_mm_maddubs_epi16` has to be implemented with 13+ NEON instructions.
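As a rough illustration of that difference (the helper function below and the assumption of 16-byte input buffers are only an example, not upstream API), both intrinsics compile through `sse2neon.h` on Arm, but the loads map to single NEON instructions while `_mm_maddubs_epi16` expands to a longer NEON sequence:
```C
/* Example only: on x86 this would include <tmmintrin.h> (SSSE3) instead. */
#include "sse2neon.h"
#include <stdint.h>

/* a and b must each point to at least 16 bytes. */
int16_t madd_low_lane(const uint8_t *a, const int8_t *b) {
    __m128i va = _mm_loadu_si128((const __m128i *)a); /* direct mapping: vld1q_s32 */
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    __m128i prod = _mm_maddubs_epi16(va, vb);          /* emulated with several NEON ops */
    return (int16_t)_mm_extract_epi16(prod, 0);        /* SSE2, also provided by sse2neon */
}
```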
## Usage
- Put the file `sse2neon.h` into your source code directory.
- Locate the following SSE header files included in the code:
```C
#include <xmmintrin.h>
#include <emmintrin.h>
```
`{p,t,s,n,w}mmintrin.h` should also be replaceable, although the coverage of those extensions may be more limited.
- Replace them with the following include (a guarded variant that keeps the native x86 headers is sketched after this list):
```C
#include "sse2neon.h"
```
- Explicitly specify platform-specific options to gcc/clang compilers.
* On ARMv8-A targets, specify the following compiler option (remove `crypto` and/or `crc` if your architecture does not support the cryptographic and/or CRC32 extensions):
```shell
-march=armv8-a+fp+simd+crypto+crc
```
* On ARMv7-A targets, you need to append the following compiler option:
```shell
-mfpu=neon
```
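As referenced above, a minimal sketch of the guarded variant, assuming the code should keep the native Intel headers on x86 builds (this mirrors the conditional includes added to the NEO sources in this commit):
```C
/* Keep the native header on x86 and fall back to sse2neon on Arm builds. */
#if defined(__ARM_ARCH)
#include "sse2neon.h"
#else
#include <emmintrin.h> /* or whichever *mmintrin.h the code originally used */
#endif
```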
## Compile-time Configurations
Considering the balance between correctness and performance, `sse2neon` recognizes the following compile-time configurations:
* `SSE2NEON_PRECISE_MINMAX`: Enable precise implementation of `_mm_min_ps` and `_mm_max_ps`. If you need consistent results such as NaN special cases, enable it.
* `SSE2NEON_PRECISE_DIV`: Enable precise implementation of `_mm_rcp_ps` and `_mm_div_ps` by an additional Newton-Raphson iteration for accuracy.
* `SSE2NEON_PRECISE_SQRT`: Enable precise implementation of `_mm_sqrt_ps` and `_mm_rsqrt_ps` by an additional Newton-Raphson iteration for accuracy.
The above are turned off by default, and you should define the corresponding macro(s) as `1` before including `sse2neon.h` if you need the precise implementations.
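A short sketch of that opt-in, assuming a translation unit that wants the first two precise modes; the `safe_min` helper is illustrative only:
```C
/* The macros must be defined to 1 before sse2neon.h is included. */
#define SSE2NEON_PRECISE_MINMAX 1 /* NaN-consistent _mm_min_ps/_mm_max_ps */
#define SSE2NEON_PRECISE_DIV 1    /* extra Newton-Raphson step in _mm_rcp_ps/_mm_div_ps */
#include "sse2neon.h"

__m128 safe_min(__m128 a, __m128 b) {
    return _mm_min_ps(a, b); /* now matches SSE semantics for NaN inputs */
}
```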
## Run Built-in Test Suite
`sse2neon` provides a unified interface for developing test cases. These test
cases are located in the `tests` directory, and the input data is specified at
runtime. Use the following commands to run the test cases:
```shell
$ make check
```
You can specify a GNU toolchain for cross-compilation as well.
[QEMU](https://www.qemu.org/) should be installed in advance.
```shell
$ make CROSS_COMPILE=aarch64-linux-gnu- check # ARMv8-A
```
or
```shell
$ make CROSS_COMPILE=arm-linux-gnueabihf- check # ARMv7-A
```
Check the details via [Test Suite for SSE2NEON](tests/README.md).
## Adoptions
Here is a partial list of open source projects that have adopted `sse2neon` for Arm/Aarch64 support.
* [aether-game-utils](https://github.com/johnhues/aether-game-utils) is a collection of cross platform utilities for quickly creating small game prototypes in C++.
* [Apache Impala](https://impala.apache.org/) is a lightning-fast, distributed SQL query engine for petabytes of data stored in Apache Hadoop clusters.
* [Apache Kudu](https://kudu.apache.org/) completes Hadoop's storage layer to enable fast analytics on fast data.
* [ART](https://github.com/dinosaure/art) is an implementation in OCaml of [Adaptive Radix Tree](https://db.in.tum.de/~leis/papers/ART.pdf) (ART).
* [Async](https://github.com/romange/async) is a set of C++ primitives that allows efficient and rapid development in C++17 on GNU/Linux systems.
* [Blender](https://www.blender.org/) is the free and open source 3D creation suite, supporting the entirety of the 3D pipeline.
* [Boo](https://github.com/AxioDL/boo) is a cross-platform windowing and event manager similar to SDL or SFML, with additional 3D rendering functionality.
* [CARTA](https://github.com/CARTAvis/carta-backend) is a new visualization tool designed for viewing radio astronomy images in CASA, FITS, MIRIAD, and HDF5 formats (using the IDIA custom schema for HDF5).
* [Catcoon](https://github.com/i-evi/catcoon) is a [feedforward neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network) implementation in C.
* [dab-cmdline](https://github.com/JvanKatwijk/dab-cmdline) provides entries for the functionality to handle Digital audio broadcasting (DAB)/DAB+ through some simple calls.
* [EDGE](https://github.com/3dfxdev/EDGE) is an advanced OpenGL source port spawned from the DOOM engine, with focus on easy development and expansion for modders and end-users.
* [Embree](https://github.com/embree/embree) a collection of high-performance ray tracing kernels. Its target users are graphics application engineers who want to improve the performance of their photo-realistic rendering application by leveraging Embree's performance-optimized ray tracing kernels.
* [emp-tool](https://github.com/emp-toolkit/emp-tool) aims to provide a benchmark for secure computation and allowing other researchers to experiment and extend.
* [FoundationDB](https://www.foundationdb.org) is a distributed database designed to handle large volumes of structured data across clusters of commodity servers.
* [iqtree_arm_neon](https://github.com/joshlvmh/iqtree_arm_neon) is the Arm NEON port of [IQ-TREE](http://www.iqtree.org/), fast and effective stochastic algorithm to infer phylogenetic trees by maximum likelihood.
* [kram](https://github.com/alecazam/kram) is a wrapper to several popular encoders to and from PNG/[KTX](https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/) files with [LDR/HDR and BC/ASTC/ETC2](https://developer.arm.com/solutions/graphics-and-gaming/developer-guides/learn-the-basics/adaptive-scalable-texture-compression/single-page).
* [libscapi](https://github.com/cryptobiu/libscapi) stands for the "Secure Computation API", providing reliable, efficient, and highly flexible cryptographic infrastructure.
* [libmatoya](https://github.com/matoya/libmatoya) is a cross-platform application development library, providing various features such as common cryptography tasks.
* [Madronalib](https://github.com/madronalabs/madronalib) enables efficient audio DSP on SIMD processors with readable and brief C++ code.
* [minimap2](https://github.com/lh3/minimap2) is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database.
* [MMseqs2](https://github.com/soedinglab/MMseqs2) (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
* [MRIcroGL](https://github.com/rordenlab/MRIcroGL) is a cross-platform tool for viewing NIfTI, DICOM, MGH, MHD, NRRD, AFNI format medical images.
* [N2](https://github.com/oddconcepts/n2o) is an approximate nearest neighbor algorithm library written in C++, providing much faster search speed than other implementations when modeling large datasets.
* [niimath](https://github.com/rordenlab/niimath) is a general image calculator with superior performance.
* [OBS Studio](https://github.com/obsproject/obs-studio) is software designed for capturing, compositing, encoding, recording, and streaming video content efficiently.
* [OGRE](https://github.com/OGRECave/ogre) is a scene-oriented, flexible 3D engine written in C++ designed to make it easier and more intuitive for developers to produce games and demos utilising 3D hardware.
* [OpenXRay](https://github.com/OpenXRay/xray-16) is an improved version of the X-Ray engine, used in world famous S.T.A.L.K.E.R. game series by GSC Game World.
* [parallel-n64](https://github.com/libretro/parallel-n64) is an optimized/rewritten Nintendo 64 emulator made specifically for [Libretro](https://www.libretro.com/).
* [PFFFT](https://github.com/marton78/pffft) does 1D Fast Fourier Transforms, of single precision real and complex vectors.
* [PlutoSDR Firmware](https://github.com/seanstone/plutosdr-fw) is the customized firmware for the [PlutoSDR](https://wiki.analog.com/university/tools/pluto) that can be used to introduce fundamentals of Software Defined Radio (SDR), Radio Frequency (RF), or Communications as advanced topics in electrical engineering in a self-paced or instructor-led setting.
* [Pygame](https://www.pygame.org) is cross-platform and designed to make it easy to write multimedia software, such as games, in Python.
* [simd_utils](https://github.com/JishinMaster/simd_utils) is a header-only library implementing common mathematical functions using SIMD intrinsics.
* [SMhasher](https://github.com/rurban/smhasher) provides comprehensive Hash function quality and speed tests.
* [Spack](https://github.com/spack/spack) is a multi-platform package manager that builds and installs multiple versions and configurations of software.
* [srsLTE](https://github.com/srsLTE/srsLTE) is an open source SDR LTE software suite.
* [Surge](https://github.com/surge-synthesizer/surge) is an open source digital synthesizer.
* [XMRig](https://github.com/xmrig/xmrig) is an open source CPU miner for [Monero](https://web.getmonero.org/) cryptocurrency.
## Related Projects
* [SIMDe](https://github.com/simd-everywhere/simde): fast and portable implementations of SIMD
intrinsics on hardware which doesn't natively support them, such as calling SSE functions on ARM.
* [CatBoost's sse2neon](https://github.com/catboost/catboost/blob/master/library/cpp/sse/sse2neon.h)
* [ARM\_NEON\_2\_x86\_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE)
* [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon)
* [POWER/PowerPC support for GCC](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000) contains a series of headers that simplify porting x86_64 code making explicit use of Intel intrinsics to powerpc64le (the pure little-endian mode introduced with [POWER8](https://en.wikipedia.org/wiki/POWER8)).
- implementation: [xmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/xmmintrin.h), [emmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/emmintrin.h), [pmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/pmmintrin.h), [tmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/tmmintrin.h), [smmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/smmintrin.h)
## Reference
* [Intel Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/)
* [Arm Neon Intrinsics Reference](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
* [Neon Programmer's Guide for Armv8-A](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a)
* [NEON Programmer's Guide](https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf)
* [qemu/target/i386/ops_sse.h](https://github.com/qemu/qemu/blob/master/target/i386/ops_sse.h): Comprehensive SSE instruction emulation in C. Ideal for semantic checks.
## Licensing
`sse2neon` is freely redistributable under the MIT License.

8309
third_party/sse2neon/sse2neon.h vendored Normal file

File diff suppressed because it is too large.