Initial NEO enablement on architectures other than x86

Related-To: NEO-6011
Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com>
Artur Harasimiuk
2021-09-02 21:25:03 +00:00
committed by Compute-Runtime-Automation
parent f958b053ab
commit 895e9e5116
28 changed files with 8801 additions and 100 deletions

View File

@ -149,6 +149,20 @@ else()
set(NEO_ARCH "x86")
endif()
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(NEO_TARGET_PROCESSOR "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64")
set(NEO_TARGET_PROCESSOR "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(NEO_TARGET_PROCESSOR "aarch64")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/sse2neon)
endif()
message(STATUS "Target processor: ${NEO_TARGET_PROCESSOR}")
if(NOT DEFINED NEO_TARGET_PROCESSOR)
message(FATAL_ERROR "Unsupported target processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
if(NOT DEFINED BUILD_WITH_L0)
if("${NEO_BITS}" STREQUAL "64")
set(BUILD_WITH_L0 TRUE)
@ -847,6 +861,8 @@ else()
else()
message(WARNING "Spectre mitigation DISABLED")
endif()
check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
endif()
if(NOT MSVC)

View File

@ -485,7 +485,9 @@ if(BUILD_WITH_L0)
)
if(UNIX)
target_link_libraries(${TARGET_NAME_L0} ${GMM_LINK_NAME})
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
target_link_libraries(${TARGET_NAME_L0} ${GMM_LINK_NAME})
endif()
set_property(TARGET ${TARGET_NAME_L0}
APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/core/source/dll/linux/ze.exports"

View File

@ -20,7 +20,7 @@ components:
infra:
branch: master
dest_dir: infra
revision: a6b4272e6e2ebd1965b656a0d247038a1111cc58
revision: 6f8216baa8dbd1c185c7dcd5349a8aa7ae0e5591
type: git
internal:
branch: master

View File

@ -189,7 +189,9 @@ if(${GENERATE_EXECUTABLE})
${NEO_SHARED_DIRECTORY}/os_interface/windows/gmm_interface_win.cpp
)
else()
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${GMM_LINK_NAME})
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${GMM_LINK_NAME})
endif()
target_include_directories(${NEO_DYNAMIC_LIB_NAME} PRIVATE
${NEO_SHARED_DIRECTORY}/dll/devices${BRANCH_DIR_SUFFIX}
)

View File

@ -111,7 +111,7 @@ if(USE_ASAN)
set(GTEST_ENV "LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_SOURCE_DIR}/lsan_suppressions.txt")
endif()
if(NOT MSVC)
if(COMPILER_SUPPORTS_SSE42)
set_source_files_properties(helpers/uint16_sse4_tests.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif()

View File

@ -37,7 +37,6 @@ set(IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet_tests.h
${CMAKE_CURRENT_SOURCE_DIR}/transfer_properties_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ult_limits.h
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validator_tests.cpp
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/aligned_memory_tests.cpp
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/debug_manager_state_restore.h
@ -45,6 +44,12 @@ set(IGDRCL_SRCS_tests_helpers
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/unit_test_helper.inl
)
if(MSVC OR COMPILER_SUPPORTS_SSE42)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4_tests.cpp
)
endif()
if(TESTS_XEHP_AND_LATER)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/aub_helper_hw_tests_xehp_and_later.cpp

View File

@ -13,29 +13,29 @@
using namespace NEO;
TEST(Uint16Sse4, GivenMaskWhenCastingToBoolThenTrueIsReturned) {
EXPECT_TRUE(static_cast<bool>(uint16x8_t::mask()));
EXPECT_TRUE(static_cast<bool>(NEO::uint16x8_t::mask()));
}
TEST(Uint16Sse4, GivenZeroWhenCastingToBoolThenFalseIsReturned) {
EXPECT_FALSE(static_cast<bool>(uint16x8_t::zero()));
EXPECT_FALSE(static_cast<bool>(NEO::uint16x8_t::zero()));
}
TEST(Uint16Sse4, WhenConjoiningMaskAndZeroThenBooleanResultIsCorrect) {
EXPECT_TRUE(uint16x8_t::mask() && uint16x8_t::mask());
EXPECT_FALSE(uint16x8_t::mask() && uint16x8_t::zero());
EXPECT_FALSE(uint16x8_t::zero() && uint16x8_t::mask());
EXPECT_FALSE(uint16x8_t::zero() && uint16x8_t::zero());
EXPECT_TRUE(NEO::uint16x8_t::mask() && NEO::uint16x8_t::mask());
EXPECT_FALSE(NEO::uint16x8_t::mask() && NEO::uint16x8_t::zero());
EXPECT_FALSE(NEO::uint16x8_t::zero() && NEO::uint16x8_t::mask());
EXPECT_FALSE(NEO::uint16x8_t::zero() && NEO::uint16x8_t::zero());
}
TEST(Uint16Sse4, GivenOneWhenCreatingThenInstancesAreSame) {
auto one = uint16x8_t::one();
uint16x8_t alsoOne(one.value);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(uint16x8_t)));
auto one = NEO::uint16x8_t::one();
NEO::uint16x8_t alsoOne(one.value);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(NEO::uint16x8_t)));
}
TEST(Uint16Sse4, GivenValueWhenCreatingThenConstructorIsReplicated) {
uint16x8_t allSevens(7u);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
NEO::uint16x8_t allSevens(7u);
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(7u, allSevens.get(i));
}
}
@ -46,24 +46,24 @@ static const uint16_t laneValues[] = {
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
TEST(Uint16Sse4, GivenArrayWhenCreatingThenConstructorIsReplicated) {
uint16x8_t lanes(laneValues);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
NEO::uint16x8_t lanes(laneValues);
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}
TEST(Uint16Sse4, WhenLoadingThenValuesAreSetCorrectly) {
uint16x8_t lanes;
NEO::uint16x8_t lanes;
lanes.load(laneValues);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}
TEST(Uint16Sse4, WhenLoadingUnalignedThenValuesAreSetCorrectly) {
uint16x8_t lanes;
NEO::uint16x8_t lanes;
lanes.loadUnaligned(laneValues + 1);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), lanes.get(i));
}
}
@ -71,9 +71,9 @@ TEST(Uint16Sse4, WhenLoadingUnalignedThenValuesAreSetCorrectly) {
TEST(Uint16Sse4, WhenStoringThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));
uint16x8_t lanes(laneValues);
NEO::uint16x8_t lanes(laneValues);
lanes.store(alignedMemory);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), alignedMemory[i]);
}
@ -83,9 +83,9 @@ TEST(Uint16Sse4, WhenStoringThenValuesAreSetCorrectly) {
TEST(Uint16Sse4, WhenStoringUnalignedThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));
uint16x8_t lanes(laneValues);
NEO::uint16x8_t lanes(laneValues);
lanes.storeUnaligned(alignedMemory + 1);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), (alignedMemory + 1)[i]);
}
@ -93,39 +93,39 @@ TEST(Uint16Sse4, WhenStoringUnalignedThenValuesAreSetCorrectly) {
}
TEST(Uint16Sse4, WhenDecrementingThenValuesAreSetCorrectly) {
uint16x8_t result(laneValues);
result -= uint16x8_t::one();
NEO::uint16x8_t result(laneValues);
result -= NEO::uint16x8_t::one();
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i - 1), result.get(i));
}
}
TEST(Uint16Sse4, WhenIncrementingThenValuesAreSetCorrectly) {
uint16x8_t result(laneValues);
result += uint16x8_t::one();
NEO::uint16x8_t result(laneValues);
result += NEO::uint16x8_t::one();
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), result.get(i));
}
}
TEST(Uint16Sse4, WhenBlendingThenValuesAreSetCorrectly) {
uint16x8_t a(uint16x8_t::one());
uint16x8_t b(uint16x8_t::zero());
uint16x8_t c;
NEO::uint16x8_t a(NEO::uint16x8_t::one());
NEO::uint16x8_t b(NEO::uint16x8_t::zero());
NEO::uint16x8_t c;
// c = mask ? a : b
c = blend(a, b, uint16x8_t::mask());
c = blend(a, b, NEO::uint16x8_t::mask());
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(a.get(i), c.get(i));
}
// c = mask ? a : b
c = blend(a, b, uint16x8_t::zero());
c = blend(a, b, NEO::uint16x8_t::zero());
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(b.get(i), c.get(i));
}
}

View File

@ -65,6 +65,7 @@ exposing hardware capabilities to applications.
-DRELEASE_WITH_REGKEYS=TRUE \
-DL0_INSTALL_UDEV_RULES=1 \
-DUDEV_RULES_DIR=/etc/udev/rules.d/ \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-Wno-dev
%make_build

View File

@ -56,6 +56,7 @@ Summary: ocloc package for opencl
-DCMAKE_INSTALL_PREFIX=/usr \
-DSKIP_UNIT_TESTS=1 \
-DRELEASE_WITH_REGKEYS=1 \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-Wno-dev
%make_build

View File

@ -61,10 +61,14 @@ function(generate_shared_lib LIB_NAME MOCKABLE)
# Enable SSE4/AVX2 options for files that need them
if(MSVC)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/${NEO_TARGET_PROCESSOR}/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
else()
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
if(COMPILER_SUPPORTS_AVX2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/${NEO_TARGET_PROCESSOR}/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
endif()
if(COMPILER_SUPPORTS_SSE42)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif()
endif()
endfunction()

View File

@ -71,10 +71,9 @@ set(NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h
${CMAKE_CURRENT_SOURCE_DIR}/l3_range.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_avx2.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_special.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_sse4.cpp
${CMAKE_CURRENT_SOURCE_DIR}/non_copyable_or_moveable.h
${CMAKE_CURRENT_SOURCE_DIR}/options.h

View File

@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()

View File

@ -0,0 +1,46 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
namespace NEO {
struct uint16x8_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 8) {
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else {
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
}
}
} // namespace NEO

View File

@ -5,60 +5,12 @@
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/helpers/ptr_math.h"
#include <array>
namespace NEO {
struct uint16x8_t;
struct uint16x16_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsAVX2 = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureAvX2);
if (supportsAVX2) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}
LocalIDHelper LocalIDHelper::initializer;
//traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 8) {
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else {
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
}
}
bool isCompatibleWithLayoutForImages(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
uint8_t xMask = simd == 8u ? 0b1 : 0b11;
uint8_t yMask = 0b11;

View File

@ -10,7 +10,11 @@
#include "shared/source/helpers/debug_helpers.h"
#include <cstdint>
#if defined(__ARM_ARCH)
#include <sse2neon.h>
#else
#include <immintrin.h>
#endif
namespace NEO {

View File

@ -0,0 +1,15 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_avx2.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()

View File

@ -0,0 +1,63 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
#include "shared/source/utilities/cpu_info.h"
#include <array>
namespace NEO {
struct uint16x8_t;
struct uint16x16_t;
// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
// Must be 32byte aligned for AVX2 usage
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD of the kernel
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsAVX2 = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureAvX2);
if (supportsAVX2) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}
LocalIDHelper LocalIDHelper::initializer;
//traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else if (simd == 8) {
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, grfSize != 32);
} else {
generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
}
}
} // namespace NEO

View File

@ -49,12 +49,7 @@ set(NEO_CORE_UTILITIES_WINDOWS
${CMAKE_CURRENT_SOURCE_DIR}/windows/timer_util.cpp
)
set(NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/linux/cpu_info.cpp
${CMAKE_CURRENT_SOURCE_DIR}/linux/directory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/linux/timer_util.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES ${NEO_CORE_UTILITIES})
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_WINDOWS ${NEO_CORE_UTILITIES_WINDOWS})
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
add_subdirectories()

View File

@ -7,7 +7,11 @@
#include "shared/source/utilities/cpuintrinsics.h"
#if defined(__ARM_ARCH)
#include <sse2neon.h>
#else
#include <emmintrin.h>
#endif
namespace NEO {
namespace CpuIntrinsics {

View File

@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
set(NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/directory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/timer_util.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
add_subdirectories()

View File

@ -0,0 +1,13 @@
#
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
list(APPEND NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
endif()

View File

@ -0,0 +1,55 @@
/*
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/os_interface/linux/os_inc.h"
#include <cstdint>
#include <fstream>
namespace NEO {
void cpuid_linux_wrapper(int cpuInfo[4], int functionId) {
// TODO: need aarch64 implementation
}
void cpuidex_linux_wrapper(int *cpuInfo, int functionId, int subfunctionId) {
// TODO: need aarch64 implementation
}
void get_cpu_flags_linux(std::string &cpuFlags) {
std::ifstream cpuinfo(std::string(Os::sysFsProcPathPrefix) + "/cpuinfo");
std::string line;
while (std::getline(cpuinfo, line)) {
if (line.substr(0, 5) == "flags") {
cpuFlags = line;
break;
}
}
}
void (*CpuInfo::cpuidexFunc)(int *, int, int) = cpuidex_linux_wrapper;
void (*CpuInfo::cpuidFunc)(int[4], int) = cpuid_linux_wrapper;
void (*CpuInfo::getCpuFlagsFunc)(std::string &) = get_cpu_flags_linux;
const CpuInfo CpuInfo::instance;
void CpuInfo::cpuid(
uint32_t cpuInfo[4],
uint32_t functionId) const {
cpuidFunc(reinterpret_cast<int *>(cpuInfo), functionId);
}
void CpuInfo::cpuidex(
uint32_t cpuInfo[4],
uint32_t functionId,
uint32_t subfunctionId) const {
cpuidexFunc(reinterpret_cast<int *>(cpuInfo), functionId, subfunctionId);
}
} // namespace NEO

View File

@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
if(${NEO_TARGET_PROCESSOR} STREQUAL "x86_64")
list(APPEND NEO_CORE_UTILITIES_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.cpp
)
set_property(GLOBAL PROPERTY NEO_CORE_UTILITIES_LINUX ${NEO_CORE_UTILITIES_LINUX})
endif()

19
third_party/sse2neon/LICENSE vendored Normal file
View File

@ -0,0 +1,19 @@
MIT License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

154
third_party/sse2neon/README.md vendored Normal file
View File

@ -0,0 +1,154 @@
# sse2neon
![Github Actions](https://github.com/DLTcollab/sse2neon/workflows/Github%20Actions/badge.svg?branch=master)
A C/C++ header file that converts Intel SSE intrinsics to Arm/Aarch64 NEON intrinsics.
## Introduction
`sse2neon` is a translator of Intel SSE (Streaming SIMD Extensions) intrinsics
to [Arm NEON](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon),
shortening the time needed to get a working Arm program that can then be used to
extract profiles and to identify hot paths in the code.
The header file `sse2neon.h` contains several of the functions provided by Intel
intrinsic headers such as `<xmmintrin.h>`, implemented instead with NEON-based counterparts
that reproduce the exact semantics of the intrinsics.
## Mapping and Coverage
Header file | Extension |
---|---|
`<mmintrin.h>` | MMX |
`<xmmintrin.h>` | SSE |
`<emmintrin.h>` | SSE2 |
`<pmmintrin.h>` | SSE3 |
`<tmmintrin.h>` | SSSE3 |
`<smmintrin.h>` | SSE4.1 |
`<nmmintrin.h>` | SSE4.2 |
`<wmmintrin.h>` | AES |
`sse2neon` aims to support the SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AES extensions.
To deliver NEON-equivalent intrinsics for all widely used SSE intrinsics, be aware that some SSE intrinsics have a direct mapping to a concrete NEON intrinsic, while others lack a 1-to-1 mapping and their equivalents are built from several NEON intrinsics.
For example, the SSE intrinsic `_mm_loadu_si128` has a direct NEON mapping (`vld1q_s32`), but the SSE intrinsic `_mm_maddubs_epi16` has to be implemented with 13+ NEON instructions.
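As a rough illustration of that difference (the helper function below and the assumption of 16-byte input buffers are only an example, not upstream API), both intrinsics compile through `sse2neon.h` on Arm, but the loads map to single NEON instructions while `_mm_maddubs_epi16` expands to a longer NEON sequence:
```C
/* Example only: on x86 this would include <tmmintrin.h> (SSSE3) instead. */
#include "sse2neon.h"
#include <stdint.h>

/* a and b must each point to at least 16 bytes. */
int16_t madd_low_lane(const uint8_t *a, const int8_t *b) {
    __m128i va = _mm_loadu_si128((const __m128i *)a); /* direct mapping: vld1q_s32 */
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    __m128i prod = _mm_maddubs_epi16(va, vb);          /* emulated with several NEON ops */
    return (int16_t)_mm_extract_epi16(prod, 0);        /* SSE2, also provided by sse2neon */
}
```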
## Usage
- Put the file `sse2neon.h` into your source code directory.
- Locate the following SSE header files included in the code:
```C
#include <xmmintrin.h>
#include <emmintrin.h>
```
`{p,t,s,n,w}mmintrin.h` should also be replaceable, although the coverage of those extensions may be more limited.
- Replace them with the following include (a guarded variant that keeps the native x86 headers is sketched after this list):
```C
#include "sse2neon.h"
```
- Explicitly specify platform-specific options to gcc/clang compilers.
* On ARMv8-A targets, specify the following compiler option (remove `crypto` and/or `crc` if your architecture does not support the cryptographic and/or CRC32 extensions):
```shell
-march=armv8-a+fp+simd+crypto+crc
```
* On ARMv7-A targets, you need to append the following compiler option:
```shell
-mfpu=neon
```
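As referenced above, a minimal sketch of the guarded variant, assuming the code should keep the native Intel headers on x86 builds (this mirrors the conditional includes added to the NEO sources in this commit):
```C
/* Keep the native header on x86 and fall back to sse2neon on Arm builds. */
#if defined(__ARM_ARCH)
#include "sse2neon.h"
#else
#include <emmintrin.h> /* or whichever *mmintrin.h the code originally used */
#endif
```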
## Compile-time Configurations
Considering the balance between correctness and performance, `sse2neon` recognizes the following compile-time configurations:
* `SSE2NEON_PRECISE_MINMAX`: Enable precise implementation of `_mm_min_ps` and `_mm_max_ps`. If you need consistent results such as NaN special cases, enable it.
* `SSE2NEON_PRECISE_DIV`: Enable precise implementation of `_mm_rcp_ps` and `_mm_div_ps` by an additional Newton-Raphson iteration for accuracy.
* `SSE2NEON_PRECISE_SQRT`: Enable precise implementation of `_mm_sqrt_ps` and `_mm_rsqrt_ps` by an additional Newton-Raphson iteration for accuracy.
The above are turned off by default, and you should define the corresponding macro(s) as `1` before including `sse2neon.h` if you need the precise implementations.
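A short sketch of that opt-in, assuming a translation unit that wants the first two precise modes; the `safe_min` helper is illustrative only:
```C
/* The macros must be defined to 1 before sse2neon.h is included. */
#define SSE2NEON_PRECISE_MINMAX 1 /* NaN-consistent _mm_min_ps/_mm_max_ps */
#define SSE2NEON_PRECISE_DIV 1    /* extra Newton-Raphson step in _mm_rcp_ps/_mm_div_ps */
#include "sse2neon.h"

__m128 safe_min(__m128 a, __m128 b) {
    return _mm_min_ps(a, b); /* now matches SSE semantics for NaN inputs */
}
```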
## Run Built-in Test Suite
`sse2neon` provides a unified interface for developing test cases. These test
cases are located in the `tests` directory, and the input data is specified at
runtime. Use the following commands to run the test cases:
```shell
$ make check
```
You can specify a GNU toolchain for cross-compilation as well.
[QEMU](https://www.qemu.org/) should be installed in advance.
```shell
$ make CROSS_COMPILE=aarch64-linux-gnu- check # ARMv8-A
```
or
```shell
$ make CROSS_COMPILE=arm-linux-gnueabihf- check # ARMv7-A
```
Check the details via [Test Suite for SSE2NEON](tests/README.md).
## Adoptions
Here is a partial list of open source projects that have adopted `sse2neon` for Arm/Aarch64 support.
* [aether-game-utils](https://github.com/johnhues/aether-game-utils) is a collection of cross platform utilities for quickly creating small game prototypes in C++.
* [Apache Impala](https://impala.apache.org/) is a lightning-fast, distributed SQL query engine for petabytes of data stored in Apache Hadoop clusters.
* [Apache Kudu](https://kudu.apache.org/) completes Hadoop's storage layer to enable fast analytics on fast data.
* [ART](https://github.com/dinosaure/art) is an implementation in OCaml of [Adaptive Radix Tree](https://db.in.tum.de/~leis/papers/ART.pdf) (ART).
* [Async](https://github.com/romange/async) is a set of C++ primitives that allows efficient and rapid development in C++17 on GNU/Linux systems.
* [Blender](https://www.blender.org/) is the free and open source 3D creation suite, supporting the entirety of the 3D pipeline.
* [Boo](https://github.com/AxioDL/boo) is a cross-platform windowing and event manager similar to SDL or SFML, with additional 3D rendering functionality.
* [CARTA](https://github.com/CARTAvis/carta-backend) is a new visualization tool designed for viewing radio astronomy images in CASA, FITS, MIRIAD, and HDF5 formats (using the IDIA custom schema for HDF5).
* [Catcoon](https://github.com/i-evi/catcoon) is a [feedforward neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network) implementation in C.
* [dab-cmdline](https://github.com/JvanKatwijk/dab-cmdline) provides entries for the functionality to handle Digital audio broadcasting (DAB)/DAB+ through some simple calls.
* [EDGE](https://github.com/3dfxdev/EDGE) is an advanced OpenGL source port spawned from the DOOM engine, with focus on easy development and expansion for modders and end-users.
* [Embree](https://github.com/embree/embree) a collection of high-performance ray tracing kernels. Its target users are graphics application engineers who want to improve the performance of their photo-realistic rendering application by leveraging Embree's performance-optimized ray tracing kernels.
* [emp-tool](https://github.com/emp-toolkit/emp-tool) aims to provide a benchmark for secure computation and allowing other researchers to experiment and extend.
* [FoundationDB](https://www.foundationdb.org) is a distributed database designed to handle large volumes of structured data across clusters of commodity servers.
* [iqtree_arm_neon](https://github.com/joshlvmh/iqtree_arm_neon) is the Arm NEON port of [IQ-TREE](http://www.iqtree.org/), fast and effective stochastic algorithm to infer phylogenetic trees by maximum likelihood.
* [kram](https://github.com/alecazam/kram) is a wrapper to several popular encoders to and from PNG/[KTX](https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/) files with [LDR/HDR and BC/ASTC/ETC2](https://developer.arm.com/solutions/graphics-and-gaming/developer-guides/learn-the-basics/adaptive-scalable-texture-compression/single-page).
* [libscapi](https://github.com/cryptobiu/libscapi) stands for the "Secure Computation API", providing reliable, efficient, and highly flexible cryptographic infrastructure.
* [libmatoya](https://github.com/matoya/libmatoya) is a cross-platform application development library, providing various features such as common cryptography tasks.
* [Madronalib](https://github.com/madronalabs/madronalib) enables efficient audio DSP on SIMD processors with readable and brief C++ code.
* [minimap2](https://github.com/lh3/minimap2) is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database.
* [MMseqs2](https://github.com/soedinglab/MMseqs2) (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
* [MRIcroGL](https://github.com/rordenlab/MRIcroGL) is a cross-platform tool for viewing NIfTI, DICOM, MGH, MHD, NRRD, AFNI format medical images.
* [N2](https://github.com/oddconcepts/n2o) is an approximate nearest neighbor algorithm library written in C++, providing much faster search speed than other implementations when modeling large datasets.
* [niimath](https://github.com/rordenlab/niimath) is a general image calculator with superior performance.
* [OBS Studio](https://github.com/obsproject/obs-studio) is software designed for capturing, compositing, encoding, recording, and streaming video content efficiently.
* [OGRE](https://github.com/OGRECave/ogre) is a scene-oriented, flexible 3D engine written in C++ designed to make it easier and more intuitive for developers to produce games and demos utilising 3D hardware.
* [OpenXRay](https://github.com/OpenXRay/xray-16) is an improved version of the X-Ray engine, used in world famous S.T.A.L.K.E.R. game series by GSC Game World.
* [parallel-n64](https://github.com/libretro/parallel-n64) is an optimized/rewritten Nintendo 64 emulator made specifically for [Libretro](https://www.libretro.com/).
* [PFFFT](https://github.com/marton78/pffft) does 1D Fast Fourier Transforms, of single precision real and complex vectors.
* [PlutoSDR Firmware](https://github.com/seanstone/plutosdr-fw) is the customized firmware for the [PlutoSDR](https://wiki.analog.com/university/tools/pluto) that can be used to introduce fundamentals of Software Defined Radio (SDR), Radio Frequency (RF), or Communications as advanced topics in electrical engineering in a self-paced or instructor-led setting.
* [Pygame](https://www.pygame.org) is cross-platform and designed to make it easy to write multimedia software, such as games, in Python.
* [simd_utils](https://github.com/JishinMaster/simd_utils) is a header-only library implementing common mathematical functions using SIMD intrinsics.
* [SMhasher](https://github.com/rurban/smhasher) provides comprehensive Hash function quality and speed tests.
* [Spack](https://github.com/spack/spack) is a multi-platform package manager that builds and installs multiple versions and configurations of software.
* [srsLTE](https://github.com/srsLTE/srsLTE) is an open source SDR LTE software suite.
* [Surge](https://github.com/surge-synthesizer/surge) is an open source digital synthesizer.
* [XMRig](https://github.com/xmrig/xmrig) is an open source CPU miner for [Monero](https://web.getmonero.org/) cryptocurrency.
## Related Projects
* [SIMDe](https://github.com/simd-everywhere/simde): fast and portable implementations of SIMD
intrinsics on hardware which doesn't natively support them, such as calling SSE functions on ARM.
* [CatBoost's sse2neon](https://github.com/catboost/catboost/blob/master/library/cpp/sse/sse2neon.h)
* [ARM\_NEON\_2\_x86\_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE)
* [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon)
* [POWER/PowerPC support for GCC](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000) contains a series of headers that simplify porting x86_64 code making explicit use of Intel intrinsics to powerpc64le (the pure little-endian mode introduced with [POWER8](https://en.wikipedia.org/wiki/POWER8)).
- implementation: [xmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/xmmintrin.h), [emmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/emmintrin.h), [pmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/pmmintrin.h), [tmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/tmmintrin.h), [smmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/smmintrin.h)
## Reference
* [Intel Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/)
* [Arm Neon Intrinsics Reference](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
* [Neon Programmer's Guide for Armv8-A](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a)
* [NEON Programmer's Guide](https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf)
* [qemu/target/i386/ops_sse.h](https://github.com/qemu/qemu/blob/master/target/i386/ops_sse.h): Comprehensive SSE instruction emulation in C. Ideal for semantic checks.
## Licensing
`sse2neon` is freely redistributable under the MIT License.

8309
third_party/sse2neon/sse2neon.h vendored Normal file

File diff suppressed because it is too large.