/* * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #pragma once #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/debug_helpers.h" #include #include namespace OCLRT { #if __AVX2__ struct uint16x16_t { enum { numChannels = 16 }; __m256i value; uint16x16_t() { value = _mm256_setzero_si256(); } uint16x16_t(__m256i value) : value(value) { } uint16x16_t(uint16_t a) { value = _mm256_set1_epi16(a); //AVX } explicit uint16x16_t(const void *alignedPtr) { load(alignedPtr); } inline uint16_t get(unsigned int element) { DEBUG_BREAK_IF(element >= numChannels); return reinterpret_cast(&value)[element]; } static inline uint16x16_t zero() { return uint16x16_t(static_cast(0u)); } static inline uint16x16_t one() { return uint16x16_t(static_cast(1u)); } static inline uint16x16_t mask() { return uint16x16_t(static_cast(0xffffu)); } inline void load(const void *alignedPtr) { DEBUG_BREAK_IF(!isAligned<32>(alignedPtr)); value = _mm256_load_si256(reinterpret_cast(alignedPtr)); //AVX } inline void loadUnaligned(const void *ptr) { value = _mm256_loadu_si256(reinterpret_cast(ptr)); //AVX } inline void store(void *alignedPtr) { DEBUG_BREAK_IF(!isAligned<32>(alignedPtr)); _mm256_store_si256(reinterpret_cast<__m256i *>(alignedPtr), value); //AVX } inline void storeUnaligned(void *ptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), value); //AVX } inline operator bool() const { return _mm256_testz_si256(value, mask().value) ? false : true; //AVX } inline uint16x16_t &operator-=(const uint16x16_t &a) { value = _mm256_sub_epi16(value, a.value); //AVX2 return *this; } inline uint16x16_t &operator+=(const uint16x16_t &a) { value = _mm256_add_epi16(value, a.value); //AVX2 return *this; } inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) { uint16x16_t result; result.value = _mm256_xor_si256(mask().value, _mm256_cmpgt_epi16(b.value, a.value)); //AVX2 return result; } inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) { uint16x16_t result; result.value = _mm256_and_si256(a.value, b.value); //AVX2 return result; } // NOTE: uint16x16_t::blend behaves like mask ? a : b inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) { uint16x16_t result; // Have to swap arguments to get intended calling semantics result.value = _mm256_blendv_epi8(b.value, a.value, mask.value); //AVX2 return result; } }; #endif // __AVX2__ } // namespace OCLRT