/* * Copyright (c) 2017, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #pragma once #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/debug_helpers.h" #include #include namespace OCLRT { struct uint16x8_t { enum { numChannels = 8 }; __m128i value; uint16x8_t() { value = _mm_setzero_si128(); } uint16x8_t(__m128i value) : value(value) { } uint16x8_t(uint16_t a) { value = _mm_set1_epi16(a); //SSE2 } explicit uint16x8_t(const void *alignedPtr) { load(alignedPtr); } inline uint16_t get(unsigned int element) { DEBUG_BREAK_IF(element >= numChannels); return reinterpret_cast(&value)[element]; } static inline uint16x8_t zero() { return uint16x8_t(static_cast(0u)); } static inline uint16x8_t one() { return uint16x8_t(static_cast(1u)); } static inline uint16x8_t mask() { return uint16x8_t(static_cast(0xffffu)); } inline void load(const void *alignedPtr) { DEBUG_BREAK_IF(!isAligned<16>(alignedPtr)); value = _mm_load_si128(reinterpret_cast(alignedPtr)); //SSE2 } inline void loadUnaligned(const void *ptr) { value = _mm_loadu_si128(reinterpret_cast(ptr)); //SSE2 } inline void store(void *alignedPtr) { DEBUG_BREAK_IF(!isAligned<16>(alignedPtr)); _mm_store_si128(reinterpret_cast<__m128i *>(alignedPtr), value); //SSE2 } inline void storeUnaligned(void *ptr) { _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), value); //SSE2 } inline operator bool() const { return _mm_test_all_zeros(value, mask().value) ? false : true; //SSE4.1 alternatives? } inline uint16x8_t &operator-=(const uint16x8_t &a) { value = _mm_sub_epi16(value, a.value); //SSE2 return *this; } inline uint16x8_t &operator+=(const uint16x8_t &a) { value = _mm_add_epi16(value, a.value); //SSE2 return *this; } inline friend uint16x8_t operator>=(const uint16x8_t &a, const uint16x8_t &b) { uint16x8_t result; result.value = _mm_xor_si128(mask().value, _mm_cmplt_epi16(a.value, b.value)); //SSE2 return result; } inline friend uint16x8_t operator&&(const uint16x8_t &a, const uint16x8_t &b) { uint16x8_t result; result.value = _mm_and_si128(a.value, b.value); //SSE2 return result; } // NOTE: uint16x8_t::blend behaves like mask ? a : b inline friend uint16x8_t blend(const uint16x8_t &a, const uint16x8_t &b, const uint16x8_t &mask) { uint16x8_t result; // Have to swap arguments to get intended calling semantics result.value = _mm_blendv_epi8(b.value, a.value, mask.value); //SSE4.1 alternatives? return result; } }; }