mirror of
https://github.com/intel/llvm.git
synced 2026-01-16 13:35:38 +08:00
[Headers][X86] Enable constexpr handling for MMX/SSE/AVX/AVX512 PMADDWD/PMADDUBSW intrinsics (#161563)
This PR updates the PMADDWD/PMADDUBSW builtins to support constant expression handling by extending VectorExprEvaluator::VisitCallExpr and by adding an interp__builtin_ia32_pmadd helper that InterpretBuiltin dispatches to for these builtins. Closes #155392
This commit is contained in:
committed by
GitHub
parent
bcec41e5e6
commit
ee192315b2
@@ -123,13 +123,16 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
|
||||
def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
|
||||
}
|
||||
|
||||
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
|
||||
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
|
||||
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
|
||||
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
|
||||
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
|
||||
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
|
||||
}
|
||||
|
||||
let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
|
||||
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
|
||||
}
|
||||
}
|
||||
|
||||
// AVX
|
||||
@@ -278,13 +281,14 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
|
||||
def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
|
||||
def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
|
||||
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
|
||||
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
|
||||
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
|
||||
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
|
||||
}
|
||||
|
||||
let Features = "sse2",
|
||||
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
|
||||
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
|
||||
|
||||
def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
|
||||
|
||||
def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
|
||||
@@ -581,8 +585,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
|
||||
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
|
||||
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
|
||||
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
|
||||
def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
|
||||
def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
|
||||
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
|
||||
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
|
||||
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
|
||||
@@ -619,6 +621,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
|
||||
|
||||
def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
|
||||
|
||||
def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
|
||||
def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
|
||||
|
||||
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
|
||||
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
|
||||
|
||||
@@ -1378,10 +1383,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
|
||||
def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
|
||||
}
|
||||
|
||||
let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
|
||||
def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
|
||||
def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
|
||||
}
|
||||
|
||||
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
|
||||
def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
|
||||
@@ -1999,6 +2000,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
|
||||
}
|
||||
|
||||
let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
|
||||
def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
|
||||
def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
|
||||
def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
|
||||
def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
|
||||
def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
|
||||
|
||||
@@ -2549,6 +2549,44 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Evaluates the pairwise multiply-add builtins (PMADDWD / PMADDUBSW family).
///
/// The two source vectors are popped from the interpreter stack (RHS first,
/// then LHS); the destination vector is left on top of the stack and is
/// written in place. Source elements are consumed in adjacent pairs: for
/// every pair the callback \p Fn receives (LoLHS, HiLHS, LoRHS, HiRHS) and
/// returns the value stored into the next destination element.
///
/// \param S     Interpreter state that owns the evaluation stack.
/// \param OpPC  Current code pointer (not used by this helper).
/// \param Call  The builtin call expression; both arguments must be vectors
///              and the call's result type must be a vector.
/// \param Fn    Combines one pair of LHS/RHS elements into a result value.
/// \returns Always true; this helper has no failure path.
static bool interp__builtin_ia32_pmadd(
    InterpState &S, CodePtr OpPC, const CallExpr *Call,
    llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &,
                             const APSInt &)>
        Fn) {
  assert(Call->getArg(0)->getType()->isVectorType() &&
         Call->getArg(1)->getType()->isVectorType());
  const Pointer &RHS = S.Stk.pop<Pointer>();
  const Pointer &LHS = S.Stk.pop<Pointer>();
  const Pointer &Dst = S.Stk.peek<Pointer>();

  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
  PrimType ElemT = *S.getContext().classify(VT->getElementType());
  unsigned NumElems = VT->getNumElements();
  // The loop below reads elements I and I + 1 and advances by two, so an odd
  // element count would read past the end and never hit the exit condition.
  assert(NumElems % 2 == 0 && "pmadd expects an even number of source elements");

  const auto *DestVT = Call->getType()->castAs<VectorType>();
  QualType DestElemTy = DestVT->getElementType();
  PrimType DestElemT = *S.getContext().classify(DestElemTy);
  // Signedness is a property of the destination *element* type; the call's
  // own type is the vector type, so query the element type instead.
  bool DestUnsigned = DestElemTy->isUnsignedIntegerOrEnumerationType();

  unsigned DstElem = 0;
  for (unsigned I = 0; I != NumElems; I += 2) {
    APSInt Result;
    // Fetch one adjacent pair from each source and let Fn combine them.
    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
      APSInt LoLHS = LHS.elem<T>(I).toAPSInt();
      APSInt HiLHS = LHS.elem<T>(I + 1).toAPSInt();
      APSInt LoRHS = RHS.elem<T>(I).toAPSInt();
      APSInt HiRHS = RHS.elem<T>(I + 1).toAPSInt();
      Result = APSInt(Fn(LoLHS, HiLHS, LoRHS, HiRHS), DestUnsigned);
    });

    // Each source pair produces exactly one destination element.
    INT_TYPE_SWITCH_NO_BOOL(DestElemT,
                            { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
    ++DstElem;
  }

  Dst.initializeAllElements();
  return true;
}
|
||||
|
||||
static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
|
||||
const CallExpr *Call,
|
||||
unsigned BuiltinID) {
|
||||
@@ -3471,6 +3509,30 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
|
||||
return interp__builtin_elementwise_int_binop(S, OpPC, Call,
|
||||
llvm::APIntOps::avgCeilU);
|
||||
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw128:
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw256:
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw512:
|
||||
return interp__builtin_ia32_pmadd(
|
||||
S, OpPC, Call,
|
||||
[](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
|
||||
const APSInt &HiRHS) {
|
||||
unsigned BitWidth = 2 * LoLHS.getBitWidth();
|
||||
return (LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
|
||||
.sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)));
|
||||
});
|
||||
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd128:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd256:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd512:
|
||||
return interp__builtin_ia32_pmadd(
|
||||
S, OpPC, Call,
|
||||
[](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
|
||||
const APSInt &HiRHS) {
|
||||
unsigned BitWidth = 2 * LoLHS.getBitWidth();
|
||||
return (LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
|
||||
(HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
|
||||
});
|
||||
|
||||
case clang::X86::BI__builtin_ia32_pmulhuw128:
|
||||
case clang::X86::BI__builtin_ia32_pmulhuw256:
|
||||
case clang::X86::BI__builtin_ia32_pmulhuw512:
|
||||
|
||||
@@ -11778,6 +11778,54 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
|
||||
case clang::X86::BI__builtin_ia32_pavgw512:
|
||||
return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU);
|
||||
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw128:
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw256:
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw512:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd128:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd256:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd512: {
|
||||
APValue SourceLHS, SourceRHS;
|
||||
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
|
||||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
|
||||
return false;
|
||||
|
||||
auto *DestTy = E->getType()->castAs<VectorType>();
|
||||
QualType DestEltTy = DestTy->getElementType();
|
||||
unsigned SourceLen = SourceLHS.getVectorLength();
|
||||
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
|
||||
SmallVector<APValue, 4> ResultElements;
|
||||
ResultElements.reserve(SourceLen / 2);
|
||||
|
||||
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
|
||||
const APSInt &LoLHS = SourceLHS.getVectorElt(EltNum).getInt();
|
||||
const APSInt &HiLHS = SourceLHS.getVectorElt(EltNum + 1).getInt();
|
||||
const APSInt &LoRHS = SourceRHS.getVectorElt(EltNum).getInt();
|
||||
const APSInt &HiRHS = SourceRHS.getVectorElt(EltNum + 1).getInt();
|
||||
unsigned BitWidth = 2 * LoLHS.getBitWidth();
|
||||
|
||||
switch (E->getBuiltinCallee()) {
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw128:
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw256:
|
||||
case clang::X86::BI__builtin_ia32_pmaddubsw512:
|
||||
ResultElements.push_back(APValue(
|
||||
APSInt((LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
|
||||
.sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth))),
|
||||
DestUnsigned)));
|
||||
break;
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd128:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd256:
|
||||
case clang::X86::BI__builtin_ia32_pmaddwd512:
|
||||
ResultElements.push_back(
|
||||
APValue(APSInt((LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
|
||||
(HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth)),
|
||||
DestUnsigned)));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
|
||||
}
|
||||
|
||||
case clang::X86::BI__builtin_ia32_pmulhuw128:
|
||||
case clang::X86::BI__builtin_ia32_pmulhuw256:
|
||||
case clang::X86::BI__builtin_ia32_pmulhuw512:
|
||||
|
||||
@@ -1035,10 +1035,9 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b)
|
||||
/// \param __b
|
||||
/// A 256-bit vector containing one of the source operands.
|
||||
/// \returns A 256-bit vector of [16 x i16] containing the result.
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
|
||||
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
|
||||
}
|
||||
|
||||
/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
|
||||
@@ -1067,9 +1066,8 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b)
|
||||
/// \param __b
|
||||
/// A 256-bit vector of [16 x i16] containing one of the source operands.
|
||||
/// \returns A 256-bit vector of [8 x i32] containing the result.
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_madd_epi16(__m256i __a, __m256i __b)
|
||||
{
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_madd_epi16(__m256i __a, __m256i __b) {
|
||||
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
|
||||
}
|
||||
|
||||
|
||||
@@ -1064,12 +1064,12 @@ _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
|
||||
(__v32hi)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
|
||||
_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
|
||||
return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
|
||||
_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
|
||||
__m512i __Y) {
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
|
||||
@@ -1077,26 +1077,26 @@ _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
|
||||
(__v32hi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
|
||||
_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
|
||||
(__v32hi)_mm512_maddubs_epi16(__X, __Y),
|
||||
(__v32hi)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
|
||||
_mm512_madd_epi16(__m512i __A, __m512i __B) {
|
||||
return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
|
||||
_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_madd_epi16(__A, __B),
|
||||
(__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
|
||||
_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_madd_epi16(__A, __B),
|
||||
|
||||
@@ -1295,21 +1295,21 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
|
||||
(__v16hi)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
|
||||
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
|
||||
(__v8hi)_mm_maddubs_epi16(__X, __Y),
|
||||
(__v8hi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
|
||||
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
|
||||
(__v8hi)_mm_maddubs_epi16(__X, __Y),
|
||||
(__v8hi)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
|
||||
__m256i __Y) {
|
||||
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
|
||||
@@ -1317,35 +1317,35 @@ _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
|
||||
(__v16hi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
|
||||
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
|
||||
(__v16hi)_mm256_maddubs_epi16(__X, __Y),
|
||||
(__v16hi)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_madd_epi16(__A, __B),
|
||||
(__v4si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_madd_epi16(__A, __B),
|
||||
(__v4si)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_madd_epi16(__A, __B),
|
||||
(__v8si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_madd_epi16(__A, __B),
|
||||
|
||||
@@ -2290,8 +2290,8 @@ _mm_avg_epu16(__m128i __a, __m128i __b) {
|
||||
/// A 128-bit signed [8 x i16] vector.
|
||||
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
|
||||
/// of both parameters.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
|
||||
__m128i __b) {
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm_madd_epi16(__m128i __a, __m128i __b) {
|
||||
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
|
||||
@@ -679,11 +679,10 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) {
|
||||
/// A 64-bit integer vector of [4 x i16].
|
||||
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
|
||||
/// products of both parameters.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_madd_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
|
||||
(__v8hi)__anyext128(__m2)));
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
|
||||
_mm_madd_pi16(__m64 __m1, __m64 __m2) {
|
||||
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1),
|
||||
(__v8hi)__zext128(__m2)));
|
||||
}
|
||||
|
||||
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
||||
|
||||
@@ -23,6 +23,9 @@
|
||||
|
||||
#define __trunc64(x) \
|
||||
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
|
||||
#define __zext128(x) \
|
||||
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
|
||||
1, 2, 3)
|
||||
#define __anyext128(x) \
|
||||
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
|
||||
1, -1, -1)
|
||||
@@ -504,10 +507,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
|
||||
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
|
||||
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
|
||||
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maddubs_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm_maddubs_epi16(__m128i __a, __m128i __b) {
|
||||
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
|
||||
}
|
||||
|
||||
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
|
||||
@@ -534,11 +536,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
|
||||
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
|
||||
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
|
||||
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_maddubs_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
|
||||
(__v16qi)__anyext128(__b)));
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm_maddubs_pi16(__m64 __a, __m64 __b) {
|
||||
return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a),
|
||||
(__v16qi)__zext128(__b)));
|
||||
}
|
||||
|
||||
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
|
||||
@@ -796,6 +797,7 @@ _mm_sign_pi32(__m64 __a, __m64 __b)
|
||||
}
|
||||
|
||||
#undef __anyext128
|
||||
#undef __zext128
|
||||
#undef __trunc64
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
|
||||
@@ -810,12 +810,14 @@ __m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
|
||||
// CHECK: call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
|
||||
return _mm256_madd_epi16(a, b);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v8si(_mm256_madd_epi16((__m256i)(__v16hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v16hi){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 50, 250, 610, 1130, 1810, 2650, 3650, 4810));
|
||||
|
||||
__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
|
||||
// CHECK-LABEL: test_mm256_maddubs_epi16
|
||||
// CHECK: call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
|
||||
return _mm256_maddubs_epi16(a, b);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v16hi(_mm256_maddubs_epi16((__m256i)(__v32qi){1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,8}, (__m256i)(__v32qs){2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8}), 5, 18, 39, 68, 15, 42, 77, 120, -3, -14, -33, -60, -15, -42, -77, -120));
|
||||
|
||||
__m128i test_mm_maskload_epi32(int const *a, __m128i m) {
|
||||
// CHECK-LABEL: test_mm_maskload_epi32
|
||||
|
||||
@@ -1650,35 +1650,46 @@ __m512i test_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
|
||||
// CHECK: @llvm.x86.avx512.pmaddubs.w.512
|
||||
return _mm512_maddubs_epi16(__X,__Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v32hi(_mm512_maddubs_epi16((__m512i)(__v64qi){2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, (__m512i)(__v64qs){5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5}), -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5));
|
||||
|
||||
__m512i test_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y) {
|
||||
// CHECK-LABEL: test_mm512_mask_maddubs_epi16
|
||||
// CHECK: @llvm.x86.avx512.pmaddubs.w.512
|
||||
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
|
||||
return _mm512_mask_maddubs_epi16(__W,__U,__X,__Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v32hi(_mm512_mask_maddubs_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32}, 0x0000FFFF, (__m512i)(__v64qi){2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, (__m512i)(__v64qs){5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5}), -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32));
|
||||
|
||||
__m512i test_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
|
||||
// CHECK-LABEL: test_mm512_maskz_maddubs_epi16
|
||||
// CHECK: @llvm.x86.avx512.pmaddubs.w.512
|
||||
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
|
||||
return _mm512_maskz_maddubs_epi16(__U,__X,__Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v32hi(_mm512_maskz_maddubs_epi16(0x0000FFFF, (__m512i)(__v64qi){2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, (__m512i)(__v64qs){5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5}), -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
|
||||
|
||||
__m512i test_mm512_madd_epi16(__m512i __A, __m512i __B) {
|
||||
// CHECK-LABEL: test_mm512_madd_epi16
|
||||
// CHECK: @llvm.x86.avx512.pmaddw.d.512
|
||||
return _mm512_madd_epi16(__A,__B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v16si(_mm512_madd_epi16((__m512i)(__v32hi){1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, (__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), 3, 7, 22, 30, 9, 21, 44, 60, 3, 7, 22, 30, 9, 21, 44, 60));
|
||||
|
||||
__m512i test_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
|
||||
// CHECK-LABEL: test_mm512_mask_madd_epi16
|
||||
// CHECK: @llvm.x86.avx512.pmaddw.d.512
|
||||
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
|
||||
return _mm512_mask_madd_epi16(__W,__U,__A,__B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v16si(_mm512_mask_madd_epi16((__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}, 0xF0F0, (__m512i)(__v32hi){1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, (__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), 100, 200, 300, 400, 9, 21, 44, 60, 900, 1000, 1100, 1200, 9, 21, 44, 60));
|
||||
|
||||
__m512i test_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
|
||||
// CHECK-LABEL: test_mm512_maskz_madd_epi16
|
||||
// CHECK: @llvm.x86.avx512.pmaddw.d.512
|
||||
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
|
||||
return _mm512_maskz_madd_epi16(__U,__A,__B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v16si(_mm512_maskz_madd_epi16(0xF0F0, (__m512i)(__v32hi){1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, (__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 9, 21, 44, 60, 0, 0, 0, 0, 9, 21, 44, 60));
|
||||
|
||||
__m256i test_mm512_cvtsepi16_epi8(__m512i __A) {
|
||||
// CHECK-LABEL: test_mm512_cvtsepi16_epi8
|
||||
|
||||
@@ -1865,6 +1865,7 @@ __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m12
|
||||
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
|
||||
return _mm_mask_maddubs_epi16(__W, __U, __X, __Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v8hi(_mm_mask_maddubs_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, 0x0F, (__m128i)(__v16qi){1, 1, 2, 2, 3, 3, 4, 4, 1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v16qs){2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -2, -2, -3, -3, -4, -4}), 5, 18, 39, 68, 5, 6, 7, 8));
|
||||
|
||||
__m128i test_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
|
||||
// CHECK-LABEL: test_mm_maskz_maddubs_epi16
|
||||
@@ -1872,6 +1873,7 @@ __m128i test_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
|
||||
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
|
||||
return _mm_maskz_maddubs_epi16(__U, __X, __Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v8hi(_mm_maskz_maddubs_epi16(0x0F, (__m128i)(__v16qi){1, 1, 2, 2, 3, 3, 4, 4, 1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v16qs){2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -2, -2, -3, -3, -4, -4}), 5, 18, 39, 68, 0, 0, 0, 0));
|
||||
|
||||
__m256i test_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
|
||||
// CHECK-LABEL: test_mm256_mask_maddubs_epi16
|
||||
@@ -1879,6 +1881,7 @@ __m256i test_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, _
|
||||
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
|
||||
return _mm256_mask_maddubs_epi16(__W, __U, __X, __Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v16hi(_mm256_mask_maddubs_epi16((__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16}, 0x00FF, (__m256i)(__v32qi){1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,8}, (__m256i)(__v32qs){2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8}), 5, 18, 39, 68, 15, 42, 77, 120, -9, -10, -11, -12, -13, -14, -15, -16));
|
||||
|
||||
__m256i test_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
|
||||
// CHECK-LABEL: test_mm256_maskz_maddubs_epi16
|
||||
@@ -1886,6 +1889,7 @@ __m256i test_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y)
|
||||
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
|
||||
return _mm256_maskz_maddubs_epi16(__U, __X, __Y);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v16hi(_mm256_maskz_maddubs_epi16(0x00FF, (__m256i)(__v32qi){1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,8}, (__m256i)(__v32qs){2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8}), 5, 18, 39, 68, 15, 42, 77, 120, 0, 0, 0, 0, 0, 0, 0, 0));
|
||||
|
||||
__m128i test_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
|
||||
// CHECK-LABEL: test_mm_mask_madd_epi16
|
||||
@@ -1893,6 +1897,7 @@ __m128i test_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
|
||||
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
|
||||
return _mm_mask_madd_epi16(__W, __U, __A, __B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v4si(_mm_mask_madd_epi16((__m128i)(__v4si){1, 2, 3, 4}, 0x3, (__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16}), 29, 81, 3, 4));
|
||||
|
||||
__m128i test_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
|
||||
// CHECK-LABEL: test_mm_maskz_madd_epi16
|
||||
@@ -1900,6 +1905,7 @@ __m128i test_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
|
||||
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
|
||||
return _mm_maskz_madd_epi16(__U, __A, __B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v4si(_mm_maskz_madd_epi16(0x3, (__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16}), 29, 81, 0, 0));
|
||||
|
||||
__m256i test_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
|
||||
// CHECK-LABEL: test_mm256_mask_madd_epi16
|
||||
@@ -1907,6 +1913,7 @@ __m256i test_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m25
|
||||
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
|
||||
return _mm256_mask_madd_epi16(__W, __U, __A, __B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v8si(_mm256_mask_madd_epi16((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}, 0x0F, (__m256i)(__v16hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v16hi){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 50, 250, 610, 1130, 5, 6, 7, 8));
|
||||
|
||||
__m256i test_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
|
||||
// CHECK-LABEL: test_mm256_maskz_madd_epi16
|
||||
@@ -1914,6 +1921,7 @@ __m256i test_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
|
||||
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
|
||||
return _mm256_maskz_madd_epi16(__U, __A, __B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v8si(_mm256_maskz_madd_epi16(0x0F, (__m256i)(__v16hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v16hi){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 50, 250, 610, 1130, 0, 0, 0, 0));
|
||||
|
||||
__m128i test_mm_cvtsepi16_epi8(__m128i __A) {
|
||||
// CHECK-LABEL: test_mm_cvtsepi16_epi8
|
||||
|
||||
@@ -355,12 +355,14 @@ __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
|
||||
// CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(
|
||||
return _mm_madd_pi16(a, b);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v2si(_mm_madd_pi16((__m64)(__v4hi){+1, -2, +3, -4}, (__m64)(__v4hi){-10, +8, +6, -4}), -26, 34));
|
||||
|
||||
__m64 test_mm_maddubs_pi16(__m64 a, __m64 b) {
|
||||
// CHECK-LABEL: test_mm_maddubs_pi16
|
||||
// CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(
|
||||
return _mm_maddubs_pi16(a, b);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v4hi(_mm_maddubs_pi16((__m64)(__v8qi){16, 17, 18, 19, 20, 21, 22, 23}, (__m64)(__v8qi){1, 2, 3, 4, 5, 0, 7, 8}), 50, 130, 100, 338));
|
||||
|
||||
void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) {
|
||||
// CHECK-LABEL: test_mm_maskmove_si64
|
||||
|
||||
@@ -852,6 +852,7 @@ __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
|
||||
// CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
|
||||
return _mm_madd_epi16(A, B);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v4si(_mm_madd_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16}), 29, 81, 149, 233));
|
||||
|
||||
void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
|
||||
// CHECK-LABEL: test_mm_maskmoveu_si128
|
||||
|
||||
@@ -96,6 +96,7 @@ __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
|
||||
// CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
|
||||
return _mm_maddubs_epi16(a, b);
|
||||
}
|
||||
TEST_CONSTEXPR(match_v8hi(_mm_maddubs_epi16((__m128i)(__v16qi){1, 1, 2, 2, 3, 3, 4, 4, 1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v16qs){2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -2, -2, -3, -3, -4, -4}), 5, 18, 39, 68, -3, -14, -33, -60));
|
||||
|
||||
__m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) {
|
||||
// CHECK-LABEL: test_mm_mulhrs_epi16
|
||||
|
||||
Reference in New Issue
Block a user