[VectorCombine] transform bitcasted shuffle to narrower elements

bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'

We do not attempt this in InstCombine because we do not want to change
types and create new shuffle ops that are potentially not lowered as
well as the original code. Here, we can check the cost model to see if
it is worthwhile.

I've aggressively enabled this transform even if the types are the same
size and/or equal cost because moving the bitcast allows InstCombine to
make further simplifications.

In the motivating cases from PR35454:
https://bugs.llvm.org/show_bug.cgi?id=35454
...this is enough to let instcombine and the backend eliminate the
redundant shuffles, but we probably want to extend VectorCombine to
handle the inverse pattern (shuffle-of-bitcast) to get that
simplification directly in IR.

Differential Revision: https://reviews.llvm.org/D76727
This commit is contained in:
Sanjay Patel
2020-04-02 13:21:29 -04:00
parent f2334a7ef2
commit b6050ca181
2 changed files with 107 additions and 25 deletions

View File

@@ -17,6 +17,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -244,6 +245,49 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) {
return true;
}
/// If this is a bitcast to narrow elements from a shuffle of wider elements,
/// try to bitcast the source vector to the narrow type followed by shuffle.
/// This can enable further transforms by moving bitcasts or shuffles together.
static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
Value *V;
ArrayRef<int> Mask;
if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(),
m_Mask(Mask))))))
return false;
Type *DestTy = I.getType();
Type *SrcTy = V->getType();
if (!DestTy->isVectorTy() || I.getOperand(0)->getType() != SrcTy)
return false;
// TODO: Handle bitcast from narrow element type to wide element type.
assert(SrcTy->isVectorTy() && "Shuffle of non-vector type?");
unsigned DestNumElts = DestTy->getVectorNumElements();
unsigned SrcNumElts = SrcTy->getVectorNumElements();
if (SrcNumElts > DestNumElts)
return false;
// The new shuffle must not cost more than the old shuffle. The bitcast is
// moved ahead of the shuffle, so assume that it has the same cost as before.
if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
return false;
// Bitcast the source vector and expand the shuffle mask to the equivalent for
// narrow elements.
// bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
IRBuilder<> Builder(&I);
Value *CastV = Builder.CreateBitCast(V, DestTy);
SmallVector<int, 16> NewMask;
assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
unsigned ScaleFactor = DestNumElts / SrcNumElts;
scaleShuffleMask(ScaleFactor, Mask, NewMask);
Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy),
NewMask);
I.replaceAllUsesWith(Shuf);
return true;
}
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -265,6 +309,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
if (isa<DbgInfoIntrinsic>(I))
continue;
MadeChange |= foldExtractExtract(I, TTI);
MadeChange |= foldBitcastShuf(I, TTI);
}
}

View File

@@ -2,28 +2,39 @@
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb)
define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: ret <16 x i8> [[TMP2]]
;
%shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%r = bitcast <4 x i32> %shuf to <16 x i8>
ret <16 x i8> %r
}
; v4f32 is the same cost as v4i32, so this always works
define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_same_size(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[R]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: ret <4 x float> [[TMP2]]
;
%shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%r = bitcast <4 x i32> %shuf to <4 x float>
ret <4 x float> %r
}
; Negative test - length-changing shuffle
define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -35,6 +46,8 @@ define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
ret <16 x i8> %r
}
; Negative test - must cast to vector type
define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -46,6 +59,8 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
ret i128 %r
}
; Negative test - but might want to try this
define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
; CHECK-LABEL: @bitcast_shuf_wide_element(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
@@ -59,6 +74,8 @@ define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
declare void @use(<4 x i32>)
; Negative test - don't create an extra shuffle
define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_uses(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -73,15 +90,25 @@ define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
}
define <2 x i64> @PR35454_1(<2 x i64> %v) {
; CHECK-LABEL: @PR35454_1(
; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
; CHECK-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
; CHECK-NEXT: ret <2 x i64> [[BC3]]
; SSE-LABEL: @PR35454_1(
; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
; SSE-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; SSE-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
; SSE-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
; SSE-NEXT: ret <2 x i64> [[BC3]]
;
; AVX-LABEL: @PR35454_1(
; AVX-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8>
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: [[ADD:%.*]] = shl <16 x i8> [[TMP2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
; AVX-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
; AVX-NEXT: ret <2 x i64> [[BC3]]
;
%bc = bitcast <2 x i64> %v to <4 x i32>
%permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -94,15 +121,25 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) {
}
define <2 x i64> @PR35454_2(<2 x i64> %v) {
; CHECK-LABEL: @PR35454_2(
; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
; CHECK-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
; CHECK-NEXT: ret <2 x i64> [[BC3]]
; SSE-LABEL: @PR35454_2(
; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
; SSE-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
; SSE-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
; SSE-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
; SSE-NEXT: ret <2 x i64> [[BC3]]
;
; AVX-LABEL: @PR35454_2(
; AVX-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16>
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
; AVX-NEXT: [[ADD:%.*]] = shl <8 x i16> [[TMP2]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
; AVX-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
; AVX-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
; AVX-NEXT: ret <2 x i64> [[BC3]]
;
%bc = bitcast <2 x i64> %v to <4 x i32>
%permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>