[RISCV] Combine concat_vectors of loads into strided loads
If we're concatenating several smaller loads separated by a stride, we can try to increase the element size and perform a strided load instead. For example:

```
concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2), (load v4i8, p+n*3)
=> vlse32 p, stride=n, VL=4
```

This pattern can be produced by the SLP vectorizer.

A special case is when the stride is exactly equal to the width of the vector, in which case the concatenation can be converted into a single consecutive vector load. For example:

```
concat_vectors (load v4i8, p), (load v4i8, p+4), (load v4i8, p+8), (load v4i8, p+12)
=> vle8 p, VL=16
```

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D147713
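For concreteness, here is a minimal LLVM IR sketch of the input pattern, modelled on the strided_runtime test updated below; the function name @concat_strided_loads is illustrative and not part of the patch. Two <4 x i16> loads a runtime stride %s apart are concatenated by a shufflevector and stored, which reaches the DAG as a concat_vectors of loads and, per the updated checks below, now lowers to a single vlse64.v with stride %s.

```
; Sketch only: two strided <4 x i16> loads concatenated into an <8 x i16>.
define void @concat_strided_loads(ptr %x, ptr %z, i64 %s) {
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
```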
@@ -1128,7 +1128,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   if (Subtarget.hasVInstructions())
     setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
                          ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
-                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR});
+                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR,
+                         ISD::CONCAT_VECTORS});
   if (Subtarget.hasVendorXTHeadMemPair())
     setTargetDAGCombine({ISD::LOAD, ISD::STORE});
   if (Subtarget.useRVVForFixedLengthVectors())
@@ -11147,6 +11148,136 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
   return tryFoldSelectIntoOp(N, DAG, FalseVal, TrueVal, /*Swapped*/true);
 }
 
+// If we're concatenating a series of vector loads like
+// concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
+// Then we can turn this into a strided load by widening the vector elements
+// vlse32 p, stride=n
+static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget,
+                                            const RISCVTargetLowering &TLI) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // Only perform this combine on legal MVTs.
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // TODO: Potentially extend this to scalable vectors
+  if (VT.isScalableVector())
+    return SDValue();
+
+  auto *BaseLd = dyn_cast<LoadSDNode>(N->getOperand(0));
+  if (!BaseLd || !BaseLd->isSimple() || !ISD::isNormalLoad(BaseLd) ||
+      !SDValue(BaseLd, 0).hasOneUse())
+    return SDValue();
+
+  EVT BaseLdVT = BaseLd->getValueType(0);
+  SDValue BasePtr = BaseLd->getBasePtr();
+
+  // Go through the loads and check that they're strided
+  SDValue CurPtr = BasePtr;
+  SDValue Stride;
+  Align Align = BaseLd->getAlign();
+
+  for (SDValue Op : N->ops().drop_front()) {
+    auto *Ld = dyn_cast<LoadSDNode>(Op);
+    if (!Ld || !Ld->isSimple() || !Op.hasOneUse() ||
+        Ld->getChain() != BaseLd->getChain() || !ISD::isNormalLoad(Ld) ||
+        Ld->getValueType(0) != BaseLdVT)
+      return SDValue();
+
+    SDValue Ptr = Ld->getBasePtr();
+    // Check that each load's pointer is (add CurPtr, Stride)
+    if (Ptr.getOpcode() != ISD::ADD || Ptr.getOperand(0) != CurPtr)
+      return SDValue();
+    SDValue Offset = Ptr.getOperand(1);
+    if (!Stride)
+      Stride = Offset;
+    else if (Offset != Stride)
+      return SDValue();
+
+    // The common alignment is the most restrictive (smallest) of all the loads
+    Align = std::min(Align, Ld->getAlign());
+
+    CurPtr = Ptr;
+  }
+
+  // A special case is if the stride is exactly the width of one of the loads,
+  // in which case it's contiguous and can be combined into a regular vle
+  // without changing the element size
+  if (auto *ConstStride = dyn_cast<ConstantSDNode>(Stride);
+      ConstStride &&
+      ConstStride->getZExtValue() == BaseLdVT.getFixedSizeInBits() / 8) {
+    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+        BaseLd->getPointerInfo(), BaseLd->getMemOperand()->getFlags(),
+        VT.getStoreSize(), Align);
+    // Can't do the combine if the load isn't naturally aligned with the element
+    // type
+    if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(),
+                                            DAG.getDataLayout(), VT, *MMO))
+      return SDValue();
+
+    SDValue WideLoad = DAG.getLoad(VT, DL, BaseLd->getChain(), BasePtr, MMO);
+    for (SDValue Ld : N->ops())
+      DAG.makeEquivalentMemoryOrdering(cast<LoadSDNode>(Ld), WideLoad);
+    return WideLoad;
+  }
+
+  // Get the widened scalar type, e.g. v4i8 -> i64
+  unsigned WideScalarBitWidth =
+      BaseLdVT.getScalarSizeInBits() * BaseLdVT.getVectorNumElements();
+  MVT WideScalarVT = MVT::getIntegerVT(WideScalarBitWidth);
+
+  // Get the vector type for the strided load, e.g. 4 x v4i8 -> v4i64
+  MVT WideVecVT = MVT::getVectorVT(WideScalarVT, N->getNumOperands());
+  if (!TLI.isTypeLegal(WideVecVT))
+    return SDValue();
+
+  MVT ContainerVT = TLI.getContainerForFixedLengthVector(WideVecVT);
+  SDValue VL =
+      getDefaultVLOps(WideVecVT, ContainerVT, DL, DAG, Subtarget).second;
+  SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+  SDValue IntID =
+      DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, Subtarget.getXLenVT());
+  SDValue Ops[] = {BaseLd->getChain(),
+                   IntID,
+                   DAG.getUNDEF(ContainerVT),
+                   BasePtr,
+                   Stride,
+                   VL};
+
+  uint64_t MemSize;
+  if (auto *ConstStride = dyn_cast<ConstantSDNode>(Stride))
+    // total size = (elsize * n) + (stride - elsize) * (n-1)
+    //            = elsize + stride * (n-1)
+    MemSize = WideScalarVT.getSizeInBits() +
+              ConstStride->getSExtValue() * (N->getNumOperands() - 1);
+  else
+    // If Stride isn't constant, then we can't know how much it will load
+    MemSize = MemoryLocation::UnknownSize;
+
+  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+      BaseLd->getPointerInfo(), BaseLd->getMemOperand()->getFlags(), MemSize,
+      Align);
+
+  // Can't do the combine if the common alignment isn't naturally aligned with
+  // the new element type
+  if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(),
+                                          DAG.getDataLayout(), WideVecVT, *MMO))
+    return SDValue();
+
+  SDValue StridedLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
+                                                Ops, WideVecVT, MMO);
+  for (SDValue Ld : N->ops())
+    DAG.makeEquivalentMemoryOrdering(cast<LoadSDNode>(Ld), StridedLoad);
+
+  // Note: Perform the bitcast before the convertFromScalableVector so we have
+  // balanced pairs of convertFromScalable/convertToScalable
+  SDValue Res = DAG.getBitcast(
+      TLI.getContainerForFixedLengthVector(VT.getSimpleVT()), StridedLoad);
+  return convertFromScalableVector(VT, Res, DAG, Subtarget);
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -11654,6 +11785,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return Gather;
     break;
   }
+  case ISD::CONCAT_VECTORS:
+    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
+      return V;
+    break;
   case RISCVISD::VMV_V_X_VL: {
     // Tail agnostic VMV.V.X only demands the vector element bitwidth from the
     // scalar input.

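As a worked example of the two paths in the new combine (a sketch, not part of the committed diff): with four <4 x i16> operands and a constant 16-byte stride, each operand is widened to an i64 element, the strided load is a v4i64, and the comment's formula gives a footprint of elsize + stride * (n - 1) = 8 + 16 * 3 = 56 bytes. When the stride instead equals the operand width, the early-exit path emits an ordinary unit-stride load, as in this IR modelled on the widen_2xv4i16 test below (the function name is illustrative):

```
; Sketch only: the second load starts exactly 8 bytes (one <4 x i16>) after
; the first, so the concat is contiguous and folds to a single 8-element vle16.
define void @contiguous_concat(ptr %x, ptr %z) {
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
```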
@@ -1,17 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
 ; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,ZVE64F
 
+; The two loads are contigous and should be folded into one
 define void @widen_2xv4i16(ptr %x, ptr %z) {
 ; CHECK-LABEL: widen_2xv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -59,6 +56,26 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
+;
+; ZVE64F-LABEL: widen_3xv4i16:
+; ZVE64F:       # %bb.0:
+; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE64F-NEXT:    vle16.v v8, (a0)
+; ZVE64F-NEXT:    addi a2, a0, 8
+; ZVE64F-NEXT:    vle16.v v10, (a2)
+; ZVE64F-NEXT:    addi a0, a0, 16
+; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
+; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; ZVE64F-NEXT:    vslidedown.vi v10, v8, 2
+; ZVE64F-NEXT:    addi a0, a1, 16
+; ZVE64F-NEXT:    vse64.v v10, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVE64F-NEXT:    vse16.v v8, (a1)
+; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 8
   %b = load <4 x i16>, ptr %b.gep
@@ -74,20 +91,8 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
 define void @widen_4xv4i16(ptr %x, ptr %z) {
 ; CHECK-LABEL: widen_4xv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a2, a0, 8
-; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    addi a2, a0, 16
-; CHECK-NEXT:    vle16.v v12, (a2)
-; CHECK-NEXT:    addi a0, a0, 24
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -108,13 +113,10 @@ define void @widen_4xv4i16(ptr %x, ptr %z) {
 define void @strided_constant(ptr %x, ptr %z) {
 ; CHECK-LABEL: strided_constant:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 16
@@ -128,13 +130,10 @@ define void @strided_constant(ptr %x, ptr %z) {
 define void @strided_constant_64(ptr %x, ptr %z) {
 ; CHECK-LABEL: strided_constant_64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 64
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 64
@@ -219,13 +218,9 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_runtime:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -238,21 +233,9 @@ define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
 define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_runtime_4xv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 12
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -307,6 +290,25 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV64-NEXT:    vslideup.vi v8, v14, 12
 ; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
+;
+; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
+; ZVE64F:       # %bb.0:
+; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE64F-NEXT:    vle16.v v8, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
+; ZVE64F-NEXT:    vle16.v v10, (a0)
+; ZVE64F-NEXT:    add a0, a0, a3
+; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
+; ZVE64F-NEXT:    vle16.v v14, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
+; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
+; ZVE64F-NEXT:    vse16.v v8, (a1)
+; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
   %b = load <4 x i16>, ptr %b.gep
@@ -324,21 +326,9 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_runtime_4xv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 12
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x half>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -357,21 +347,9 @@ define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
 define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_runtime_4xv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle32.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 6
-; CHECK-NEXT:    vse32.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <2 x float>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -406,17 +384,13 @@ define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
   ret void
 }
 
-; Shouldn't be combined because the loads have different alignments
+; Should use the most restrictive common alignment
 define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_mismatched_alignments:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x, align 8
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -429,13 +403,9 @@ define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
 define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_ok_alignments_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x, align 8
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -448,13 +418,9 @@ define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
 define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
 ; CHECK-LABEL: strided_ok_alignments_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vlse64.v v8, (a0), a2
+; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x, align 16
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -499,3 +465,4 @@ define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
   store <8 x i16> %c, ptr %z
   ret void
 }
+