AMDGPU: Add VI i16 support

Patch By: Wei Ding

Differential Revision: https://reviews.llvm.org/D18049

llvm-svn: 285939
This commit is contained in:
Tom Stellard
2016-11-03 17:13:50 +00:00
parent 40c15abe5f
commit 2b3379cdff
42 changed files with 1483 additions and 348 deletions

View File

@@ -493,6 +493,8 @@ def isCIVI : Predicate <
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;

View File

@@ -586,19 +586,32 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister: it is free when the destination
  // is strictly narrower than the source and is a whole multiple of 32 bits
  // wide.
  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();
  return DestSize < SrcSize && DestSize % 32 == 0;
}
bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister. Element widths are compared so
  // vector types follow the same rule as scalars.
  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  // On subtargets with 16-bit instructions, truncating a value of 32 bits or
  // more down to 16 bits is also reported as free.
  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}
bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  // Work on element widths so vector types follow the same rule as scalars.
  const unsigned FromBits = Src->getScalarSizeInBits();
  const unsigned ToBits = Dest->getScalarSizeInBits();

  // Any source that is not 16 bits wide, and every source on a subtarget
  // without 16-bit instructions, is only free for the 32 -> 64 bit case.
  if (FromBits != 16 || !Subtarget->has16BitInsts())
    return FromBits == 32 && ToBits == 64;

  // 16-bit source with 16-bit instructions available: extending to 32 bits or
  // wider is reported as free.
  return ToBits >= 32;
}
@@ -607,6 +620,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
// practical purposes, the extra mov 0 to load a 64-bit is free. As used,
// this will enable reducing 64-bit operations to 32-bit, which is always
// good.
if (Src == MVT::i16)
return Dest == MVT::i32 ||Dest == MVT::i64 ;
return Src == MVT::i32 && Dest == MVT::i64;
}
@@ -2446,6 +2463,10 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
if (VT.isVector() || Size > 64)
return SDValue();
// There are i16 integer mul/mad.
if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);

View File

@@ -529,14 +529,14 @@ multiclass BFIPatterns <Instruction BFI_INT,
def : Pat <
(fcopysign f32:$src0, f32:$src1),
(BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
(BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
>;
def : Pat <
(f64 (fcopysign f64:$src0, f64:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
(BFI_INT (LoadImm32 0x7fffffff),
(BFI_INT (LoadImm32 (i32 0x7fffffff)),
(i32 (EXTRACT_SUBREG $src0, sub1)),
(i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
>;
@@ -545,7 +545,7 @@ multiclass BFIPatterns <Instruction BFI_INT,
(f64 (fcopysign f64:$src0, f32:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
(BFI_INT (LoadImm32 0x7fffffff),
(BFI_INT (LoadImm32 (i32 0x7fffffff)),
(i32 (EXTRACT_SUBREG $src0, sub1)),
$src1), sub1)
>;

View File

@@ -708,13 +708,13 @@ let Predicates = [isGCN] in {
// int_SI_vs_load_input
def : Pat<
(SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0)
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
>;
// Offset in a 32-bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0)
(BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
>;
@@ -914,7 +914,7 @@ def : Pat<
>;
class MUBUFLoad_Pattern <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : Pat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
@@ -936,15 +936,34 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
}
let Predicates = [isSICI] in {
def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
} // End Predicates = [isSICI]
// Selection pattern for a load of type `vt` through the load fragment `ld`
// whose address is matched by MUBUFOffset. The matched resource, scalar
// offset, immediate offset and glc/slc/tfe bits are forwarded unchanged to the
// given instruction `Instr_OFFSET`.
multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag ld> {
def : Pat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
(Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
>;
}
let Predicates = [Has16BitInsts] in {
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
} // End Predicates = [Has16BitInsts]
class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
(vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
@@ -953,6 +972,8 @@ class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
@@ -1025,6 +1046,20 @@ defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
} // End Predicates = [isSICI]
// Selection pattern for a store of a `vt` value through the store fragment
// `st` whose address is matched by MUBUFOffset. The stored value comes first
// in the output instruction, followed by the matched resource, scalar offset,
// immediate offset and glc/slc/tfe bits.
multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag st> {
def : Pat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
(Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
>;
}
defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat <
(st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
u16imm:$offset)),
@@ -1033,6 +1068,8 @@ class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat
def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;

View File

@@ -489,8 +489,12 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>;
def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>;
def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>;
def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>;
// Sign-extending 16-bit local load to i32 (listed once; a second identical
// def would only add a redundant pattern).
def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
def : DSReadPat <DS_READ_U16, i16, si_load_local>;
def : DSReadPat <DS_READ_B32, i32, si_load_local>;
let AddedComplexity = 100 in {
@@ -512,6 +516,8 @@ class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>;
def : DSWritePat <DS_WRITE_B16, i16, si_store_local>;
def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
let AddedComplexity = 100 in {
@@ -522,8 +528,8 @@ def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
def : Pat <
(si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
i8:$offset1)),
(DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
(EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
(DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
(i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
(i1 0))
>;

View File

@@ -341,6 +341,8 @@ let Predicates = [isCIVI] in {
def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
@@ -389,6 +391,10 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
} // End Predicates = [isCIVI]
let Predicates = [isVI] in {
// A truncating store of an i16 value to i8 must write exactly one byte, so it
// selects the byte store; the short store would also overwrite the adjacent
// byte in memory.
def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>;
}
//===----------------------------------------------------------------------===//

View File

@@ -78,6 +78,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
if (Subtarget->has16BitInsts())
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
computeRegisterProperties(STI.getRegisterInfo());
// We need to custom lower vector stores from local memory
@@ -221,6 +224,55 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::Constant, MVT::i16, Legal);
setOperationAction(ISD::SMIN, MVT::i16, Legal);
setOperationAction(ISD::SMAX, MVT::i16, Legal);
setOperationAction(ISD::UMIN, MVT::i16, Legal);
setOperationAction(ISD::UMAX, MVT::i16, Legal);
setOperationAction(ISD::SETCC, MVT::i16, Promote);
AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32);
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
setOperationAction(ISD::ROTR, MVT::i16, Promote);
setOperationAction(ISD::ROTL, MVT::i16, Promote);
setOperationAction(ISD::SDIV, MVT::i16, Promote);
setOperationAction(ISD::UDIV, MVT::i16, Promote);
setOperationAction(ISD::SREM, MVT::i16, Promote);
setOperationAction(ISD::UREM, MVT::i16, Promote);
setOperationAction(ISD::BSWAP, MVT::i16, Promote);
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
setOperationAction(ISD::CTTZ, MVT::i16, Promote);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
setOperationAction(ISD::BR_CC, MVT::i16, Expand);
setOperationAction(ISD::LOAD, MVT::i16, Custom);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32);
setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
}
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -2558,7 +2610,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = Load->getMemoryVT();
if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
// FIXME: Copied from PPC
// First, load into 32 bits, then truncate to 1 bit.
@@ -2566,8 +2617,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue BasePtr = Load->getBasePtr();
MachineMemOperand *MMO = Load->getMemOperand();
EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, MVT::i8, MMO);
BasePtr, RealMemVT, MMO);
SDValue Ops[] = {
DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
@@ -3381,8 +3434,23 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
}
EVT VT = K0->getValueType(0);
return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
MVT NVT = MVT::i32;
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
SDValue Tmp1, Tmp2, Tmp3;
Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
if (VT == MVT::i16) {
Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
Tmp1, Tmp2, Tmp3);
return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
} else
return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
}
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {

View File

@@ -1128,7 +1128,6 @@ def getAtomicNoRetOp : InstrMapping {
include "SIInstructions.td"
include "CIInstructions.td"
include "VIInstructions.td"
include "DSInstructions.td"
include "MIMGInstructions.td"

View File

@@ -374,7 +374,7 @@ def : Pat<
def : Pat <
(int_AMDGPU_kilp),
(SI_KILL 0xbf800000)
(SI_KILL (i32 0xbf800000))
>;
def : Pat <
@@ -555,7 +555,7 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
def : Pat <
(AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
(f32 FP_ZERO), (f32 FP_ONE)),
(V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
(V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
>;
/********** ================================ **********/
@@ -566,7 +566,7 @@ def : Pat <
def : Pat <
(fneg (fabs f32:$src)),
(S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit
(S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
>;
// FIXME: Should use S_OR_B32
@@ -575,19 +575,19 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
(V_MOV_B32_e32 0x80000000)), // Set sign bit.
(V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
sub1)
>;
def : Pat <
(fabs f32:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff))
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
>;
def : Pat <
(fneg f32:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
>;
def : Pat <
@@ -595,8 +595,8 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1),
(V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
(V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit.
sub1)
>;
@@ -605,8 +605,8 @@ def : Pat <
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
(V_MOV_B32_e32 0x80000000)),
(V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(i32 (V_MOV_B32_e32 (i32 0x80000000)))),
sub1)
>;
@@ -666,21 +666,21 @@ def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
// Expands int_AMDGPU_cube on a v4f32 into the four cube instructions. Each
// one reads elements sub0..sub2 of the source as f32 with no source
// modifiers, clamp or omod, and writes one element of the v4f32 result.
def : Pat <
  (int_AMDGPU_cube v4f32:$src),
  (REG_SEQUENCE VReg_128,
    (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
                  0 /* clamp */, 0 /* omod */), sub0,
    (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
                  0 /* clamp */, 0 /* omod */), sub1,
    (V_CUBEMA_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
                  0 /* clamp */, 0 /* omod */), sub2,
    (V_CUBEID_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
                  0 /* clamp */, 0 /* omod */), sub3)
>;
@@ -701,7 +701,7 @@ def : Ext32Pat <anyext>;
def : Pat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
(V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
(V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
@@ -767,32 +767,37 @@ def : Pat <
//===----------------------------------------------------------------------===//
def : Pat<(i32 (sext_inreg i32:$src, i1)),
(S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
def : Pat <
(i64 (sext_inreg i64:$src, i1)),
(S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;
def : Pat <
(i16 (sext_inreg i16:$src, i8)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i8)),
(S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i16)),
(S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i32)),
(S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
def : Pat <
(i64 (zext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
def : Pat <
@@ -804,7 +809,7 @@ class ZExt_i64_i1_Pat <SDNode ext> : Pat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
(S_MOV_B32 0), sub1)
(S_MOV_B32 (i32 0)), sub1)
>;
@@ -816,25 +821,25 @@ def : ZExt_i64_i1_Pat<anyext>;
def : Pat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1)
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
def : Pat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 0, -1, $src), sub0,
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
>;
class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat <
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
(i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
@@ -859,12 +864,12 @@ def : Pat <
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
>;
def : Pat <
(f32 (uint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
>;
def : Pat <
@@ -888,20 +893,20 @@ def : Pat <
def : Pat <
(i1 (trunc i32:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), 1)
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
def : Pat <
(i1 (trunc i64:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
(EXTRACT_SUBREG $a, sub0)), 1)
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
def : Pat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 0x00ff00ff),
(V_ALIGNBIT_B32 $a, $a, 24),
(V_ALIGNBIT_B32 $a, $a, 8))
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
(V_ALIGNBIT_B32 $a, $a, (i32 24)),
(V_ALIGNBIT_B32 $a, $a, (i32 8)))
>;
def : Pat <
@@ -917,7 +922,7 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : Pat <
(vt (add (vt (shl 1, vt:$a)), -1)),
(BFM $a, (MOV 0))
(BFM $a, (MOV (i32 0)))
>;
}
@@ -928,7 +933,7 @@ def : BFEPattern <V_BFE_U32, S_MOV_B32>;
def : Pat<
(fcanonicalize f32:$src),
(V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0)
(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
>;
def : Pat<
@@ -963,7 +968,7 @@ def : Pat <
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
$x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
DSTCLAMP.NONE, DSTOMOD.NONE)
>;

View File

@@ -123,7 +123,7 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
(add (sequence "SGPR%u", 0, 103))> {
let AllocationPriority = 1;
}
@@ -190,7 +190,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
(add (decimate (shl TTMP_32, 3), 4))]>;
// VGPR 32-bit registers
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
// i16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
@@ -258,8 +259,8 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
}
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
(add SReg_32_XM0, M0)> {
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
(add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
let AllocationPriority = 1;
}
@@ -346,7 +347,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
}
def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)> {
def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> {
let isAllocatable = 0;
}

View File

@@ -879,7 +879,7 @@ def : Pat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 0), sub1))
(S_MOV_B32 (i32 0)), sub1))
>;
def : Pat <
@@ -887,6 +887,18 @@ def : Pat <
(S_ABS_I32 $x)
>;
// An i16 immediate is materialized with the 32-bit scalar move.
def : Pat <
(i16 imm:$imm),
(S_MOV_B32 imm:$imm)
>;
// Sign-extension of an i16 to i32.
// Same as a 32-bit inreg
def : Pat<
(i32 (sext i16:$src)),
(S_SEXT_I32_I16 $src)
>;
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
@@ -898,6 +910,29 @@ def : Pat <
(S_ADD_U32 $src0, $src1)
>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple
// outputs.
// zext i16 -> i64: the low half is the source masked with 0xffff, the high
// half is a zero constant.
def : Pat<
(i64 (zext i16:$src)),
(REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
(S_MOV_B32 (i32 0)), sub1)
>;
// sext i16 -> i64: the low half is the sign-extended 32-bit value, the high
// half is that same value arithmetically shifted right by 31.
def : Pat <
(i64 (sext i16:$src)),
(REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
>;
// zext i16 -> i32: mask the source with 0xffff.
def : Pat<
(i32 (zext i16:$src)),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
//===----------------------------------------------------------------------===//
// SOPP Patterns
//===----------------------------------------------------------------------===//

View File

@@ -1,10 +0,0 @@
//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// Instruction definitions for VI and newer.
//===----------------------------------------------------------------------===//

View File

@@ -301,6 +301,20 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>;
}
let Predicates = [isVI] in {
def : Pat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
def : Pat<
(i16 (fp_to_f16 f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
}
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -561,10 +575,39 @@ def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
let Predicates = [isVI] in {
def : Pat <
(int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
imm:$bound_ctrl),
(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
imm:$bound_ctrl)),
(V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
(as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
>;
def : Pat<
(i32 (anyext i16:$src)),
(COPY $src)
>;
def : Pat<
(i64 (anyext i16:$src)),
(REG_SEQUENCE VReg_64,
(i32 (COPY $src)), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
def : Pat<
(i16 (trunc i32:$src)),
(COPY $src)
>;
// Truncation to i1 keeps only bit 0 and yields a compare result, so mask the
// source with 1 and compare against 1, exactly as the i32 -> i1 truncation
// is selected. A plain COPY would not isolate bit 0.
def : Pat<
  (i1 (trunc i16:$src)),
  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $src), (i32 1))
>;
def : Pat <
(i16 (trunc i64:$src)),
(EXTRACT_SUBREG $src, sub0)
>;
} // End Predicates = [isVI]

View File

@@ -345,6 +345,78 @@ defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
} // End SubtargetPredicate = isVI
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
// Patterns for a two-operand i16 operation `op` selected to `inst`:
//  - the plain i16 form,
//  - the same operation followed by zext to i32, selected to `inst` alone,
//  - the same operation followed by zext to i64, selected to `inst` for the
//    low half and a zero constant for the high half.
// NOTE(review): the two zext forms are only correct when `inst` leaves zero in
// the high 16 bits of its 32-bit result; confirm this for every instruction
// this multiclass is instantiated with.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
def : Pat<
(op i16:$src0, i16:$src1),
(inst $src0, $src1)
>;
def : Pat<
(i32 (zext (op i16:$src0, i16:$src1))),
(inst $src0, $src1)
>;
def : Pat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
(inst $src0, $src1), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
// Patterns for an i16 operation `op` whose second operand is i32, selected to
// an instruction that takes its operands in reversed order: `$src1` is passed
// first and `$src0` second. As well as the plain i16 form, the operation
// followed by zext to i32 or i64 is selected to `inst` alone, with a zero
// constant supplying the high half of the i64 result.
// NOTE(review): the zext forms presumably rely on `inst` zeroing the high 16
// bits of its result; confirm for each instantiation.
multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
def : Pat<
(op i16:$src0, i32:$src1),
(inst $src1, $src0)
>;
def : Pat<
(i32 (zext (op i16:$src0, i32:$src1))),
(inst $src1, $src0)
>;
def : Pat<
(i64 (zext (op i16:$src0, i32:$src1))),
(REG_SEQUENCE VReg_64,
(inst $src1, $src0), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
// Extends an i1 to i16 through the extension node `ext` by emitting
// V_CNDMASK_B32_e64 with the constants 0 and 1 and `$src` as its last operand.
// NOTE(review): the non-zero constant is 1, not -1, so this presumably suits
// only zext/anyext; confirm before instantiating it with sext.
class ZExt_i16_i1_Pat <SDNode ext> : Pat <
(i16 (ext i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
>;
let Predicates = [isVI] in {
defm : Arithmetic_i16_Pats<add, V_ADD_U16_e32>;
defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e32>;
defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e32>;
defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e32>;
defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e32>;
defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e32>;
defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e32>;
// and/or/xor are selected to full 32-bit instructions, so the high 16 bits of
// the result are whatever the inputs carried there (an i16 obtained by
// truncating an i32 is just a copy of the wider register). Only the plain i16
// form is therefore selected here; folding a following zext into the
// instruction would leak those high bits into the extended value.
def : Pat<
  (and i16:$src0, i16:$src1),
  (V_AND_B32_e32 $src0, $src1)
>;
def : Pat<
  (or i16:$src0, i16:$src1),
  (V_OR_B32_e32 $src0, $src1)
>;
def : Pat<
  (xor i16:$src0, i16:$src1),
  (V_XOR_B32_e32 $src0, $src1)
>;
defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e32>;
defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e32>;
defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_B16_e32>;
def : ZExt_i16_i1_Pat<zext>;
// Sign-extending a true i1 must produce all ones, so select between 0 and -1
// rather than between 0 and 1.
def : Pat<
  (i16 (sext i1:$src)),
  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
>;
def : ZExt_i16_i1_Pat<anyext>;
} // End Predicates = [isVI]
//===----------------------------------------------------------------------===//
// SI
//===----------------------------------------------------------------------===//

View File

@@ -222,6 +222,38 @@ let isCommutable = 1 in {
} // End SubtargetPredicate = isVI
def : Pat <
(i16 (select i1:$src0, i16:$src1, i16:$src2)),
(V_CNDMASK_B32_e64 $src2, $src1, $src0)
>;
let Predicates = [isVI] in {
// Patterns that fuse `op2 (op1 $src0, $src1), $src2` on i16 operands into the
// single three-operand instruction `inst`:
//  - the plain i16 form,
//  - the fused operation followed by the extension `op3` to i32, selected to
//    `inst` alone,
//  - the fused operation followed by `op3` to i64, selected to `inst` for the
//    low half and a zero constant for the high half.
// NOTE(review): the i64 form always writes zero to the high half regardless of
// `op3`, and the i32 form adds no explicit extension; both presumably depend
// on what `inst` leaves in the high 16 bits. Confirm for a sign-extending
// `op3`.
// NOTE(review): the first pattern does not depend on `op3`, so instantiating
// the multiclass twice with the same `op1`/`op2` yields two patterns with the
// same source DAG.
multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
def : Pat<
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2)
>;
def : Pat<
(i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
(inst i16:$src0, i16:$src1, i16:$src2)
>;
def : Pat<
(i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
(REG_SEQUENCE VReg_64,
(inst i16:$src0, i16:$src1, i16:$src2), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [isVI]
//===----------------------------------------------------------------------===//
// Target

View File

@@ -0,0 +1,149 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds two i16 values loaded per work-item and stores the i16 result. The
; checks expect one v_add_u16 whose result feeds a 16-bit store directly.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
%b = load volatile i16, i16 addrspace(1)* %gep.in1
%add = add i16 %a, %b
store i16 %add, i16 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds the constant 123 to a loaded i16. The checks expect the constant to
; appear as the literal operand 0x7b of v_add_u16.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_constant:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
%add = add i16 %a, 123
store i16 %add, i16 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds the negative constant -845 to a loaded i16. The checks expect it to be
; printed as the 32-bit literal 0xfffffcb3 on v_add_u16.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_neg_constant:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
%add = add i16 %a, -845
store i16 %add, i16 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds -1 to a loaded i16. The checks expect -1 to be printed as an operand of
; v_add_u16 as-is rather than as a hexadecimal literal.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_inline_neg1:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
%add = add i16 %a, -1
store i16 %add, i16 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Zero-extends an i16 add result to i32. The checks expect the v_add_u16 result
; to be stored as a dword with no separate extension instruction in between.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_dword [[ADD]]
define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
%b = load volatile i16, i16 addrspace(1)* %gep.in1
%add = add i16 %a, %b
%ext = zext i16 %add to i32
store i32 %ext, i32 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Zero-extends an i16 add result to i64. The checks expect the 64-bit value to
; be formed from the v_add_u16 result (low half) and a register set to 0 (high
; half), in either order, before the dwordx2 store.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
%b = load volatile i16, i16 addrspace(1)* %gep.in1
%add = add i16 %a, %b
%ext = zext i16 %add to i64
store i64 %ext, i64 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Sign-extends an i16 add result to i32. The checks expect an explicit
; v_bfe_i32 (offset 0, width 16) on the v_add_u16 result before the store.
; Unlike the zext tests in this file, the loads here are not volatile.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
%a = load i16, i16 addrspace(1)* %gep.in0
%b = load i16, i16 addrspace(1)* %gep.in1
%add = add i16 %a, %b
%ext = sext i16 %add to i32
store i32 %ext, i32 addrspace(1)* %out
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Sign-extends an i16 add result to i64. The checks expect v_bfe_i32 to produce
; the low half and an arithmetic shift right by 31 of that low half to produce
; the high half, followed by a dwordx2 store of the pair.
; NOTE(review): %gep.out is computed but the store writes through %out.
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
%a = load i16, i16 addrspace(1)* %gep.in0
%b = load i16, i16 addrspace(1)* %gep.in1
%add = add i16 %a, %b
%ext = sext i16 %add to i64
store i64 %ext, i64 addrspace(1)* %out
ret void
}
; Work-item id intrinsic used by the tests above to form per-lane addresses.
declare i32 @llvm.amdgcn.workitem.id.x() #0
; #0 is applied to the intrinsic declaration; #1 to the test functions.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

View File

@@ -1,15 +1,40 @@
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; CHECK-LABEL: {{^}}anyext_i1_i32:
; CHECK: v_cndmask_b32_e64
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-LABEL: {{^}}anyext_i1_i32:
; GCN: v_cndmask_b32_e64
define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp eq i32 %cond, 0
%1 = zext i1 %0 to i8
%2 = xor i8 %1, -1
%3 = and i8 %2, 1
%4 = zext i8 %3 to i32
store i32 %4, i32 addrspace(1)* %out
%tmp = icmp eq i32 %cond, 0
%tmp1 = zext i1 %tmp to i8
%tmp2 = xor i8 %tmp1, -1
%tmp3 = and i8 %tmp2, 1
%tmp4 = zext i8 %tmp3 to i32
store i32 %tmp4, i32 addrspace(1)* %out
ret void
}
; Truncates an i16 add result to i8, applies xor/and on the i8 value, then
; zero-extends to i32. Only the tonga run has instruction checks here; they
; expect the xor and and to be 32-bit VALU operations applied directly to the
; v_add_u16 result, with the and result stored as a dword.
; GCN-LABEL: {{^}}s_anyext_i16_i32:
; VI: v_add_u16_e32 [[ADD:v[0-9]+]],
; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
; VI: buffer_store_dword [[AND]]
define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
entry:
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
%a.ptr = getelementptr i16, i16 addrspace(1)* %a, i32 %tid.x
%b.ptr = getelementptr i16, i16 addrspace(1)* %b, i32 %tid.y
%a.l = load i16, i16 addrspace(1)* %a.ptr
%b.l = load i16, i16 addrspace(1)* %b.ptr
%tmp = add i16 %a.l, %b.l
%tmp1 = trunc i16 %tmp to i8
%tmp2 = xor i8 %tmp1, -1
%tmp3 = and i8 %tmp2, 1
%tmp4 = zext i8 %tmp3 to i32
store i32 %tmp4, i32 addrspace(1)* %out
ret void
}

View File

@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
@@ -12,7 +13,7 @@ declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
; FUNC-LABEL: {{^}}s_brev_i16:
; SI: s_brev_b32
; SI: s_brev_b32
define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out

View File

@@ -116,14 +116,19 @@ ret:
; OPT: store
; OPT: ret
; For GFX8: since i16 is legal type, we cannot sink lshr into BBs.
; GCN-LABEL: {{^}}sink_ubfe_i16:
; GCN-NOT: lshr
; VI: s_bfe_u32 s0, s0, 0xc0004
; GCN: s_cbranch_vccnz
; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
; VI: s_and_b32 s0, s0, 0xff
; GCN: BB2_2:
; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
; VI: s_and_b32 s0, s0, 0x7f
; GCN: BB2_3:
; GCN: buffer_store_short

View File

@@ -1,10 +1,13 @@
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; FUNC-LABEL: {{^}}test_copy_v4i8:
; SI: buffer_load_dword [[REG:v[0-9]+]]
; SI: buffer_store_dword [[REG]]
; SI: s_endpgm
; GCN: buffer_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
@@ -12,10 +15,10 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)*
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
; SI: buffer_load_dword [[REG:v[0-9]+]]
; SI: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
; SI: s_endpgm
; GCN: buffer_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -24,11 +27,11 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
; SI: buffer_load_dword [[REG:v[0-9]+]]
; SI: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
; SI: s_endpgm
; GCN: buffer_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -38,12 +41,12 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
; SI: buffer_load_dword [[REG:v[0-9]+]]
; SI: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
; SI: s_endpgm
; GCN: buffer_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -54,14 +57,14 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
}
; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
; SI: buffer_load_dword
; SI-DAG: v_lshrrev_b32
; SI: v_and_b32
; SI: v_or_b32
; SI-DAG: buffer_store_dword
; SI-DAG: buffer_store_dword
; GCN: buffer_load_dword
; GCN-DAG: v_lshrrev_b32
; GCN: v_and_b32
; GCN: v_or_b32
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dword
; SI: s_endpgm
; GCN: s_endpgm
define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
@@ -70,18 +73,22 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
; SI: buffer_load_dword
; SI-DAG: v_lshrrev_b32
; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: v_lshrrev_b32
; SI-DAG: v_add_i32
; SI-DAG: v_and_b32
; SI-DAG: v_or_b32
; SI-DAG: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: s_endpgm
; VI-DAG: v_add_u16
; GCN-DAG: v_and_b32
; GCN-DAG: v_or_b32
; GCN-DAG: {{buffer|flat}}_store_dword
; GCN: {{buffer|flat}}_store_dword
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -90,10 +97,10 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
; SI: buffer_load_dword
; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; SI: s_endpgm
; GCN: buffer_load_dword
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
@@ -101,11 +108,11 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; SI: s_endpgm
; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
@@ -113,14 +120,14 @@ define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
@@ -128,12 +135,12 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
}
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
@@ -141,15 +148,15 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8>
}
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4

View File

@@ -100,6 +100,7 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp
; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%val = load i8, i8 addrspace(1)* %valptr
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone

View File

@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone

View File

@@ -30,10 +30,10 @@ define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c)
}
; GCN-LABEL: {{^}}legacy_cube:
; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; GCN: buffer_store_dwordx4
define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
%cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)

View File

@@ -1,15 +1,15 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; SI-LABEL: {{^}}load_i8_to_f32:
; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
; SI-NOT: bfe
; SI-NOT: lshr
; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; SI: buffer_store_dword [[CONV]],
; GCN-LABEL: {{^}}load_i8_to_f32:
; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dword [[CONV]],
define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%cvt = uitofp i8 %load to float
@@ -17,11 +17,11 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
ret void
}
; SI-LABEL: {{^}}load_v2i8_to_v2f32:
; SI: buffer_load_ushort [[LD:v[0-9]+]]
; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
; GCN: buffer_load_ushort [[LD:v[0-9]+]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
%cvt = uitofp <2 x i8> %load to <2 x float>
@@ -29,13 +29,13 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
ret void
}
; SI-LABEL: {{^}}load_v3i8_to_v3f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: v_cvt_f32_ubyte3_e32
; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: v_cvt_f32_ubyte3_e32
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <3 x i8> %load to <3 x float>
@@ -43,15 +43,15 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
ret void
}
; SI-LABEL: {{^}}load_v4i8_to_v4f32:
; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
; SI-NOT: bfe
; SI-NOT: lshr
; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <4 x i8> %load to <4 x float>
@@ -63,19 +63,19 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; position in the word for the component.
; FIXME: Packing bytes
; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
; SI-DAG: v_lshlrev_b32
; SI-DAG: v_or_b32
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32
; GCN-DAG: v_or_b32
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
; SI: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
@@ -85,25 +85,31 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Instructions still emitted to repack bytes for add use.
; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
; SI: {{buffer|flat}}_load_dword
; SI-DAG: v_cvt_f32_ubyte0_e32
; SI-DAG: v_cvt_f32_ubyte1_e32
; SI-DAG: v_cvt_f32_ubyte2_e32
; SI-DAG: v_cvt_f32_ubyte3_e32
; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: v_cvt_f32_ubyte0_e32
; GCN-DAG: v_cvt_f32_ubyte1_e32
; GCN-DAG: v_cvt_f32_ubyte2_e32
; GCN-DAG: v_cvt_f32_ubyte3_e32
; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
; SI-DAG: v_add_i32
; SI: {{buffer|flat}}_store_dwordx4
; SI: {{buffer|flat}}_store_dword
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00,
; VI-DAG: v_add_u16_e32
; VI-DAG: v_add_u16_e32
; SI: s_endpgm
; GCN: {{buffer|flat}}_store_dwordx4
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@@ -116,8 +122,8 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
}
; Make sure this doesn't crash.
; SI-LABEL: {{^}}load_v7i8_to_v7f32:
; SI: s_endpgm
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <7 x i8> %load to <7 x float>
@@ -125,22 +131,22 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
ret void
}
; SI-LABEL: {{^}}load_v8i8_to_v8f32:
; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
; SI-NOT: bfe
; SI-NOT: lshr
; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
; SI-NOT: bfe
; SI-NOT: lshr
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>
@@ -148,11 +154,11 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
ret void
}
; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; SI: buffer_store_dword [[CONV]],
; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%add = add i32 %load, 2
@@ -162,7 +168,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
ret void
}
; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%inreg = and i32 %load, 65280
@@ -174,7 +180,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
; SI-LABEL: {{^}}i8_zext_i32_to_f32:
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%ext = zext i8 %load to i32
@@ -183,7 +189,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
ret void
}
; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%ext = zext <4 x i8> %load to <4 x i32>
@@ -192,11 +198,11 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4
ret void
}
; SI-LABEL: {{^}}extract_byte0_to_f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: [[VAL]]
; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[CONV]]
; GCN-LABEL: {{^}}extract_byte0_to_f32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%and = and i32 %val, 255
@@ -205,11 +211,11 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac
ret void
}
; SI-LABEL: {{^}}extract_byte1_to_f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: [[VAL]]
; SI: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[CONV]]
; GCN-LABEL: {{^}}extract_byte1_to_f32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 8
@@ -219,11 +225,11 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac
ret void
}
; SI-LABEL: {{^}}extract_byte2_to_f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: [[VAL]]
; SI: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[CONV]]
; GCN-LABEL: {{^}}extract_byte2_to_f32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 16
@@ -233,11 +239,11 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac
ret void
}
; SI-LABEL: {{^}}extract_byte3_to_f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: [[VAL]]
; SI: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[CONV]]
; GCN-LABEL: {{^}}extract_byte3_to_f32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 24

View File

@@ -0,0 +1,302 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
; FUNC-LABEL: {{^}}zextload_global_i16_to_i32:
; SI: buffer_load_ushort
; SI: buffer_store_dword
; SI: s_endpgm
; Reads one i16 through the addrspace(1) pointer %in, zero-extends it to i32
; and stores the widened value through %out.
define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %narrow = load i16, i16 addrspace(1)* %in
  %wide = zext i16 %narrow to i32
  store i32 %wide, i32 addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_i16_to_i32:
; SI: buffer_load_sshort
; SI: buffer_store_dword
; SI: s_endpgm
; Reads one i16 through the addrspace(1) pointer %in, sign-extends it to i32
; and stores the widened value through %out.
define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %narrow = load i16, i16 addrspace(1)* %in
  %wide = sext i16 %narrow to i32
  store i32 %wide, i32 addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
; SI: buffer_load_ushort
; SI: s_endpgm
; Single-element vector variant. Loads <1 x i16> from %in, zero-extends it to
; <1 x i32> and stores the result through %out.
define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %wide = zext <1 x i16> %vec to <1 x i32>
  store <1 x i32> %wide, <1 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
; SI: buffer_load_sshort
; SI: s_endpgm
; Single-element vector variant. Loads <1 x i16> from %in, sign-extends it to
; <1 x i32> and stores the result through %out.
define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %wide = sext <1 x i16> %vec to <1 x i32>
  store <1 x i32> %wide, <1 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
; SI: s_endpgm
; Loads <2 x i16> from %in, zero-extends every lane to i32 and stores the
; <2 x i32> result through %out.
define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %wide = zext <2 x i16> %vec to <2 x i32>
  store <2 x i32> %wide, <2 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
; SI: s_endpgm
; Loads <2 x i16> from %in, sign-extends every lane to i32 and stores the
; <2 x i32> result through %out.
define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %wide = sext <2 x i16> %vec to <2 x i32>
  store <2 x i32> %wide, <2 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
; SI: s_endpgm
; Loads <4 x i16> from %in, zero-extends every lane to i32 and stores the
; <4 x i32> result through %out.
define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %wide = zext <4 x i16> %vec to <4 x i32>
  store <4 x i32> %wide, <4 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
; SI: s_endpgm
; Loads <4 x i16> from %in, sign-extends every lane to i32 and stores the
; <4 x i32> result through %out.
define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %wide = sext <4 x i16> %vec to <4 x i32>
  store <4 x i32> %wide, <4 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
; SI: s_endpgm
; Loads <8 x i16> from %in, zero-extends every lane to i32 and stores the
; <8 x i32> result through %out.
define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %wide = zext <8 x i16> %vec to <8 x i32>
  store <8 x i32> %wide, <8 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
; SI: s_endpgm
; Loads <8 x i16> from %in, sign-extends every lane to i32 and stores the
; <8 x i32> result through %out.
define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %wide = sext <8 x i16> %vec to <8 x i32>
  store <8 x i32> %wide, <8 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
; SI: s_endpgm
; Loads <16 x i16> from %in, zero-extends every lane to i32 and stores the
; <16 x i32> result through %out.
define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %wide = zext <16 x i16> %vec to <16 x i32>
  store <16 x i32> %wide, <16 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
; SI: s_endpgm
; Loads <16 x i16> from %in, sign-extends every lane to i32 and stores the
; <16 x i32> result through %out.
define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %wide = sext <16 x i16> %vec to <16 x i32>
  store <16 x i32> %wide, <16 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
; SI: s_endpgm
; Loads <32 x i16> from %in, zero-extends every lane to i32 and stores the
; <32 x i32> result through %out.
define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %wide = zext <32 x i16> %vec to <32 x i32>
  store <32 x i32> %wide, <32 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
; SI: s_endpgm
; Loads <32 x i16> from %in, sign-extends every lane to i32 and stores the
; <32 x i32> result through %out.
define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %wide = sext <32 x i16> %vec to <32 x i32>
  store <32 x i32> %wide, <32 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
; SI: s_endpgm
; Loads <64 x i16> from %in, zero-extends every lane to i32 and stores the
; <64 x i32> result through %out.
define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <64 x i16>, <64 x i16> addrspace(1)* %in
  %wide = zext <64 x i16> %vec to <64 x i32>
  store <64 x i32> %wide, <64 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
; SI: s_endpgm
; Loads <64 x i16> from %in, sign-extends every lane to i32 and stores the
; <64 x i32> result through %out.
define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <64 x i16>, <64 x i16> addrspace(1)* %in
  %wide = sext <64 x i16> %vec to <64 x i32>
  store <64 x i32> %wide, <64 x i32> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; Reads one i16 through %in, zero-extends it straight to i64 and stores the
; 64-bit value through %out.
define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %narrow = load i16, i16 addrspace(1)* %in
  %wide = zext i16 %narrow to i64
  store i64 %wide, i64 addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0
; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0
; Reads one i16 through %in, sign-extends it straight to i64 and stores the
; 64-bit value through %out.
; NOTE(review) - the three checks directly above use the VI prefix, while the
; RUN lines of this file only hand the SI and FUNC prefixes to FileCheck, so
; those checks appear never to be evaluated. Confirm and either enable or
; drop them.
define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %narrow = load i16, i16 addrspace(1)* %in
  %wide = sext i16 %narrow to i64
  store i64 %wide, i64 addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
; SI: s_endpgm
; Single-element vector variant. Loads <1 x i16> from %in, zero-extends it to
; <1 x i64> and stores the result through %out.
define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %wide = zext <1 x i16> %vec to <1 x i64>
  store <1 x i64> %wide, <1 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
; SI: s_endpgm
; Single-element vector variant. Loads <1 x i16> from %in, sign-extends it to
; <1 x i64> and stores the result through %out.
define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <1 x i16>, <1 x i16> addrspace(1)* %in
  %wide = sext <1 x i16> %vec to <1 x i64>
  store <1 x i64> %wide, <1 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
; SI: s_endpgm
; Loads <2 x i16> from %in, zero-extends every lane to i64 and stores the
; <2 x i64> result through %out.
define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %wide = zext <2 x i16> %vec to <2 x i64>
  store <2 x i64> %wide, <2 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
; SI: s_endpgm
; Loads <2 x i16> from %in, sign-extends every lane to i64 and stores the
; <2 x i64> result through %out.
define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %wide = sext <2 x i16> %vec to <2 x i64>
  store <2 x i64> %wide, <2 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
; SI: s_endpgm
; Loads <4 x i16> from %in, zero-extends every lane to i64 and stores the
; <4 x i64> result through %out.
define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %wide = zext <4 x i16> %vec to <4 x i64>
  store <4 x i64> %wide, <4 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
; SI: s_endpgm
; Loads <4 x i16> from %in, sign-extends every lane to i64 and stores the
; <4 x i64> result through %out.
define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %wide = sext <4 x i16> %vec to <4 x i64>
  store <4 x i64> %wide, <4 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
; SI: s_endpgm
; Loads <8 x i16> from %in, zero-extends every lane to i64 and stores the
; <8 x i64> result through %out.
define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %wide = zext <8 x i16> %vec to <8 x i64>
  store <8 x i64> %wide, <8 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
; SI: s_endpgm
; Loads <8 x i16> from %in, sign-extends every lane to i64 and stores the
; <8 x i64> result through %out.
define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in
  %wide = sext <8 x i16> %vec to <8 x i64>
  store <8 x i64> %wide, <8 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
; SI: s_endpgm
; Loads <16 x i16> from %in, zero-extends every lane to i64 and stores the
; <16 x i64> result through %out.
define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %wide = zext <16 x i16> %vec to <16 x i64>
  store <16 x i64> %wide, <16 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
; SI: s_endpgm
; Loads <16 x i16> from %in, sign-extends every lane to i64 and stores the
; <16 x i64> result through %out.
define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in
  %wide = sext <16 x i16> %vec to <16 x i64>
  store <16 x i64> %wide, <16 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
; SI: s_endpgm
; Loads <32 x i16> from %in, zero-extends every lane to i64 and stores the
; <32 x i64> result through %out.
define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %wide = zext <32 x i16> %vec to <32 x i64>
  store <32 x i64> %wide, <32 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
; SI: s_endpgm
; Loads <32 x i16> from %in, sign-extends every lane to i64 and stores the
; <32 x i64> result through %out.
define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <32 x i16>, <32 x i16> addrspace(1)* %in
  %wide = sext <32 x i16> %vec to <32 x i64>
  store <32 x i64> %wide, <32 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
; SI: s_endpgm
; Loads <64 x i16> from %in, zero-extends every lane to i64 and stores the
; <64 x i64> result through %out.
define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <64 x i16>, <64 x i16> addrspace(1)* %in
  %wide = zext <64 x i16> %vec to <64 x i64>
  store <64 x i64> %wide, <64 x i64> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
; SI: s_endpgm
; Loads <64 x i16> from %in, sign-extends every lane to i64 and stores the
; <64 x i64> result through %out.
define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
  %vec = load <64 x i16>, <64 x i16> addrspace(1)* %in
  %wide = sext <64 x i16> %vec to <64 x i64>
  store <64 x i64> %wide, <64 x i64> addrspace(1)* %out
  ret void
}

View File

@@ -379,19 +379,33 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XSI: v_cvt_f32_f16_e32
; XSI: v_cvt_f32_f16_e32
; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; XSI: v_cvt_f32_f16_e32
; XSI-NOT: v_cvt_f32_f16
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; XVI: v_cvt_f32_f16_e32
; XVI-NOT: v_cvt_f32_f16
; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN-NOT: v_cvt_f64_f32_e32
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
%val = load <3 x half>, <3 x half> addrspace(1)* %in

View File

@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
@@ -73,11 +74,14 @@ define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) n
}
; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
; SI: buffer_load_dword
; GCN: buffer_load_dword
; SI: v_add_i32
; SI-NEXT: v_and_b32_e32
; FIXME: Should be using s_add_i32
; VI: v_add_i32
; VI-NEXT: v_and_b32_e32
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
; GCN: s_endpgm
define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%add = add i32 %load, 1

View File

@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}constant_load_i16:
@@ -428,8 +428,15 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(
}
; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; FIXME: Need to optimize this sequence to avoid extra bfe:
; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
; t31: i64 = any_extend t28
; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]

View File

@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -444,8 +444,15 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
}
; FUNC-LABEL: {{^}}global_sextload_i16_to_i64:
; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; FIXME: Need to optimize this sequence to avoid extra bfe:
; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
; t31: i64 = any_extend t28
; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]

View File

@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -163,7 +163,8 @@ define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
; GCN-NOHSA: buffer_load_dword v
; GCN-HSA: flat_load_dword v
; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
@@ -185,7 +186,16 @@ entry:
; GCN-NOHSA: buffer_load_dword v
; GCN-HSA: flat_load_dword v
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
;FIXME: Need to optimize this sequence to avoid extra shift on VI.
; t23: i16 = truncate t18
; t49: i16 = srl t23, Constant:i32<8>
; t57: i32 = any_extend t49
; t58: i32 = sign_extend_inreg t57, ValueType:ch:i8
; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8

View File

@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}local_load_i16:
@@ -539,7 +539,13 @@ define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)*
}
; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
; GCN: ds_read_i16 v[[LO:[0-9]+]],
; FIXME: Need to optimize this sequence to avoid an extra shift.
; t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
; t28: i64 = any_extend t25
; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
; SI: ds_read_i16 v[[LO:[0-9]+]],
; VI: ds_read_u16 v[[ULO:[0-9]+]]
; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]

View File

@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -141,8 +141,17 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
; GCN-NOT: s_wqm_b64
; GCN: s_mov_b32 m0
; GCN: ds_read_u16
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; FIXME: Need to optimize this sequence to avoid extra shift on VI.
; t23: i16 = srl t39, Constant:i32<8>
; t31: i32 = any_extend t23
; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
; EG: LDS_USHORT_READ_RET
; EG-DAG: BFE_INT
@@ -157,7 +166,8 @@ define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
; GCN: ds_read_b32
; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,

View File

@@ -1,11 +1,15 @@
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}u32_mad24:
; EG: MULADD_UINT24
; SI: v_mad_u32_u24
; VI: v_mad_u32_u24
define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
@@ -25,9 +29,9 @@ entry:
; The result must be sign-extended
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 16
; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
; FIXME: Should be using scalar instructions here.
; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
%0 = mul i16 %a, %b
@@ -37,14 +41,14 @@ entry:
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; FUNC-LABEL: {{^}}i8_mad24:
; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
; The result must be sign-extended
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 8
; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
%0 = mul i8 %a, %b

View File

@@ -0,0 +1,87 @@
; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; Signed i16 maximum written as icmp sge + select. Both operands are read from
; their arrays at the index returned by llvm.amdgcn.workitem.id.x and the
; selected value is stored at the same index of %out.
define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
  ; The element pointers advance in 2-byte steps, so only the natural i16
  ; alignment can be promised. Claiming 4 bytes would be undefined behavior
  ; for odd indices.
  %a = load i16, i16 addrspace(1)* %gep0, align 2
  %b = load i16, i16 addrspace(1)* %gep1, align 2
  %cmp = icmp sge i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %outgep, align 2
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; Lane-wise signed maximum of two <4 x i16> vectors, written as a vector
; icmp sge feeding a vector select. The operands and the destination are all
; indexed by the value returned from llvm.amdgcn.workitem.id.x.
define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %lhs.ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %id
  %rhs.ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %id
  %dst.ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %id
  %lhs = load <4 x i16>, <4 x i16> addrspace(1)* %lhs.ptr, align 4
  %rhs = load <4 x i16>, <4 x i16> addrspace(1)* %rhs.ptr, align 4
  %is.ge = icmp sge <4 x i16> %lhs, %rhs
  %max = select <4 x i1> %is.ge, <4 x i16> %lhs, <4 x i16> %rhs
  store <4 x i16> %max, <4 x i16> addrspace(1)* %dst.ptr, align 4
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sgt_i16:
; VI: v_max_i16_e32
; Signed i16 maximum written as icmp sgt + select. Both operands are read from
; their arrays at the index returned by llvm.amdgcn.workitem.id.x and the
; selected value is stored at the same index of %out.
define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
  ; The element pointers advance in 2-byte steps, so only the natural i16
  ; alignment can be promised. Claiming 4 bytes would be undefined behavior
  ; for odd indices.
  %a = load i16, i16 addrspace(1)* %gep0, align 2
  %b = load i16, i16 addrspace(1)* %gep1, align 2
  %cmp = icmp sgt i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %outgep, align 2
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_umax_uge_i16:
; VI: v_max_u16_e32
; Unsigned i16 maximum written as icmp uge + select. Both operands are read
; from their arrays at the index returned by llvm.amdgcn.workitem.id.x and the
; selected value is stored at the same index of %out.
define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
  ; The element pointers advance in 2-byte steps, so only the natural i16
  ; alignment can be promised. Claiming 4 bytes would be undefined behavior
  ; for odd indices.
  %a = load i16, i16 addrspace(1)* %gep0, align 2
  %b = load i16, i16 addrspace(1)* %gep1, align 2
  %cmp = icmp uge i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %outgep, align 2
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_umax_ugt_i16:
; VI: v_max_u16_e32
; Unsigned i16 maximum written as icmp ugt + select. Both operands are read
; from their arrays at the index returned by llvm.amdgcn.workitem.id.x and the
; selected value is stored at the same index of %out.
define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
  ; The element pointers advance in 2-byte steps, so only the natural i16
  ; alignment can be promised. Claiming 4 bytes would be undefined behavior
  ; for odd indices.
  %a = load i16, i16 addrspace(1)* %gep0, align 2
  %b = load i16, i16 addrspace(1)* %gep1, align 2
  %cmp = icmp ugt i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %outgep, align 2
  ret void
}

View File

@@ -31,7 +31,8 @@ entry:
}
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -62,8 +63,9 @@ entry:
}
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr:
; GCN: v_mul_u32_u24_e32
; GCN: v_and_b32_e32
; SI: v_mul_u32_u24_e32
; SI: v_and_b32_e32
; VI: v_mul_lo_u16
define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -77,9 +79,9 @@ define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in)
ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr:
; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
entry:

View File

@@ -53,6 +53,48 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
ret void
}
;VI: {{^}}shl_i16:
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Scalar i16 left shift with both operands loaded from memory.
; %a is read from %in and %b (the shift amount) from the i16 element that
; follows it; the result of `shl %a, %b` is stored to %out.
define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; Address of the second i16 element (index 1) after %in.
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%a = load i16, i16 addrspace(1) * %in
%b = load i16, i16 addrspace(1) * %b_ptr
%result = shl i16 %a, %b
store i16 %result, i16 addrspace(1)* %out
ret void
}
;VI: {{^}}shl_v2i16:
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Lane-wise left shift of a <2 x i16> vector.
; %a is read from %in and %b (the per-lane shift amounts) from the
; <2 x i16> element that follows it; `shl %a, %b` is stored to %out.
define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; Address of the second <2 x i16> element (index 1) after %in.
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1) * %in
%b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
%result = shl <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
ret void
}
;VI: {{^}}shl_v4i16:
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Lane-wise left shift of a <4 x i16> vector.
; %a is read from %in and %b (the per-lane shift amounts) from the
; <4 x i16> element that follows it; `shl %a, %b` is stored to %out.
define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; Address of the second <4 x i16> element (index 1) after %in.
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1) * %in
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
%result = shl <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
ret void
}
;EG-LABEL: {{^}}shl_i64:
;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}

View File

@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
@@ -55,22 +55,43 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no
}
; GCN-LABEL: {{^}}s_sext_i16_to_i64:
; GCN: s_endpgm
; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
; Sign-extends the i16 argument %a to i64 and stores the result to %out
; (8-byte aligned store). The input is a function argument, not a loaded value.
define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
%sext = sext i16 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
; GCN-LABEL: {{^}}s_sext_i1_to_i16:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
; Sign-extends the i1 result of an i32 equality compare to i16 and stores it
; to %out. With sext, a true compare yields -1 and a false compare yields 0.
define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i16
store i16 %sext, i16 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
; GCN-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8
; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
; GCN-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]
@@ -96,10 +117,17 @@ define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
; GCN-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
; GCN-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; FIXME: Need to optimize the same sequence as in the test above to avoid
; this shift.
; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]]
; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8
; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
; GCN: buffer_store_dword [[EXT0]]
; GCN: buffer_store_dword [[EXT1]]

View File

@@ -46,6 +46,36 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
ret void
}
; FUNC-LABEL: {{^}}ashr_v2i16:
; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with vector instructions rather than scalar ones.
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Lane-wise arithmetic right shift of a <2 x i16> vector.
; %a is read from %in and %b (the per-lane shift amounts) from the
; <2 x i16> element that follows it; `ashr %a, %b` is stored to %out.
define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; Address of the second <2 x i16> element (index 1) after %in.
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in
%b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
%result = ashr <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}ashr_v4i16:
; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with vector instructions rather than scalar ones.
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Lane-wise arithmetic right shift of a <4 x i16> vector.
; %a is read from %in and %b (the per-lane shift amounts) from the
; <4 x i16> element that follows it; `ashr %a, %b` is stored to %out.
define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; Address of the second <4 x i16> element (index 1) after %in.
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1)* %in
%b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
%result = ashr <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_ashr_i64:
; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8

View File

@@ -54,6 +54,46 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
ret void
}
; FUNC-LABEL: {{^}}test_sub_i16:
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Scalar i16 subtraction with both operands loaded from memory.
; %a is read from %in and %b from the i16 element that follows it; the
; difference `%a - %b` is stored to %out.
define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; Address of the second i16 element (index 1) after %in.
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%a = load i16, i16 addrspace(1)* %in
%b = load i16, i16 addrspace(1)* %b_ptr
%result = sub i16 %a, %b
store i16 %result, i16 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}test_sub_v2i16:
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Lane-wise subtraction of two <2 x i16> vectors.
; %a is read from %in and %b from the <2 x i16> element that follows it;
; `sub %a, %b` is stored to %out.
define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; Address of the second <2 x i16> element (index 1) after %in.
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1) * %in
%b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
%result = sub <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}test_sub_v4i16:
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; Lane-wise subtraction of two <4 x i16> vectors.
; %a is read from %in and %b from the <4 x i16> element that follows it;
; `sub %a, %b` is stored to %out.
define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; Address of the second <4 x i16> element (index 1) after %in.
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1) * %in
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
%result = sub <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_sub_i64:
; SI: s_sub_u32
; SI: s_subb_u32

View File

@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
; CHECK: buffer_load_dword v
@@ -47,7 +47,12 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace
}
; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
; FIXME: We need to teach the DAG combiner to reduce the load width for:
; t21: v2i32,ch = load<LD8[%in(addrspace=1)]> t12, t10, undef:i64
; t23: i64 = bitcast t21
; t30: i16 = truncate t23
; SI: buffer_load_dword v[[VAL:[0-9]+]]
; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
; CHECK: buffer_store_short v[[VAL]]
define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%ld = load <4 x i16>, <4 x i16> addrspace(1)* %in

View File

@@ -21,13 +21,20 @@ define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwi
ret void
}
; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
; SI: s_load_dword [[LOAD:s[0-9]+]],
; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
; SI: buffer_store_byte [[VREG]],
define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
; Truncates the i16 argument %val directly to i1 and stores it to %out
; (1-byte aligned store). The truncated value is the argument itself, with no
; intervening arithmetic.
define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
%trunc = trunc i16 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
}
; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
; Adds the two i16 arguments, truncates the sum to i1 and stores it to %out
; (1-byte aligned store). Unlike the s_arg_ variant, the truncated value is the
; result of an i16 add rather than a bare argument.
define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
%add = add i16 %val0, %val1
%trunc = trunc i16 %add to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
}

View File

@@ -2,39 +2,58 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
; R600: {{^}}test:
; R600: {{^}}s_mad_zext_i32_to_i64:
; R600: MEM_RAT_CACHELESS STORE_RAW
; R600: MEM_RAT_CACHELESS STORE_RAW
; SI: {{^}}test:
; SI: {{^}}s_mad_zext_i32_to_i64:
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
entry:
%0 = mul i32 %a, %b
%1 = add i32 %0, %c
%2 = zext i32 %1 to i64
store i64 %2, i64 addrspace(1)* %out
%tmp0 = mul i32 %a, %b
%tmp1 = add i32 %tmp0, %c
%tmp2 = zext i32 %tmp1 to i64
store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
; SI-LABEL: {{^}}testi1toi32:
; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32:
; SI: v_cndmask_b32
define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp eq i32 %a, %b
%1 = zext i1 %0 to i32
store i32 %1, i32 addrspace(1)* %out
%tmp0 = icmp eq i32 %a, %b
%tmp1 = zext i1 %tmp0 to i32
store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
; SI-LABEL: {{^}}zext_i1_to_i64:
; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
; Zero-extends the i1 argument %arg (passed with the zeroext attribute) to i64
; and stores the result to %out (8-byte aligned store).
define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
%ext = zext i1 %arg to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64:
; SI: s_mov_b32 s{{[0-9]+}}, 0
; SI: v_cmp_eq_u32
; SI: v_cndmask_b32
define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; Zero-extends the i1 result of an i32 equality compare to i64 and stores it
; to %out (8-byte aligned store). With zext, a true compare yields 1 and a
; false compare yields 0.
define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32 %a, %b
%ext = zext i1 %cmp to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16:
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; SI: buffer_store_short [[RESULT]]
; Compares the two zeroext i16 arguments for equality, zero-extends the i1
; result to i16 (1 when equal, 0 otherwise) and stores it to %out.
define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
%tmp0 = icmp eq i16 %a, %b
%tmp1 = zext i1 %tmp0 to i16
store i16 %tmp1, i16 addrspace(1)* %out
ret void
}
attributes #0 = { nounwind }