mirror of
https://github.com/intel/llvm.git
synced 2026-01-22 23:49:22 +08:00
[AVR] Custom lower 32-bit shift instructions
32-bit shift instructions were previously expanded using the default SelectionDAG expander, which meant it used 16-bit constant shifts and ORed them together. This works, but is far from optimal. I've optimized 32-bit shifts on AVR using a custom inserter. This is done using three new pseudo-instructions that take the upper and lower bits of the value in two separate 16-bit registers and outputs two 16-bit registers. This is the first commit in a series. When completed, shift instructions will take around 31% less instructions on average for constant 32-bit shifts, and is in all cases equal or better than the old behavior. It also tends to match or outperform avr-gcc: the only cases where avr-gcc does better is when it uses a loop to shift, or when the LLVM register allocator inserts some unnecessary movs. But it even outperforms avr-gcc in some cases where avr-gcc does not use a loop. As a side effect, non-constant 32-bit shifts also become more efficient. For some real-world differences: the build of compiler-rt I use in TinyGo becomes 2.7% smaller and the build of picolibc I use becomes 0.9% smaller. I think picolibc is a better representation of real-world code, but even a ~1% reduction in code size is really significant. The current patch just lays the groundwork. The result is actually a regression in code size. Later patches will use this as a basis to optimize these shift instructions. Differential Revision: https://reviews.llvm.org/D140569
This commit is contained in:
@@ -88,6 +88,9 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
|
||||
setOperationAction(ISD::SRA, MVT::i16, Custom);
|
||||
setOperationAction(ISD::SHL, MVT::i16, Custom);
|
||||
setOperationAction(ISD::SRL, MVT::i16, Custom);
|
||||
setOperationAction(ISD::SRA, MVT::i32, Custom);
|
||||
setOperationAction(ISD::SHL, MVT::i32, Custom);
|
||||
setOperationAction(ISD::SRL, MVT::i32, Custom);
|
||||
setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
|
||||
setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
|
||||
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
|
||||
@@ -247,10 +250,13 @@ const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE(CALL);
|
||||
NODE(WRAPPER);
|
||||
NODE(LSL);
|
||||
NODE(LSLW);
|
||||
NODE(LSR);
|
||||
NODE(LSRW);
|
||||
NODE(ROL);
|
||||
NODE(ROR);
|
||||
NODE(ASR);
|
||||
NODE(ASRW);
|
||||
NODE(LSLLOOP);
|
||||
NODE(LSRLOOP);
|
||||
NODE(ROLLOOP);
|
||||
@@ -279,6 +285,57 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
|
||||
assert(isPowerOf2_32(VT.getSizeInBits()) &&
|
||||
"Expected power-of-2 shift amount");
|
||||
|
||||
if (VT.getSizeInBits() == 32) {
|
||||
if (!isa<ConstantSDNode>(N->getOperand(1))) {
|
||||
// 32-bit shifts are converted to a loop in IR.
|
||||
// This should be unreachable.
|
||||
report_fatal_error("Expected a constant shift amount!");
|
||||
}
|
||||
SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16);
|
||||
SDValue SrcLo =
|
||||
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
|
||||
DAG.getConstant(0, dl, MVT::i16));
|
||||
SDValue SrcHi =
|
||||
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
|
||||
DAG.getConstant(1, dl, MVT::i16));
|
||||
uint64_t ShiftAmount =
|
||||
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||
if (ShiftAmount == 16) {
|
||||
// Special case these two operations because they appear to be used by the
|
||||
// generic codegen parts to lower 32-bit numbers.
|
||||
// TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit
|
||||
// shift of a part of the 32-bit value?
|
||||
switch (Op.getOpcode()) {
|
||||
case ISD::SHL: {
|
||||
SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
|
||||
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo);
|
||||
}
|
||||
case ISD::SRL: {
|
||||
SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
|
||||
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero);
|
||||
}
|
||||
}
|
||||
}
|
||||
SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8);
|
||||
unsigned Opc;
|
||||
switch (Op.getOpcode()) {
|
||||
default:
|
||||
llvm_unreachable("Invalid 32-bit shift opcode!");
|
||||
case ISD::SHL:
|
||||
Opc = AVRISD::LSLW;
|
||||
break;
|
||||
case ISD::SRL:
|
||||
Opc = AVRISD::LSRW;
|
||||
break;
|
||||
case ISD::SRA:
|
||||
Opc = AVRISD::ASRW;
|
||||
break;
|
||||
}
|
||||
SDValue Result = DAG.getNode(Opc, dl, ResTys, SrcLo, SrcHi, Cnt);
|
||||
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Result.getValue(0),
|
||||
Result.getValue(1));
|
||||
}
|
||||
|
||||
// Expand non-constant shifts to loops.
|
||||
if (!isa<ConstantSDNode>(N->getOperand(1))) {
|
||||
switch (Op.getOpcode()) {
|
||||
@@ -1789,6 +1846,114 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
|
||||
return RemBB;
|
||||
}
|
||||
|
||||
// Do a multibyte AVR shift. Insert shift instructions and put the output
|
||||
// registers in the Regs array.
|
||||
// Because AVR does not have a normal shift instruction (only a single bit shift
|
||||
// instruction), we have to emulate this behavior with other instructions.
|
||||
static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
|
||||
MutableArrayRef<std::pair<Register, int>> Regs,
|
||||
ISD::NodeType Opc, int64_t ShiftAmt) {
|
||||
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
|
||||
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
|
||||
const DebugLoc &dl = MI.getDebugLoc();
|
||||
|
||||
const bool ShiftLeft = Opc == ISD::SHL;
|
||||
const bool ArithmeticShift = Opc == ISD::SRA;
|
||||
|
||||
// Shift by one. This is the fallback that always works, and the shift
|
||||
// operation that is used for 1, 2, and 3 bit shifts.
|
||||
while (ShiftLeft && ShiftAmt) {
|
||||
// Shift one to the left.
|
||||
for (ssize_t I = Regs.size() - 1; I >= 0; I--) {
|
||||
Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
|
||||
Register In = Regs[I].first;
|
||||
Register InSubreg = Regs[I].second;
|
||||
if (I == (ssize_t)Regs.size() - 1) { // first iteration
|
||||
BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out)
|
||||
.addReg(In, 0, InSubreg)
|
||||
.addReg(In, 0, InSubreg);
|
||||
} else {
|
||||
BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out)
|
||||
.addReg(In, 0, InSubreg)
|
||||
.addReg(In, 0, InSubreg);
|
||||
}
|
||||
Regs[I] = std::pair(Out, 0);
|
||||
}
|
||||
ShiftAmt--;
|
||||
}
|
||||
while (!ShiftLeft && ShiftAmt) {
|
||||
// Shift one to the right.
|
||||
for (size_t I = 0; I < Regs.size(); I++) {
|
||||
Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
|
||||
Register In = Regs[I].first;
|
||||
Register InSubreg = Regs[I].second;
|
||||
if (I == 0) {
|
||||
unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd;
|
||||
BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg);
|
||||
} else {
|
||||
BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg);
|
||||
}
|
||||
Regs[I] = std::pair(Out, 0);
|
||||
}
|
||||
ShiftAmt--;
|
||||
}
|
||||
|
||||
if (ShiftAmt != 0) {
|
||||
llvm_unreachable("don't know how to shift!"); // sanity check
|
||||
}
|
||||
}
|
||||
|
||||
// Do a wide (32-bit) shift.
|
||||
MachineBasicBlock *
|
||||
AVRTargetLowering::insertWideShift(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const {
|
||||
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
|
||||
const DebugLoc &dl = MI.getDebugLoc();
|
||||
|
||||
// How much to shift to the right (meaning: a negative number indicates a left
|
||||
// shift).
|
||||
int64_t ShiftAmt = MI.getOperand(4).getImm();
|
||||
ISD::NodeType Opc;
|
||||
switch (MI.getOpcode()) {
|
||||
case AVR::Lsl32:
|
||||
Opc = ISD::SHL;
|
||||
break;
|
||||
case AVR::Lsr32:
|
||||
Opc = ISD::SRL;
|
||||
break;
|
||||
case AVR::Asr32:
|
||||
Opc = ISD::SRA;
|
||||
break;
|
||||
}
|
||||
|
||||
// Read the input registers, with the most significant register at index 0.
|
||||
std::array<std::pair<Register, int>, 4> Registers = {
|
||||
std::pair(MI.getOperand(3).getReg(), AVR::sub_hi),
|
||||
std::pair(MI.getOperand(3).getReg(), AVR::sub_lo),
|
||||
std::pair(MI.getOperand(2).getReg(), AVR::sub_hi),
|
||||
std::pair(MI.getOperand(2).getReg(), AVR::sub_lo),
|
||||
};
|
||||
|
||||
// Do the shift. The registers are modified in-place.
|
||||
insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt);
|
||||
|
||||
// Combine the 8-bit registers into 16-bit register pairs.
|
||||
BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
|
||||
.addReg(Registers[0].first, 0, Registers[0].second)
|
||||
.addImm(AVR::sub_hi)
|
||||
.addReg(Registers[1].first, 0, Registers[1].second)
|
||||
.addImm(AVR::sub_lo);
|
||||
BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
|
||||
.addReg(Registers[2].first, 0, Registers[2].second)
|
||||
.addImm(AVR::sub_hi)
|
||||
.addReg(Registers[3].first, 0, Registers[3].second)
|
||||
.addImm(AVR::sub_lo);
|
||||
|
||||
// Remove the pseudo instruction.
|
||||
MI.eraseFromParent();
|
||||
return BB;
|
||||
}
|
||||
|
||||
static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
|
||||
if (I->getOpcode() == AVR::COPY) {
|
||||
Register SrcReg = I->getOperand(1).getReg();
|
||||
@@ -1901,6 +2066,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
case AVR::Asr8:
|
||||
case AVR::Asr16:
|
||||
return insertShift(MI, MBB);
|
||||
case AVR::Lsl32:
|
||||
case AVR::Lsr32:
|
||||
case AVR::Asr32:
|
||||
return insertWideShift(MI, MBB);
|
||||
case AVR::MULRdRr:
|
||||
case AVR::MULSRdRr:
|
||||
return insertMul(MI, MBB);
|
||||
|
||||
@@ -39,14 +39,17 @@ enum NodeType {
|
||||
LSLBN, ///< Byte logical shift left N bits.
|
||||
LSLWN, ///< Word logical shift left N bits.
|
||||
LSLHI, ///< Higher 8-bit of word logical shift left.
|
||||
LSLW, ///< Wide logical shift left.
|
||||
LSR, ///< Logical shift right.
|
||||
LSRBN, ///< Byte logical shift right N bits.
|
||||
LSRWN, ///< Word logical shift right N bits.
|
||||
LSRLO, ///< Lower 8-bit of word logical shift right.
|
||||
LSRW, ///< Wide logical shift right.
|
||||
ASR, ///< Arithmetic shift right.
|
||||
ASRBN, ///< Byte arithmetic shift right N bits.
|
||||
ASRWN, ///< Word arithmetic shift right N bits.
|
||||
ASRLO, ///< Lower 8-bit of word arithmetic shift right.
|
||||
ASRW, ///< Wide arithmetic shift right.
|
||||
ROR, ///< Bit rotate right.
|
||||
ROL, ///< Bit rotate left.
|
||||
LSLLOOP, ///< A loop of single logical shift left instructions.
|
||||
@@ -186,6 +189,8 @@ protected:
|
||||
|
||||
private:
|
||||
MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
|
||||
MachineBasicBlock *insertWideShift(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
|
||||
MachineBasicBlock *insertCopyZero(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
|
||||
@@ -69,6 +69,9 @@ def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
|
||||
def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>;
|
||||
def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>;
|
||||
def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>;
|
||||
def AVRlslw : SDNode<"AVRISD::LSLW", SDTIntShiftDOp>;
|
||||
def AVRlsrw : SDNode<"AVRISD::LSRW", SDTIntShiftDOp>;
|
||||
def AVRasrw : SDNode<"AVRISD::ASRW", SDTIntShiftDOp>;
|
||||
|
||||
// Pseudo shift nodes for non-constant shift amounts.
|
||||
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
|
||||
@@ -2337,6 +2340,11 @@ def Lsl16 : ShiftPseudo<(outs DREGS
|
||||
: $src, i8
|
||||
: $cnt))]>;
|
||||
|
||||
def Lsl32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
|
||||
(ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
|
||||
"# Lsl32 PSEUDO",
|
||||
[(set i16:$dstlo, i16:$dsthi, (AVRlslw i16:$srclo, i16:$srchi, i8:$cnt))]>;
|
||||
|
||||
def Lsr8 : ShiftPseudo<(outs GPR8
|
||||
: $dst),
|
||||
(ins GPR8
|
||||
@@ -2357,6 +2365,11 @@ def Lsr16 : ShiftPseudo<(outs DREGS
|
||||
: $src, i8
|
||||
: $cnt))]>;
|
||||
|
||||
def Lsr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
|
||||
(ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
|
||||
"# Lsr32 PSEUDO",
|
||||
[(set i16:$dstlo, i16:$dsthi, (AVRlsrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
|
||||
|
||||
def Rol8 : ShiftPseudo<(outs GPR8
|
||||
: $dst),
|
||||
(ins GPR8
|
||||
@@ -2417,6 +2430,11 @@ def Asr16 : ShiftPseudo<(outs DREGS
|
||||
: $src, i8
|
||||
: $cnt))]>;
|
||||
|
||||
def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
|
||||
(ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
|
||||
"# Asr32 PSEUDO",
|
||||
[(set i16:$dstlo, i16:$dsthi, (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
|
||||
|
||||
// lowered to a copy from the zero register.
|
||||
let usesCustomInserter=1 in
|
||||
def CopyZero : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;
|
||||
|
||||
129
llvm/test/CodeGen/AVR/shift32.ll
Normal file
129
llvm/test/CodeGen/AVR/shift32.ll
Normal file
@@ -0,0 +1,129 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=avr -mattr=movw -verify-machineinstrs | FileCheck %s
|
||||
|
||||
define i32 @shl_i32_1(i32 %a) {
|
||||
; CHECK-LABEL: shl_i32_1:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: lsl r22
|
||||
; CHECK-NEXT: rol r23
|
||||
; CHECK-NEXT: rol r24
|
||||
; CHECK-NEXT: rol r25
|
||||
; CHECK-NEXT: ret
|
||||
%res = shl i32 %a, 1
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
define i32 @shl_i32_2(i32 %a) {
|
||||
; CHECK-LABEL: shl_i32_2:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: lsl r22
|
||||
; CHECK-NEXT: rol r23
|
||||
; CHECK-NEXT: rol r24
|
||||
; CHECK-NEXT: rol r25
|
||||
; CHECK-NEXT: lsl r22
|
||||
; CHECK-NEXT: rol r23
|
||||
; CHECK-NEXT: rol r24
|
||||
; CHECK-NEXT: rol r25
|
||||
; CHECK-NEXT: ret
|
||||
%res = shl i32 %a, 2
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
; This is a special case: this shift is performed directly inside SelectionDAG
|
||||
; instead of as a custom lowering like the other shift operations.
|
||||
define i32 @shl_i32_16(i32 %a) {
|
||||
; CHECK-LABEL: shl_i32_16:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: movw r24, r22
|
||||
; CHECK-NEXT: ldi r22, 0
|
||||
; CHECK-NEXT: ldi r23, 0
|
||||
; CHECK-NEXT: ret
|
||||
%res = shl i32 %a, 16
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
; Combined with the register allocator, shift instructions can sometimes be
|
||||
; optimized away entirely. The least significant registers are simply stored
|
||||
; directly instead of moving them first.
|
||||
define void @shl_i32_16_ptr(i32 %a, ptr %ptr) {
|
||||
; CHECK-LABEL: shl_i32_16_ptr:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: movw r30, r20
|
||||
; CHECK-NEXT: std Z+2, r22
|
||||
; CHECK-NEXT: std Z+3, r23
|
||||
; CHECK-NEXT: ldi r24, 0
|
||||
; CHECK-NEXT: ldi r25, 0
|
||||
; CHECK-NEXT: st Z, r24
|
||||
; CHECK-NEXT: std Z+1, r25
|
||||
; CHECK-NEXT: ret
|
||||
%res = shl i32 %a, 16
|
||||
store i32 %res, ptr %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @lshr_i32_1(i32 %a) {
|
||||
; CHECK-LABEL: lshr_i32_1:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: lsr r25
|
||||
; CHECK-NEXT: ror r24
|
||||
; CHECK-NEXT: ror r23
|
||||
; CHECK-NEXT: ror r22
|
||||
; CHECK-NEXT: ret
|
||||
%res = lshr i32 %a, 1
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
define i32 @lshr_i32_2(i32 %a) {
|
||||
; CHECK-LABEL: lshr_i32_2:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: lsr r25
|
||||
; CHECK-NEXT: ror r24
|
||||
; CHECK-NEXT: ror r23
|
||||
; CHECK-NEXT: ror r22
|
||||
; CHECK-NEXT: lsr r25
|
||||
; CHECK-NEXT: ror r24
|
||||
; CHECK-NEXT: ror r23
|
||||
; CHECK-NEXT: ror r22
|
||||
; CHECK-NEXT: ret
|
||||
%res = lshr i32 %a, 2
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
define i32 @lshr_i32_16(i32 %a) {
|
||||
; CHECK-LABEL: lshr_i32_16:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: movw r22, r24
|
||||
; CHECK-NEXT: ldi r24, 0
|
||||
; CHECK-NEXT: ldi r25, 0
|
||||
; CHECK-NEXT: ret
|
||||
%res = lshr i32 %a, 16
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
define i32 @ashr_i32_1(i32 %a) {
|
||||
; CHECK-LABEL: ashr_i32_1:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: asr r25
|
||||
; CHECK-NEXT: ror r24
|
||||
; CHECK-NEXT: ror r23
|
||||
; CHECK-NEXT: ror r22
|
||||
; CHECK-NEXT: ret
|
||||
%res = ashr i32 %a, 1
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
define i32 @ashr_i32_2(i32 %a) {
|
||||
; CHECK-LABEL: ashr_i32_2:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: asr r25
|
||||
; CHECK-NEXT: ror r24
|
||||
; CHECK-NEXT: ror r23
|
||||
; CHECK-NEXT: ror r22
|
||||
; CHECK-NEXT: asr r25
|
||||
; CHECK-NEXT: ror r24
|
||||
; CHECK-NEXT: ror r23
|
||||
; CHECK-NEXT: ror r22
|
||||
; CHECK-NEXT: ret
|
||||
%res = ashr i32 %a, 2
|
||||
ret i32 %res
|
||||
}
|
||||
Reference in New Issue
Block a user