mirror of
https://github.com/intel/llvm.git
synced 2026-01-14 03:50:17 +08:00
[AMDGPU][GlobalISel] Add register bank legalization for G_FADD (#163407)
This patch adds register bank legalization support for the G_FADD opcode in the AMDGPU GlobalISel pipeline. It adds a new register bank mapping ID, UniInVgprS64, and also adds combine logic for the ReadAnyLane + Trunc + AnyExt pattern. --------- Co-authored-by: Abhinav Garg <abhigarg@amd.com>
This commit is contained in:
@@ -24,6 +24,7 @@
|
||||
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
|
||||
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
|
||||
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
|
||||
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
|
||||
#include "llvm/CodeGen/GlobalISel/Utils.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
|
||||
@@ -34,9 +35,17 @@
|
||||
|
||||
using namespace llvm;
|
||||
using namespace AMDGPU;
|
||||
using namespace llvm::MIPatternMatch;
|
||||
|
||||
namespace {
|
||||
|
||||
// AMDGPU-specific pattern matchers
|
||||
template <typename SrcTy>
|
||||
inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
|
||||
m_GAMDGPUReadAnyLane(const SrcTy &Src) {
|
||||
return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
|
||||
}
|
||||
|
||||
class AMDGPURegBankLegalize : public MachineFunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
@@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
|
||||
|
||||
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
|
||||
// Src = G_AMDGPU_READANYLANE RALSrc
|
||||
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
|
||||
if (RAL)
|
||||
Register RALSrc;
|
||||
if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
|
||||
return RALSrc;
|
||||
|
||||
// TruncSrc = G_AMDGPU_READANYLANE RALSrc
|
||||
// AextSrc = G_TRUNC TruncSrc
|
||||
// Src = G_ANYEXT AextSrc
|
||||
if (mi_match(Src, MRI,
|
||||
m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
|
||||
return RALSrc;
|
||||
}
|
||||
|
||||
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
|
||||
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
|
||||
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr
|
||||
|
||||
@@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
|
||||
MI.eraseFromParent();
|
||||
}
|
||||
|
||||
// Lowers a two-source instruction producing V2S16 into two S16 instructions
// of the same opcode (one per half) on the SGPR bank, merges the two results
// back into the original destination register and erases the original
// instruction.
void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  assert(MRI.getType(DstReg) == V2S16);

  // Split each packed source into its low/high parts first.
  // NOTE(review): the parts are presumably 32-bit any-extended values, going
  // by the helper's name - confirm against unpackAExt.
  auto [LhsLo32, LhsHi32] = unpackAExt(MI.getOperand(1).getReg());
  auto [RhsLo32, RhsHi32] = unpackAExt(MI.getOperand(2).getReg());

  // Both halves reuse the original opcode and instruction flags.
  const unsigned Opcode = MI.getOpcode();
  const auto MIFlags = MI.getFlags();

  // Narrow every part to a 16-bit SGPR value. The truncs are emitted in the
  // order lhs-lo, lhs-hi, rhs-lo, rhs-hi.
  auto TruncToS16 = [&](const auto &Wide) {
    return B.buildTrunc(SgprRB_S16, Wide);
  };
  auto LhsLo = TruncToS16(LhsLo32);
  auto LhsHi = TruncToS16(LhsHi32);
  auto RhsLo = TruncToS16(RhsLo32);
  auto RhsHi = TruncToS16(RhsHi32);

  // Apply the operation per half, then rebuild the vector result.
  auto LoResult = B.buildInstr(Opcode, {SgprRB_S16}, {LhsLo, RhsLo}, MIFlags);
  auto HiResult = B.buildInstr(Opcode, {SgprRB_S16}, {LhsHi, RhsHi}, MIFlags);
  B.buildMergeLikeInstr(DstReg, {LoResult, HiResult});

  MI.eraseFromParent();
}
|
||||
|
||||
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
|
||||
Register Dst = MI.getOperand(0).getReg();
|
||||
LLT DstTy = MRI.getType(Dst);
|
||||
@@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
|
||||
return lowerUnpackBitShift(MI);
|
||||
case UnpackMinMax:
|
||||
return lowerUnpackMinMax(MI);
|
||||
case ScalarizeToS16:
|
||||
return lowerSplitTo16(MI);
|
||||
case Ext32To64: {
|
||||
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
|
||||
MachineInstrBuilder Hi;
|
||||
@@ -849,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
|
||||
return LLT::scalar(32);
|
||||
case Sgpr64:
|
||||
case Vgpr64:
|
||||
case UniInVgprS64:
|
||||
return LLT::scalar(64);
|
||||
case Sgpr128:
|
||||
case Vgpr128:
|
||||
@@ -972,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
|
||||
case UniInVcc:
|
||||
case UniInVgprS16:
|
||||
case UniInVgprS32:
|
||||
case UniInVgprS64:
|
||||
case UniInVgprV2S16:
|
||||
case UniInVgprV4S32:
|
||||
case UniInVgprB32:
|
||||
@@ -1104,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst(
|
||||
break;
|
||||
}
|
||||
case UniInVgprS32:
|
||||
case UniInVgprS64:
|
||||
case UniInVgprV2S16:
|
||||
case UniInVgprV4S32: {
|
||||
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
|
||||
|
||||
@@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
|
||||
static constexpr LLT P6 = LLT::pointer(6, 32);
|
||||
|
||||
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
|
||||
MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
|
||||
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
|
||||
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
|
||||
|
||||
@@ -121,6 +122,7 @@ private:
|
||||
void lowerV_BFE(MachineInstr &MI);
|
||||
void lowerS_BFE(MachineInstr &MI);
|
||||
void lowerSplitTo32(MachineInstr &MI);
|
||||
void lowerSplitTo16(MachineInstr &MI);
|
||||
void lowerSplitTo32Select(MachineInstr &MI);
|
||||
void lowerSplitTo32SExtInReg(MachineInstr &MI);
|
||||
void lowerUnpackMinMax(MachineInstr &MI);
|
||||
|
||||
@@ -918,9 +918,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
|
||||
bool hasSALUFloat = ST->hasSALUFloatInsts();
|
||||
|
||||
addRulesForGOpcs({G_FADD}, Standard)
|
||||
.Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
|
||||
.Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
|
||||
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
|
||||
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
|
||||
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
|
||||
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
|
||||
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
|
||||
.Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
|
||||
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
|
||||
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
|
||||
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
|
||||
hasSALUFloat)
|
||||
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
|
||||
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
|
||||
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
|
||||
|
||||
addRulesForGOpcs({G_FPTOUI})
|
||||
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
|
||||
|
||||
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
|
||||
V4S32,
|
||||
|
||||
UniV2S16,
|
||||
UniV2S32,
|
||||
|
||||
DivV2S16,
|
||||
DivV2S32,
|
||||
|
||||
// B types
|
||||
B32,
|
||||
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
|
||||
UniInVcc,
|
||||
UniInVgprS16,
|
||||
UniInVgprS32,
|
||||
UniInVgprS64,
|
||||
UniInVgprV2S16,
|
||||
UniInVgprV2S32,
|
||||
UniInVgprV4S32,
|
||||
UniInVgprB32,
|
||||
UniInVgprB64,
|
||||
@@ -217,6 +221,7 @@ enum LoweringMethodID {
|
||||
V_BFE,
|
||||
VgprToVccCopy,
|
||||
SplitTo32,
|
||||
ScalarizeToS16,
|
||||
SplitTo32Select,
|
||||
SplitTo32SExtInReg,
|
||||
Ext32To64,
|
||||
|
||||
165
llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
Normal file
165
llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
Normal file
@@ -0,0 +1,165 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
|
||||
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
|
||||
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
|
||||
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
|
||||
|
||||
; Half fadd with both operands passed inreg. The gfx1100 runs are expected to
; emit a VALU v_add_f16 reading s0/s1; the gfx1200 runs are expected to emit a
; scalar s_add_f16 followed by a copy of the result into v0.
define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
|
||||
; GFX11-FAKE16-LABEL: fadd_s16_uniform:
|
||||
; GFX11-FAKE16: ; %bb.0:
|
||||
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
|
||||
; GFX11-FAKE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-TRUE16-LABEL: fadd_s16_uniform:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
|
||||
; GFX11-TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX12-LABEL: fadd_s16_uniform:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: s_add_f16 s0, s0, s1
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd half %a, %b
|
||||
ret half %fadd
|
||||
}
|
||||
|
||||
; Half fadd with ordinary (non-inreg) operands. Every run line is expected to
; emit a single v_add_f16 on v0/v1; the +real-true16 runs address the .l
; halves of the registers.
define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
|
||||
; GFX11-FAKE16-LABEL: fadd_s16_div:
|
||||
; GFX11-FAKE16: ; %bb.0:
|
||||
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
|
||||
; GFX11-FAKE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-TRUE16-LABEL: fadd_s16_div:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX12-FAKE16-LABEL: fadd_s16_div:
|
||||
; GFX12-FAKE16: ; %bb.0:
|
||||
; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
|
||||
; GFX12-FAKE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX12-TRUE16-LABEL: fadd_s16_div:
|
||||
; GFX12-TRUE16: ; %bb.0:
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd half %a, %b
|
||||
ret half %fadd
|
||||
}
|
||||
|
||||
; Float fadd with both operands passed inreg. The gfx1100 runs are expected to
; emit v_add_f32 reading s0/s1; the gfx1200 runs are expected to emit a scalar
; s_add_f32 followed by a copy of the result into v0.
define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
|
||||
; GFX11-LABEL: fadd_s32_uniform:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX12-LABEL: fadd_s32_uniform:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: s_add_f32 s0, s0, s1
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd float %a, %b
|
||||
ret float %fadd
|
||||
}
|
||||
|
||||
; Float fadd with ordinary (non-inreg) operands. All run lines share one set of
; checks, which expect a single v_add_f32 on v0/v1.
define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
|
||||
; GCN-LABEL: fadd_s32_div:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd float %a, %b
|
||||
ret float %fadd
|
||||
}
|
||||
|
||||
; Double fadd with both operands passed inreg; the result is stored through
; %ptr. Both the gfx1100 and gfx1200 runs are expected to emit a VALU v_add_f64
; that reads the SGPR pairs directly, followed by a global_store_b64.
define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
|
||||
; GFX11-LABEL: fadd_s64_uniform:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3]
|
||||
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: fadd_s64_uniform:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3]
|
||||
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
%fadd = fadd double %a, %b
|
||||
store double %fadd, ptr addrspace(1) %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; Double fadd with ordinary (non-inreg) operands; the result is stored through
; %ptr. Both the gfx1100 and gfx1200 runs are expected to emit v_add_f64 on the
; VGPR pairs, followed by a global_store_b64.
define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
|
||||
; GFX11-LABEL: fadd_s64_div:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
|
||||
; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
|
||||
; GFX11-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-LABEL: fadd_s64_div:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
|
||||
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
|
||||
; GFX12-NEXT: s_endpgm
|
||||
%fadd = fadd double %a, %b
|
||||
store double %fadd, ptr addrspace(1) %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; <2 x half> fadd with both operands passed inreg. The gfx1100 runs are expected
; to emit one packed v_pk_add_f16 reading s0/s1. The gfx1200 runs are expected
; to shift out the high halves with s_lshr_b32, add each half with s_add_f16,
; repack with s_pack_ll_b32_b16 and copy the result into v0.
define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
|
||||
; GFX11-LABEL: fadd_v2s16_uniform:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX12-LABEL: fadd_v2s16_uniform:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
|
||||
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX12-NEXT: s_add_f16 s0, s0, s1
|
||||
; GFX12-NEXT: s_add_f16 s1, s2, s3
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd <2 x half> %a, %b
|
||||
ret <2 x half> %fadd
|
||||
}
|
||||
|
||||
; <2 x half> fadd with ordinary (non-inreg) operands. All run lines share one
; set of checks, which expect a single packed v_pk_add_f16 on v0/v1.
define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
|
||||
; GCN-LABEL: fadd_v2s16_div:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd <2 x half> %a, %b
|
||||
ret <2 x half> %fadd
|
||||
}
|
||||
|
||||
; <2 x float> fadd with both operands passed inreg. The gfx1100 runs are
; expected to emit one v_add_f32 per element reading SGPRs; the gfx1200 runs
; are expected to emit one s_add_f32 per element and then copy both results
; into v0/v1 with a v_dual_mov_b32.
define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
|
||||
; GFX11-LABEL: fadd_v2s32_uniform:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
|
||||
; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX12-LABEL: fadd_v2s32_uniform:
|
||||
; GFX12: ; %bb.0:
|
||||
; GFX12-NEXT: s_add_f32 s0, s0, s2
|
||||
; GFX12-NEXT: s_add_f32 s1, s1, s3
|
||||
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
|
||||
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
||||
; GFX12-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd <2 x float> %a, %b
|
||||
ret <2 x float> %fadd
|
||||
}
|
||||
|
||||
; <2 x float> fadd with ordinary (non-inreg) operands. All run lines share one
; set of checks, which expect the two element adds to be emitted as a single
; dual-issue v_dual_add_f32 pair.
define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
|
||||
; GCN-LABEL: fadd_v2s32_div:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
%fadd = fadd <2 x float> %a, %b
|
||||
ret <2 x float> %fadd
|
||||
}
|
||||
Reference in New Issue
Block a user