[AMDGPU][GlobalISel] Add register bank legalization for G_FADD (#163407)

This patch adds register bank legalization support for the G_FADD opcode (uniform and divergent S16, S32, S64, V2S16 and V2S32) in the AMDGPU GlobalISel pipeline.
It introduces the new UniInVgprS64 register bank mapping for uniform 64-bit results computed in VGPRs, and a ScalarizeToS16 lowering for packed 16-bit operands on subtargets with SALU float instructions.
It also adds combine logic for the ReadAnyLane + Trunc + AnyExt pattern: getReadAnyLaneSrc now uses MIPatternMatch matchers (replacing the opcode-based tryMatch) and can look through a G_ANYEXT of a G_TRUNC of a G_AMDGPU_READANYLANE.
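For illustration, the chain the new combine looks through has roughly this shape (informal notation, matching the in-code comments; value names are taken from those comments):

  RALSrc   = ...                            // 32-bit value in a vgpr
  TruncSrc = G_AMDGPU_READANYLANE RALSrc    // uniform sgpr copy
  AextSrc  = G_TRUNC TruncSrc               // s16
  Src      = G_ANYEXT AextSrc               // s32

getReadAnyLaneSrc(Src) now returns RALSrc, so a consumer that needs the value in a vgpr can reuse RALSrc instead of going through the sgpr round trip.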

---------

Co-authored-by: Abhinav Garg <abhigarg@amd.com>
Abhinav Garg, 2025-10-31 16:45:40 +05:30, committed by GitHub
parent 96c6fd36c1, commit 1057c63b24
6 changed files with 225 additions and 3 deletions


@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
using namespace llvm;
using namespace AMDGPU;
using namespace llvm::MIPatternMatch;

namespace {

// AMDGPU-specific pattern matchers
template <typename SrcTy>
inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
m_GAMDGPUReadAnyLane(const SrcTy &Src) {
  return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
}

class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
  static char ID;
@@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
  // Src = G_AMDGPU_READANYLANE RALSrc
  Register RALSrc;
  if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
    return RALSrc;

  // TruncSrc = G_AMDGPU_READANYLANE RALSrc
  // AextSrc = G_TRUNC TruncSrc
  // Src = G_ANYEXT AextSrc
  if (mi_match(Src, MRI,
               m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
    return RALSrc;
  }

  // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
  // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
  // HiSgpr = G_AMDGPU_READANYLANE HiVgpr


@@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
  auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
  auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
  auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32);
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
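For a uniform G_FADD on <2 x s16> with SALU float available, this builds roughly the following (informal sketch; it assumes unpackAExt returns each operand's low and high 16-bit halves as 32-bit sgpr values):

  Op1Lo32, Op1Hi32 = unpackAExt Op1
  Op2Lo32, Op2Hi32 = unpackAExt Op2
  Lo  = G_FADD (G_TRUNC Op1Lo32), (G_TRUNC Op2Lo32)   // sgpr s16
  Hi  = G_FADD (G_TRUNC Op1Hi32), (G_TRUNC Op2Hi32)   // sgpr s16
  Dst = G_BUILD_VECTOR Lo, Hi                          // via buildMergeLikeInstr

which the GFX12 checks in the new test below select to two s_lshr_b32, two s_add_f16 and an s_pack_ll_b32_b16.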
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
@@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
return lowerUnpackBitShift(MI);
case UnpackMinMax:
return lowerUnpackMinMax(MI);
case ScalarizeToS16:
return lowerSplitTo16(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
@@ -849,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
@@ -972,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
@@ -1104,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));


@@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
static constexpr LLT P6 = LLT::pointer(6, 32);
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
@@ -121,6 +122,7 @@ private:
void lowerV_BFE(MachineInstr &MI);
void lowerS_BFE(MachineInstr &MI);
void lowerSplitTo32(MachineInstr &MI);
void lowerSplitTo16(MachineInstr &MI);
void lowerSplitTo32Select(MachineInstr &MI);
void lowerSplitTo32SExtInReg(MachineInstr &MI);
void lowerUnpackMinMax(MachineInstr &MI);


@@ -918,9 +918,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
    .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
    .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
    .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
    .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
    .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
    .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
    .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
    .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
    .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
    .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
         hasSALUFloat)
    .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
    .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
    .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});

addRulesForGOpcs({G_FPTOUI})
    .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
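As an example of how to read the new rules, the uniform S64 entry {{UniInVgprS64}, {Vgpr64, Vgpr64}} places both operands in vgprs, performs the add on the VALU (the SALU float instructions have no 64-bit add), and marks the uniform result as produced in a vgpr so it can be copied back to sgprs with readanylane. Roughly (informal sketch, names invented; in practice the 64-bit read-any-lane is split into two 32-bit ones):

  VgprA   = COPY A                         // A, B: uniform sgpr(s64) inputs
  VgprB   = COPY B
  VgprSum = G_FADD VgprA, VgprB            // selected to v_add_f64
  Sum     = G_AMDGPU_READANYLANE VgprSum   // uniform result back in sgprs

In the fadd_s64_uniform checks below the read-any-lanes do not appear in the final code, since the result is consumed (stored) from vgprs.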


@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
V4S32,
UniV2S16,
UniV2S32,
DivV2S16,
DivV2S32,
// B types
B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
UniInVcc,
UniInVgprS16,
UniInVgprS32,
UniInVgprS64,
UniInVgprV2S16,
UniInVgprV2S32,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
@@ -217,6 +221,7 @@ enum LoweringMethodID {
V_BFE,
VgprToVccCopy,
SplitTo32,
ScalarizeToS16,
SplitTo32Select,
SplitTo32SExtInReg,
Ext32To64,


@@ -0,0 +1,165 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
; GFX11-FAKE16-LABEL: fadd_s16_uniform:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: fadd_s16_uniform:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_s16_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_f16 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd half %a, %b
ret half %fadd
}
define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
; GFX11-FAKE16-LABEL: fadd_s16_div:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: fadd_s16_div:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-FAKE16-LABEL: fadd_s16_div:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
; GFX12-FAKE16-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: fadd_s16_div:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX12-TRUE16-NEXT: ; return to shader part epilog
%fadd = fadd half %a, %b
ret half %fadd
}
define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
; GFX11-LABEL: fadd_s32_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_s32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_f32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd float %a, %b
ret float %fadd
}
define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
; GCN-LABEL: fadd_s32_div:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%fadd = fadd float %a, %b
ret float %fadd
}
define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
; GFX11-LABEL: fadd_s64_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fadd_s64_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3]
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
%fadd = fadd double %a, %b
store double %fadd, ptr addrspace(1) %ptr
ret void
}
define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
; GFX11-LABEL: fadd_s64_div:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fadd_s64_div:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX12-NEXT: s_endpgm
%fadd = fadd double %a, %b
store double %fadd, ptr addrspace(1) %ptr
ret void
}
define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
; GFX11-LABEL: fadd_v2s16_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_v2s16_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
; GFX12-NEXT: s_add_f16 s0, s0, s1
; GFX12-NEXT: s_add_f16 s1, s2, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd <2 x half> %a, %b
ret <2 x half> %fadd
}
define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
; GCN-LABEL: fadd_v2s16_div:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%fadd = fadd <2 x half> %a, %b
ret <2 x half> %fadd
}
define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
; GFX11-LABEL: fadd_v2s32_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_v2s32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_f32 s0, s0, s2
; GFX12-NEXT: s_add_f32 s1, s1, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd <2 x float> %a, %b
ret <2 x float> %fadd
}
define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
; GCN-LABEL: fadd_v2s32_div:
; GCN: ; %bb.0:
; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
; GCN-NEXT: ; return to shader part epilog
%fadd = fadd <2 x float> %a, %b
ret <2 x float> %fadd
}