[AMDGPU] Provide control to force VGPR MFMA form (#148079)

This gives the user an override to force selection of the VGPR form of MFMA.
Eventually we will drop this in favor of the compiler making better
decisions, but it provides a mechanism for users to address the cases
where MayNeedAGPRs favors the AGPR form and performance is degraded due
to poor register allocation (RA).
This commit is contained in:
Jeffrey Byrnes
2025-07-18 13:53:17 -07:00
committed by GitHub
parent abdd4536ce
commit 695660cdfd
3 changed files with 3994 additions and 2 deletions

View File

@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
using namespace llvm;
// Command-line override (-amdgpu-mfma-vgpr-form) that forces the VGPR form
// for Opc and Dest of MFMA instead of relying on compiler heuristics.
//
// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
// where it is better to produce the VGPR form (e.g. if there are VGPR users
// of the MFMA result).
//
// The option is hidden and is initialized to false; per its description, when
// it is not specified the choice is left to the compiler heuristics.
cl::opt<bool> MFMAVGPRForm(
"amdgpu-mfma-vgpr-form", cl::Hidden,
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
cl::init(false));
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
MayNeedAGPRs = ST.hasMAIInsts();
if (ST.hasGFX90AInsts() &&
MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.

View File

@@ -0,0 +1,76 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
; Calls the 16x16x32 f16 MFMA intrinsic from a function that has no attribute
; group. Per the autogenerated checks below, the --amdgpu-mfma-vgpr-form=0 run
; keeps the accumulator and result in AGPRs (with v_accvgpr_write/read copies
; around the MFMA), while the --amdgpu-mfma-vgpr-form=1 run emits the MFMA
; directly on VGPRs with no copies.
define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
; HEURRC-LABEL: default:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
; HEURRC-NEXT: s_setpc_b64 s[30:31]
;
; VGPRRC-LABEL: default:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
; Calls the 16x16x32 f16 MFMA intrinsic from a function carrying attribute
; group #0 ("amdgpu-agpr-alloc"="32,256"). Per the autogenerated checks below,
; the --amdgpu-mfma-vgpr-form=0 run selects the AGPR form of the MFMA, while
; the --amdgpu-mfma-vgpr-form=1 run still emits the VGPR form, i.e. the flag
; takes effect even with this attribute present.
define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
; HEURRC-LABEL: request_agpr:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
; HEURRC-NEXT: s_setpc_b64 s[30:31]
;
; VGPRRC-LABEL: request_agpr:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
; Calls the 16x16x32 f16 MFMA intrinsic from a function carrying attribute
; group #1 ("amdgpu-agpr-alloc"="0,0"). Per the autogenerated checks below,
; both the --amdgpu-mfma-vgpr-form=0 and --amdgpu-mfma-vgpr-form=1 runs emit
; the VGPR form of the MFMA, so the flag makes no difference here.
define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 {
; HEURRC-LABEL: request_no_agpr:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_setpc_b64 s[30:31]
;
; VGPRRC-LABEL: request_no_agpr:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
attributes #0 = { "amdgpu-agpr-alloc"="32,256" }
attributes #1 = { "amdgpu-agpr-alloc"="0,0" }

File diff suppressed because it is too large Load Diff