[AMDGPU] Provide control to force VGPR MFMA form (#148079)

This gives an override to the user to force select VGPR form of MFMA. Eventually we will drop this in favor of compiler making better decisions, but this provides a mechanism for users to address the cases where MayNeedAGPRs favors the AGPR form and performance is degraded due to poor RA.
2026-01-23 16:06:39 +08:00 · 2025-07-18 13:53:17 -07:00
parent abdd4536ce
commit 695660cdfd
3 changed files with 3994 additions and 2 deletions
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };

 using namespace llvm;

+// TODO -- delete this flag once we have more robust mechanisms to allocate the
+// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
+// where it is better to produce the VGPR form (e.g. if there are VGPR users
+// of the MFMA result).
+cl::opt<bool> MFMAVGPRForm(
+    "amdgpu-mfma-vgpr-form", cl::Hidden,
+    cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
+             "unspecified, default to compiler heuristics"),
+    cl::init(false));
+
 const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

-  MayNeedAGPRs = ST.hasMAIInsts();
-  if (ST.hasGFX90AInsts() &&
+  MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
+  if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
      ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
      !mayUseAGPRs(F))
    MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
+; HEURRC-LABEL: default:
+; HEURRC:       ; %bb.0:
+; HEURRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT:    v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT:    v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT:    v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT:    v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT:    s_nop 1
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT:    s_nop 7
+; HEURRC-NEXT:    v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT:    v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT:    v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT:    v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT:    s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: default:
+; VGPRRC:       ; %bb.0:
+; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
+; HEURRC-LABEL: request_agpr:
+; HEURRC:       ; %bb.0:
+; HEURRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT:    v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT:    v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT:    v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT:    v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT:    s_nop 1
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT:    s_nop 7
+; HEURRC-NEXT:    v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT:    v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT:    v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT:    v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT:    s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: request_agpr:
+; VGPRRC:       ; %bb.0:
+; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 {
+; HEURRC-LABEL: request_no_agpr:
+; HEURRC:       ; %bb.0:
+; HEURRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT:    s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: request_no_agpr:
+; VGPRRC:       ; %bb.0:
+; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+attributes #0 = { "amdgpu-agpr-alloc"="32,256" }
+attributes #1 = { "amdgpu-agpr-alloc"="0,0" }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll