mirror of
https://github.com/intel/llvm.git
synced 2026-01-23 16:06:39 +08:00
[AMDGPU] Provide control to force VGPR MFMA form (#148079)
This gives users an override to force selection of the VGPR form of MFMA. Eventually we will drop this flag in favor of the compiler making better decisions, but for now it provides a mechanism for users to address cases where MayNeedAGPRs favors the AGPR form and performance is degraded due to poor register allocation (RA).
This commit is contained in:
@@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// TODO -- delete this flag once we have more robust mechanisms to allocate the
|
||||
// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
|
||||
// where it is better to produce the VGPR form (e.g. if there are VGPR users
|
||||
// of the MFMA result).
|
||||
cl::opt<bool> MFMAVGPRForm(
|
||||
"amdgpu-mfma-vgpr-form", cl::Hidden,
|
||||
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
|
||||
"unspecified, default to compiler heuristics"),
|
||||
cl::init(false));
|
||||
|
||||
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
|
||||
const SITargetLowering *TLI = STI->getTargetLowering();
|
||||
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
|
||||
@@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
|
||||
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
|
||||
}
|
||||
|
||||
MayNeedAGPRs = ST.hasMAIInsts();
|
||||
if (ST.hasGFX90AInsts() &&
|
||||
MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
|
||||
if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
|
||||
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
|
||||
!mayUseAGPRs(F))
|
||||
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
|
||||
|
||||
76
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
Normal file
76
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
Normal file
@@ -0,0 +1,76 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
|
||||
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
|
||||
|
||||
define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
|
||||
; HEURRC-LABEL: default:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
|
||||
; HEURRC-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VGPRRC-LABEL: default:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
|
||||
ret <4 x float> %result
|
||||
}
|
||||
|
||||
define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
|
||||
; HEURRC-LABEL: request_agpr:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
|
||||
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
|
||||
; HEURRC-NEXT: s_nop 1
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
|
||||
; HEURRC-NEXT: s_nop 7
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
|
||||
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
|
||||
; HEURRC-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VGPRRC-LABEL: request_agpr:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
|
||||
ret <4 x float> %result
|
||||
}
|
||||
|
||||
define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 {
|
||||
; HEURRC-LABEL: request_no_agpr:
|
||||
; HEURRC: ; %bb.0:
|
||||
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; HEURRC-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VGPRRC-LABEL: request_no_agpr:
|
||||
; VGPRRC: ; %bb.0:
|
||||
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
|
||||
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
|
||||
ret <4 x float> %result
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-agpr-alloc"="32,256" }
|
||||
attributes #1 = { "amdgpu-agpr-alloc"="0,0" }
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user