[AMDGPU] Add iglp_opt(3) for simple mfma / exp interleaving (#117269)
Adds a minimal iglp_opt to do simple exp / mfma interleaving.
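For context, here is a minimal LLVM IR sketch (not part of the commit) of how a kernel opts into the new strategy. The intrinsic names and signatures are taken from the new test added below; the kernel name @simple_exp_mfma and its reduced body (one MFMA plus one exp) are hypothetical.

; Minimal usage sketch, assuming an MFMA-capable target such as gfx942 (as in the new test).
; iglp_opt(3) asks the scheduler to interleave the TRANS (exp) and MFMA instructions in this
; scheduling region, without also interleaving their predecessors.
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare float @llvm.exp.f32(float)
declare void @llvm.amdgcn.iglp.opt(i32)

define amdgpu_kernel void @simple_exp_mfma(ptr addrspace(1) %out0, ptr addrspace(1) %out1, float %x, <4 x float> %acc) {
  %mai = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %acc, i32 0, i32 0, i32 0)
  %exp = call float @llvm.exp.f32(float %x)
  store <4 x float> %mai, ptr addrspace(1) %out0
  store float %exp, ptr addrspace(1) %out1
  tail call void @llvm.amdgcn.iglp.opt(i32 3)
  ret void
}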
@@ -1376,6 +1376,8 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
0. Interleave DS and MFMA instructions for small GEMM kernels.
1. Interleave DS and MFMA instructions for single wave small GEMM kernels.
2. Interleave TRANS and MFMA instructions, as well as their VALU and DS predecessors, for attention kernels.
3. Interleave TRANS and MFMA instructions, with no predecessor interleaving, for attention kernels.

Only one iglp_opt intrinsic may be used in a scheduling region. The iglp_opt intrinsic
cannot be combined with sched_barrier or sched_group_barrier.
@@ -832,7 +832,8 @@ void PipelineSolver::solve() {
 enum IGLPStrategyID : int {
   MFMASmallGemmOptID = 0,
   MFMASmallGemmSingleWaveOptID = 1,
-  MFMAExpInterleave = 2
+  MFMAExpInterleaveID = 2,
+  MFMAExpSimpleInterleaveID = 3
 };

 // Implement a IGLP scheduling strategy.
@@ -1845,6 +1846,48 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
  return true;
}

class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = true;
  }
};

bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}

class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
private:
  // Whether the DS_READ is a predecessor of first four MFMA in region
@@ -2308,8 +2351,10 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
   case MFMASmallGemmSingleWaveOptID:
     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
-  case MFMAExpInterleave:
+  case MFMAExpInterleaveID:
     return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
+  case MFMAExpSimpleInterleaveID:
+    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
   }

   llvm_unreachable("Unknown IGLPStrategyID");
@@ -1146,6 +1146,7 @@
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm

attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
!0 = !{i64 2862105}
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll (new file, 170 lines)
@@ -0,0 +1,170 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefix=GCN %s

define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspace(1) %out1, float %in0, <4 x float> %in1) {
; GCN-LABEL: MFMAExpInterleave:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s6, s[4:5], 0x10
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GCN-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: s_mov_b32 s7, 0x42b17218
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, s6, v1
; GCN-NEXT: v_rndne_f32_e32 v3, v2
; GCN-NEXT: v_sub_f32_e32 v4, v2, v3
; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2
; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: v_add_f32_e32 v1, v4, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_exp_f32_e32 v1, v1
; GCN-NEXT: s_mov_b32 s0, 0x3fb8aa3b
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: ; iglp_opt mask(0x00000003)
; GCN-NEXT: v_ldexp_f32 v1, v1, v2
; GCN-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2
; GCN-NEXT: v_mov_b32_e32 v2, 0x42b17218
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v2
; GCN-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GCN-NEXT: s_mov_b32 s6, 0xc2ce8ed0
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT: v_rndne_f32_e32 v5, v3
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
; GCN-NEXT: v_exp_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT: v_rndne_f32_e32 v5, v3
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
; GCN-NEXT: v_exp_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT: v_rndne_f32_e32 v5, v3
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
; GCN-NEXT: v_exp_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT: v_rndne_f32_e32 v5, v3
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
; GCN-NEXT: v_exp_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT: v_rndne_f32_e32 v5, v3
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
; GCN-NEXT: v_exp_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT: v_rndne_f32_e32 v5, v3
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
; GCN-NEXT: v_exp_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT: v_ldexp_f32 v0, v3, v4
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GCN-NEXT: v_fma_f32 v3, v0, s0, -v1
; GCN-NEXT: v_rndne_f32_e32 v4, v1
; GCN-NEXT: v_fmac_f32_e32 v3, 0x32a5705f, v0
; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
; GCN-NEXT: v_add_f32_e32 v1, v1, v3
; GCN-NEXT: v_exp_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v4
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_ldexp_f32 v1, v1, v3
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: global_store_dword v4, v0, s[2:3]
; GCN-NEXT: s_endpgm
  %mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0)
  %mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0)
  %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai1, i32 0, i32 0, i32 0)
  %mai3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai2, i32 0, i32 0, i32 0)
  %mai4 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai3, i32 0, i32 0, i32 0)
  %mai5 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai4, i32 0, i32 0, i32 0)
  %mai6 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai5, i32 0, i32 0, i32 0)
  %mai7 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai6, i32 0, i32 0, i32 0)
  %exp0 = call float @llvm.exp.f32(float %in0)
  %exp1 = call float @llvm.exp.f32(float %exp0)
  %exp2 = call float @llvm.exp.f32(float %exp1)
  %exp3 = call float @llvm.exp.f32(float %exp2)
  %exp4 = call float @llvm.exp.f32(float %exp3)
  %exp5 = call float @llvm.exp.f32(float %exp4)
  %exp6 = call float @llvm.exp.f32(float %exp5)
  %exp7 = call float @llvm.exp.f32(float %exp6)
  store <4 x float> %mai7, ptr addrspace(1) %out0
  store float %exp7, ptr addrspace(1) %out1
  tail call void @llvm.amdgcn.iglp.opt(i32 3)
  ret void
}
@@ -492,7 +492,6 @@
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}

!0 = !{i64 2862105}

...

---
@@ -899,4 +898,3 @@ body: |
    S_ENDPGM 0
...