From a7455d7b7ce33bb4d04106e26f242e6cb24f3042 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 14 Feb 2021 09:54:25 -0500 Subject: [PATCH] AMDGPU: Remove kills following clusters of memory instruction In a future commit, soft clauses will be hinted with kill instructions rather than forced together with bundles. Look for kills that look like this, and erase them. I'm not sure if the check for specific uses is worthwhile, or if it would be better to just unconditionally erase kills. This reduces test churn in a future patch. --- llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 49 ++++++++++++++++ .../CodeGen/AMDGPU/postra-bundle-memops.mir | 58 +++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index 2d220a0b71ac..1123a89b3e6a 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -48,6 +48,9 @@ private: SmallSet Defs; + void collectUsedRegUnits(const MachineInstr &MI, + BitVector &UsedRegUnits) const; + bool isBundleCandidate(const MachineInstr &MI) const; bool isDependentLoad(const MachineInstr &MI) const; bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const; @@ -85,6 +88,21 @@ bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const { return false; } +void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, + BitVector &UsedRegUnits) const { + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.readsReg()) + continue; + + Register Reg = Op.getReg(); + assert(!Op.getSubReg() && + "subregister indexes should not be present after RA"); + + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + UsedRegUnits.set(*Units); + } +} + bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const { const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags; return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled(); @@ -105,6 +123,9 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) { return false; TRI = MF.getSubtarget().getRegisterInfo(); + BitVector BundleUsedRegUnits(TRI->getNumRegUnits()); + BitVector KillUsedRegUnits(TRI->getNumRegUnits()); + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::instr_iterator Next; @@ -147,6 +168,34 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) { Next = std::next(BundleEnd); if (ClauseLength > 1) { Changed = true; + + // Before register allocation, kills are inserted after potential soft + // clauses to hint register allocation. Look for kills that look like + // this, and erase them. + if (Next != E && Next->isKill()) { + MachineInstr &Kill = *Next; + + // TODO: Should maybe back-propagate kill flags to the bundle. + for (const MachineInstr &BundleMI : make_range(BundleStart, Next)) + collectUsedRegUnits(BundleMI, BundleUsedRegUnits); + collectUsedRegUnits(Kill, KillUsedRegUnits); + + BundleUsedRegUnits.flip(); + KillUsedRegUnits &= BundleUsedRegUnits; + + // Erase the kill if it's a subset of the used registers. + // + // TODO: Should we just remove all kills? Is there any real reason to + // keep them after RA? + if (KillUsedRegUnits.none()) { + ++Next; + Kill.eraseFromParent(); + } + + BundleUsedRegUnits.reset(); + KillUsedRegUnits.reset(); + } + finalizeBundle(MBB, BundleStart, Next); } diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index fca771680713..c135a77163ef 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -220,3 +220,61 @@ body: | $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec ... + +# Before register allocation, KILL hints are inserted after potential soft +# clauses to hint the register allocator to not clobber the input +# registers. Kills that look like this should be erased. +--- +name: post_bundle_kill +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + + ; GCN-LABEL: name: post_bundle_kill + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6 +... + +# Kill some other register not used in the bundle, should not be touched. +--- +name: post_bundle_kill_other +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: post_bundle_kill_other + ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: KILL killed $vgpr7 + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr7 +... + +# Kill some other register not used in the bundle, but also some +# from the bundle. +--- +name: post_bundle_kill_plus_other +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: post_bundle_kill_plus_other + ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: KILL killed $vgpr7, killed $vgpr3 + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr7, killed $vgpr3 +...