mirror of
https://github.com/intel/llvm.git
synced 2026-01-26 12:26:52 +08:00
[AMDGPU] Ressociate patterns with sub to use SALU
Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D148463
This commit is contained in:
@@ -11131,6 +11131,96 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
|
||||
return DAG.getNode(Opc, SL, VT, Add1, Op2);
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::reassociateSub(SDNode *N, SelectionDAG &DAG) const {
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT != MVT::i32 && VT != MVT::i64)
|
||||
return SDValue();
|
||||
|
||||
if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
|
||||
return SDValue();
|
||||
|
||||
unsigned Opc = N->getOpcode();
|
||||
SDValue Op0 = N->getOperand(0);
|
||||
SDValue Op1 = N->getOperand(1);
|
||||
|
||||
if (!(Op0->isDivergent() ^ Op1->isDivergent()))
|
||||
return SDValue();
|
||||
|
||||
SDLoc SL(N);
|
||||
if (Op1->isDivergent() && Op1->hasOneUse()) {
|
||||
unsigned Op1Opc = Op1.getOpcode();
|
||||
if (Op1Opc != ISD::ADD && Op1Opc != ISD::SUB)
|
||||
return SDValue();
|
||||
|
||||
SDValue Op2 = Op1.getOperand(1);
|
||||
Op1 = Op1.getOperand(0);
|
||||
if (Opc == ISD::ADD && Op1Opc == ISD::SUB) {
|
||||
// s0 + (s1 - v0) --> (s0 + s1) - v0
|
||||
if (!Op1->isDivergent() && Op2->isDivergent())
|
||||
return DAG.getNode(ISD::SUB, SL, VT,
|
||||
DAG.getNode(ISD::ADD, SL, VT, Op0, Op1), Op2);
|
||||
// s0 + (v0 - s1) --> (s0 - s1) + v0
|
||||
if (Op1->isDivergent() && !Op2->isDivergent())
|
||||
return DAG.getNode(ISD::ADD, SL, VT,
|
||||
DAG.getNode(ISD::SUB, SL, VT, Op0, Op2), Op1);
|
||||
} else if (Opc == ISD::SUB) {
|
||||
if (Op1Opc == ISD::SUB) {
|
||||
// s0 - (s1 - v0) --> (s0 - s1) + v0
|
||||
if (!Op1->isDivergent() && Op2->isDivergent())
|
||||
return DAG.getNode(ISD::ADD, SL, VT,
|
||||
DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
|
||||
// s0 - (v0 - s1) --> (s0 + s1) - v0
|
||||
if (Op1->isDivergent() && !Op2->isDivergent())
|
||||
return DAG.getNode(ISD::SUB, SL, VT,
|
||||
DAG.getNode(ISD::ADD, SL, VT, Op0, Op2), Op1);
|
||||
} else if (Op1Opc == ISD::ADD) {
|
||||
// s0 - (s1 + v0) --> (s0 - s1) - v0
|
||||
if (Op1->isDivergent() ^ Op2->isDivergent()) {
|
||||
if (Op1->isDivergent())
|
||||
std::swap(Op1, Op2);
|
||||
return DAG.getNode(ISD::SUB, SL, VT,
|
||||
DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Op0->isDivergent() && Op0->hasOneUse()) {
|
||||
unsigned Op0Opc = Op0.getOpcode();
|
||||
if (Op0Opc != ISD::ADD && Op0Opc != ISD::SUB)
|
||||
return SDValue();
|
||||
|
||||
SDValue Op2 = Op0.getOperand(1);
|
||||
Op0 = Op0.getOperand(0);
|
||||
if (!Op0->isDivergent() && Op2->isDivergent()) {
|
||||
if (Opc == ISD::SUB) {
|
||||
// (s1 + v0) - s0 --> (s1 - s0) + v0
|
||||
if (Op0Opc == ISD::ADD)
|
||||
return DAG.getNode(ISD::ADD, SL, VT,
|
||||
DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
|
||||
|
||||
// (s1 - v0) - s0 --> (s1 - s0) - v0
|
||||
if (Op0Opc == ISD::SUB)
|
||||
return DAG.getNode(ISD::SUB, SL, VT,
|
||||
DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
|
||||
} else if (Opc == ISD::ADD && Op0Opc == ISD::SUB) {
|
||||
// (s1 - v0) + s0 --> (s0 + s1) - v0
|
||||
return DAG.getNode(ISD::SUB, SL, VT,
|
||||
DAG.getNode(ISD::ADD, SL, VT, Op0, Op1), Op2);
|
||||
}
|
||||
}
|
||||
|
||||
if (Op0->isDivergent() && !Op2->isDivergent()) {
|
||||
// (v0 - s1) + s0 --> (s0 - s1) + v0
|
||||
if (Opc == ISD::ADD && Op0Opc == ISD::SUB)
|
||||
return DAG.getNode(ISD::ADD, SL, VT,
|
||||
DAG.getNode(ISD::SUB, SL, VT, Op1, Op2), Op0);
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
|
||||
EVT VT,
|
||||
SDValue N0, SDValue N1, SDValue N2,
|
||||
@@ -11288,6 +11378,9 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
|
||||
return V;
|
||||
}
|
||||
|
||||
if (SDValue V = reassociateSub(N, DAG))
|
||||
return V;
|
||||
|
||||
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
|
||||
return SDValue();
|
||||
|
||||
@@ -11330,6 +11423,9 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
if (SDValue V = reassociateSub(N, DAG))
|
||||
return V;
|
||||
|
||||
if (VT != MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
|
||||
@@ -202,6 +202,7 @@ private:
|
||||
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
|
||||
SDValue reassociateSub(SDNode *N, SelectionDAG &DAG) const;
|
||||
unsigned getFusedOpcode(const SelectionDAG &DAG,
|
||||
const SDNode *N0, const SDNode *N1) const;
|
||||
SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
@@ -662,8 +662,8 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0
|
||||
; GCN-NEXT: s_sub_i32 s0, s2, s3
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
@@ -675,10 +675,10 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
|
||||
; GFX9-NEXT: s_sub_i32 s2, s2, s3
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0
|
||||
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
@@ -711,8 +711,8 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0
|
||||
; GCN-NEXT: s_sub_i32 s0, s2, s3
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
@@ -724,10 +724,10 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
|
||||
; GFX9-NEXT: s_sub_i32 s2, s2, s3
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0
|
||||
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
|
||||
305
llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll
Normal file
305
llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll
Normal file
@@ -0,0 +1,305 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
|
||||
|
||||
; s0 + (s1 - v0) --> (s0 + s1) - v0
|
||||
define amdgpu_kernel void @reassoc_sub_add_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_add_s0s1v0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_add_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_add_s0s1v0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub = sub i32 %x, %tid
|
||||
%add = add i32 %y, %sub
|
||||
store i32 %add, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; s0 + (v0 - s1) --> (s0 - s1) + v0
|
||||
define amdgpu_kernel void @reassoc_sub_add_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_add_s0v0s1_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_add_s0v0s1_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub = sub i32 %tid, %x
|
||||
%add = add i32 %y, %sub
|
||||
store i32 %add, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; (v0 - s1) + s0 --> (s0 - s1) + v0
|
||||
define amdgpu_kernel void @reassoc_sub_add_v0s1s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_add_v0s1s0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_add_v0s1s0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub = sub i32 %tid, %x
|
||||
%add = add i32 %sub, %y
|
||||
store i32 %add, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; s0 - (s1 - v0) --> (s0 - s1) + v0
|
||||
define amdgpu_kernel void @reassoc_sub_sub_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_sub_s0s1v0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_sub_s0s1v0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub1 = sub i32 %x, %tid
|
||||
%sub2 = sub i32 %y, %sub1
|
||||
store i32 %sub2, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; s0 - (v0 - s1) --> (s0 + s1) - v0
|
||||
define amdgpu_kernel void @reassoc_sub_sub_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_sub_s0v0s1_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_add_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_sub_s0v0s1_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub1 = sub i32 %tid, %x
|
||||
%sub2 = sub i32 %y, %sub1
|
||||
store i32 %sub2, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; s0 - (s1 + v0) --> (s0 - s1) - v0
|
||||
define amdgpu_kernel void @reassoc_add_sub_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_add_sub_s0s1v0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_add_sub_s0s1v0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%add = add i32 %x, %tid
|
||||
%sub = sub i32 %y, %add
|
||||
store i32 %sub, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; s0 - (v0 + s1) --> (s0 - s1) - v0
|
||||
define amdgpu_kernel void @reassoc_add_sub_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_add_sub_s0v0s1_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_add_sub_s0v0s1_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%add = add i32 %tid, %x
|
||||
%sub = sub i32 %y, %add
|
||||
store i32 %sub, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; (s1 + v0) - s0 --> (s1 - s0) + v0
|
||||
define amdgpu_kernel void @reassoc_add_sub_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_add_sub_s1v0s0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_add_sub_s1v0s0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s3, s2
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%add = add i32 %tid, %x
|
||||
%sub = sub i32 %y, %add
|
||||
store i32 %sub, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; (s1 - v0) - s0 --> (s1 - s0) - v0
|
||||
define amdgpu_kernel void @reassoc_sub_sub_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_sub_s1v0s0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, s2, s3
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_sub_s1v0s0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_i32 s2, s2, s3
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub1 = sub i32 %x, %tid
|
||||
%sub2 = sub i32 %sub1, %y
|
||||
store i32 %sub2, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; (s1 - v0) + s0 --> (s0 + s1) - v0
|
||||
define amdgpu_kernel void @reassoc_sub_add_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
|
||||
; GFX8-LABEL: reassoc_sub_add_s1v0s0_i32:
|
||||
; GFX8: ; %bb.0: ; %bb
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: reassoc_sub_add_s1v0s0_i32:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
|
||||
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
bb:
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%sub = sub i32 %x, %tid
|
||||
%add = add i32 %sub, %y
|
||||
store i32 %add, ptr addrspace(1) %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
Reference in New Issue
Block a user