[AMDGPU] Add s_cluster_barrier on gfx1250 (#159175)
commit 4ab8dabc25 (parent 59e43fe5d8), committed by GitHub
@@ -696,6 +696,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
 //===----------------------------------------------------------------------===//
 // GFX1250+ only builtins.
 //===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_s_cluster_barrier, "v", "n", "gfx1250-insts")

 TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
 TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
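Reviewer note: per the usual Builtins.def encoding, the type string "v" declares a builtin that takes no arguments and returns void, and the "n" attribute marks it nothrow. Unlike the prefetch builtins below it is not marked "c" (const), since the barrier exists only for its side effect.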
@@ -1478,6 +1478,16 @@ void test_prefetch(generic void *fptr, global void *gptr) {
   __builtin_amdgcn_global_prefetch(gptr, 8);
 }

+// CHECK-LABEL: @test_s_cluster_barrier(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.cluster.barrier()
+// CHECK-NEXT:    ret void
+//
+void test_s_cluster_barrier()
+{
+  __builtin_amdgcn_s_cluster_barrier();
+}
+
 // CHECK-LABEL: @test_global_add_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
@@ -3691,6 +3691,10 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
 // gfx1250 intrinsics
 // ===----------------------------------------------------------------------===//

+// Vanilla cluster sync-barrier
+def int_amdgcn_s_cluster_barrier : ClangBuiltin<"__builtin_amdgcn_s_cluster_barrier">,
+  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 // Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt which is
 // modeled as InaccessibleMem.
 class AMDGPUWaitAsyncIntrinsic :
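Reviewer note: for illustration, a minimal IR sketch of a call to the new intrinsic before the lowering pass below runs; this matches the input of the new test at the bottom of this commit (@example is a hypothetical kernel name):

declare void @llvm.amdgcn.s.cluster.barrier()

define amdgpu_kernel void @example() {
  ; lowered by amdgpu-lower-intrinsics; no hardware instruction exists for it
  call void @llvm.amdgcn.s.cluster.barrier()
  ret void
}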
@@ -18,6 +18,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"

 #define DEBUG_TYPE "amdgpu-lower-intrinsics"

@@ -49,7 +50,6 @@ public:

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetPassConfig>();
-    AU.setPreservesCFG();
   }
 };

@@ -73,6 +73,7 @@ bool AMDGPULowerIntrinsicsImpl::run() {
   case Intrinsic::amdgcn_s_barrier_signal:
   case Intrinsic::amdgcn_s_barrier_signal_isfirst:
   case Intrinsic::amdgcn_s_barrier_wait:
+  case Intrinsic::amdgcn_s_cluster_barrier:
     forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
     break;
   }
@@ -81,13 +82,14 @@ bool AMDGPULowerIntrinsicsImpl::run() {
   return Changed;
 }

-// Optimize barriers and lower s_barrier to a sequence of split barrier
-// intrinsics.
+// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
+// barrier intrinsics.
 bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
   assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
          I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
          I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
-         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);

   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
   bool IsSingleWaveWG = false;
@@ -99,7 +101,59 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {

   IRBuilder<> B(&I);

-  if (IsSingleWaveWG) {
+  // Lower the s_cluster_barrier intrinsic first. There is no corresponding
+  // hardware instruction in any subtarget.
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
+    // The default cluster barrier expects one signal per workgroup. So we need
+    // a workgroup barrier first.
+    if (IsSingleWaveWG) {
+      B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+    } else {
+      Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+      Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+      Value *IsFirst = B.CreateIntrinsic(
+          B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst,
+          {BarrierID_32});
+      B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                        {BarrierID_16});
+
+      Instruction *ThenTerm =
+          SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false);
+      B.SetInsertPoint(ThenTerm);
+    }
+
+    // Now we can signal the cluster barrier from a single wave and wait for the
+    // barrier in all waves.
+    Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER);
+    Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+                      {BarrierID_32});
+
+    B.SetInsertPoint(&I);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                      {BarrierID_16});
+
+    I.eraseFromParent();
+    return true;
+  }
+
+  bool IsWorkgroupScope = false;
+
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
+      I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
+      I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
+    int BarrierID = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
+    if (BarrierID == AMDGPU::Barrier::TRAP ||
+        BarrierID == AMDGPU::Barrier::WORKGROUP ||
+        (BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST &&
+         BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST))
+      IsWorkgroupScope = true;
+  } else {
+    assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
+    IsWorkgroupScope = true;
+  }
+
+  if (IsWorkgroupScope && IsSingleWaveWG) {
     // Down-grade waits, remove split signals.
     if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
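Reviewer note: in short, the lowering emits two phases. First comes a workgroup-level rendezvous: a plain wave_barrier when the whole workgroup is a single wave, otherwise an s_barrier_signal_isfirst/s_barrier_wait pair on the WORKGROUP barrier plus a branch so that only the first wave continues. Then a single s_barrier_signal on the CLUSTER barrier is issued from that one wave, followed by an s_barrier_wait on CLUSTER in every wave. The new test file at the end of this commit shows the exact IR produced for each case.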
@@ -134,9 +188,7 @@ PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
   AMDGPULowerIntrinsicsImpl Impl(M, TM);
   if (!Impl.run())
     return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
-  return PA;
+  return PreservedAnalyses::none();
 }

 bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
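Reviewer note: returning PreservedAnalyses::none() here (together with dropping AU.setPreservesCFG() in the legacy pass above) follows from the new lowering: SplitBlockAndInsertIfThen can now split basic blocks, so the pass no longer preserves the CFG.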
@@ -360,6 +360,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
     switch (II->getIntrinsicID()) {
     case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_s_cluster_barrier:
     case Intrinsic::amdgcn_s_barrier_signal:
     case Intrinsic::amdgcn_s_barrier_signal_var:
     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
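Reviewer note: as with the other barrier intrinsics in this switch, listing the cluster barrier here keeps isReallyAClobber from treating it as a real clobbering memory def; the barrier only synchronizes, it does not write memory.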
@@ -1108,7 +1108,14 @@ enum Register_Flag : uint8_t {
 namespace AMDGPU {
 namespace Barrier {

-enum Type { TRAP = -2, WORKGROUP = -1 };
+enum Type {
+  CLUSTER_TRAP = -4,
+  CLUSTER = -3,
+  TRAP = -2,
+  WORKGROUP = -1,
+  NAMED_BARRIER_FIRST = 1,
+  NAMED_BARRIER_LAST = 16,
+};

 enum {
   BARRIER_SCOPE_WORKGROUP = 0,
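Reviewer note: these enum values explain the raw immediates in the tests: -3 in the i32/i16 barrier operands is Barrier::CLUSTER, and -1 is Barrier::WORKGROUP. CLUSTER_TRAP (-4) is added alongside, presumably mirroring TRAP (-2).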
llvm/test/CodeGen/AMDGPU/lower-intrinsics-cluster-barrier.ll (new file, 107 lines)
@@ -0,0 +1,107 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s

declare void @foo(i1)

; Verify that the explicit use of a split cluster barrier isn't optimized away.
define amdgpu_kernel void @split_barriers() "amdgpu-flat-work-group-size"="32,32" {
; CHECK-LABEL: define amdgpu_kernel void @split_barriers(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; CHECK-NEXT:    [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
; CHECK-NEXT:    call void @foo(i1 [[ISFIRST]])
; CHECK-NEXT:    ret void
;
  call void @llvm.amdgcn.s.barrier.signal(i32 -3)
  call void @llvm.amdgcn.s.barrier.wait(i16 -3)
  %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
  call void @foo(i1 %isfirst)
  ret void
}

define amdgpu_kernel void @s_cluster_barrier() {
; CHECK-LABEL: define amdgpu_kernel void @s_cluster_barrier(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; CHECK:       [[BB2]]:
; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; CHECK-NEXT:    br label %[[BB3]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; CHECK-NEXT:    ret void
;
  call void @llvm.amdgcn.s.cluster.barrier()
  ret void
}

define amdgpu_kernel void @s_cluster_barrier_wg32() "amdgpu-flat-work-group-size"="32,32" {
; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
; NOOPT-SAME: ) #[[ATTR1]] {
; NOOPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; NOOPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; NOOPT:       [[BB2]]:
; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; NOOPT-NEXT:    br label %[[BB3]]
; NOOPT:       [[BB3]]:
; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; NOOPT-NEXT:    ret void
;
; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
; OPT-WAVE32-SAME: ) #[[ATTR1]] {
; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.wave.barrier()
; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE32-NEXT:    ret void
;
; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
; OPT-WAVE64-SAME: ) #[[ATTR1]] {
; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.wave.barrier()
; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE64-NEXT:    ret void
;
  call void @llvm.amdgcn.s.cluster.barrier()
  ret void
}

define amdgpu_kernel void @s_cluster_barrier_wg64() "amdgpu-flat-work-group-size"="64,64" {
; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
; NOOPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; NOOPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; NOOPT:       [[BB2]]:
; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; NOOPT-NEXT:    br label %[[BB3]]
; NOOPT:       [[BB3]]:
; NOOPT-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; NOOPT-NEXT:    ret void
;
; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
; OPT-WAVE32-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
; OPT-WAVE32-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; OPT-WAVE32:       [[BB2]]:
; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE32-NEXT:    br label %[[BB3]]
; OPT-WAVE32:       [[BB3]]:
; OPT-WAVE32-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE32-NEXT:    ret void
;
; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.wave.barrier()
; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -3)
; OPT-WAVE64-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -3)
; OPT-WAVE64-NEXT:    ret void
;
  call void @llvm.amdgcn.s.cluster.barrier()
  ret void
}
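Reviewer note on the wg64 case: under +wavefrontsize32 a 64-item workgroup spans two waves, so the full signal_isfirst/branch sequence survives, while under +wavefrontsize64 the workgroup is a single wave and the first phase collapses to a wave_barrier. At -codegen-opt-level=0 (NOOPT) the single-wave simplification is not applied even for the 32-item workgroup.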