From e72d8b25531cb5a4fd1e802bac8c9aa6efee0aa1 Mon Sep 17 00:00:00 2001
From: Alex Maclean
Date: Thu, 22 May 2025 17:28:43 +0000
Subject: [PATCH] Revert "[NVPTX] Unify and extend barrier{.cta} intrinsic
 support (#140615)"

This reverts commit 735209c0688b10a66c24750422b35d8c2ad01bb5.
---
 clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp    |  16 --
 clang/test/CodeGen/builtins-nvptx-ptx60.cu    |   4 +-
 clang/test/CodeGen/builtins-nvptx.c           |   4 +-
 clang/test/Headers/gpuintrin.c                |   2 +-
 llvm/docs/NVPTXUsage.rst                      |  47 +----
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  37 ++---
 llvm/lib/IR/AutoUpgrade.cpp                   |  34 +---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  41 -----
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |  71 ++++----
 .../Transforms/IPO/AttributorAttributes.cpp   |   3 +-
 .../GlobalsModRef/functions_without_nosync.ll |  19 ++-
 .../Assembler/auto_upgrade_nvvm_intrinsics.ll |  22 ---
 llvm/test/CodeGen/NVPTX/barrier.ll            | 151 +++--------------
 llvm/test/CodeGen/NVPTX/named-barriers.ll     |  42 +++++
 .../CodeGen/NVPTX/noduplicate-syncthreads.ll  |   6 +-
 llvm/test/Feature/intrinsic-noduplicate.ll    |   6 +-
 .../Transforms/FunctionAttrs/convergent.ll    |   6 +-
 .../JumpThreading/thread-two-bbs-cuda.ll      |   8 +-
 .../test/Transforms/OpenMP/barrier_removal.ll |  28 ++--
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   |  24 ++-
 mlir/test/Target/LLVMIR/Import/nvvmir.ll      |   3 +-
 mlir/test/Target/LLVMIR/nvvmir.mlir           |  10 +-
 22 files changed, 196 insertions(+), 388 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/named-barriers.ll

diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
index 21c01a08549d..002af4f931c0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -1160,22 +1160,6 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_fence_sc_cluster:
     return Builder.CreateCall(
         CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
-  case NVPTX::BI__nvvm_bar_sync:
-    return Builder.CreateCall(
-        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
-        EmitScalarExpr(E->getArg(0)));
-  case NVPTX::BI__syncthreads:
-    return Builder.CreateCall(
-        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
-        Builder.getInt32(0));
-  case NVPTX::BI__nvvm_barrier_sync:
-    return Builder.CreateCall(
-        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
-        EmitScalarExpr(E->getArg(0)));
-  case NVPTX::BI__nvvm_barrier_sync_cnt:
-    return Builder.CreateCall(
-        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync),
-        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
   default:
     return nullptr;
   }
diff --git a/clang/test/CodeGen/builtins-nvptx-ptx60.cu b/clang/test/CodeGen/builtins-nvptx-ptx60.cu
index 0c40ecaa9561..599d09a20e04 100644
--- a/clang/test/CodeGen/builtins-nvptx-ptx60.cu
+++ b/clang/test/CodeGen/builtins-nvptx-ptx60.cu
@@ -32,10 +32,10 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
   // CHECK: call void @llvm.nvvm.bar.warp.sync(i32
   // expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}}
   __nvvm_bar_warp_sync(mask);
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32
+  // CHECK: call void @llvm.nvvm.barrier.sync(i32
   // expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}}
   __nvvm_barrier_sync(mask);
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync(i32
+  // CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32
   // expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}}
   __nvvm_barrier_sync_cnt(mask, i);
 
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index cef529163bb3..7904762709df 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -198,7 +198,7 @@ __device__ int read_pms() {
 
 
 __device__ void sync() {
-// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
 __nvvm_bar_sync(0);
 
 
@@ -259,7 +259,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
   __nvvm_membar_gl();
 // CHECK: call void @llvm.nvvm.membar.sys()
   __nvvm_membar_sys();
-// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK: call void @llvm.nvvm.barrier0()
   __syncthreads();
 }
 
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index b254423ec4a1..f7dfb86ac465 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -887,7 +887,7 @@ __gpu_kernel void foo() {
 // NVPTX-LABEL: define internal void @__gpu_sync_threads(
 // NVPTX-SAME: ) #[[ATTR0]] {
 // NVPTX-NEXT:  [[ENTRY:.*:]]
-// NVPTX-NEXT:    call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// NVPTX-NEXT:    call void @llvm.nvvm.barrier0()
 // NVPTX-NEXT:    ret void
 //
 //
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 07750579f5a5..e0a5c0211477 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -199,58 +199,21 @@ map in the following way to CUDA builtins:
 Barriers
 --------
 
-'``llvm.nvvm.barrier.cta.*``'
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.nvvm.barrier0``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %n)
-  declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
-  declare void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %n)
-
-  declare void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %n)
-  declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
-  declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %n)
+  declare void @llvm.nvvm.barrier0()
 
 Overview:
 """""""""
 
-The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics perform barrier
-synchronization and communication within a CTA. They can be used by the threads
-within the CTA for synchronization and communication.
-
-Semantics:
-""""""""""
-
-Operand %id specifies a logical barrier resource and must fall within the range
-0 through 15. When present, operand %n specifies the number of threads
-participating in the barrier. When specifying a thread count, the value must be
-a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
-variants, the '``.all``' suffix indicates that all threads in the CTA should
-participate in the barrier and the %n operand is not present.
-
-All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
-thread to wait for all non-exited threads from its warp and then marks the
-warp's arrival at the barrier. In addition to signaling its arrival at the
-barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
-thread to wait for non-exited threads of all other warps participating in the
-barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
-intrinsic does not cause the executing thread to wait for threads of other
-participating warps.
-
-When a barrier completes, the waiting threads are restarted without delay,
-and the barrier is reinitialized so that it can be immediately reused.
-
-The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
-modifier to indicate textual alignment of the barrier. When specified, it
-indicates that all threads in the CTA will execute the same
-'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
-aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
-known that all threads in the CTA evaluate the condition identically, otherwise
-behavior is undefined.
+The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
+instruction, equivalent to the ``__syncthreads()`` call in CUDA.
 
 Electing a thread
 -----------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index a0a00677bc5b..3aa9b0303c63 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -128,12 +128,6 @@
 // * llvm.nvvm.swap.lo.hi.b64      --> llvm.fshl(x, x, 32)
 // * llvm.nvvm.atomic.load.inc.32  --> atomicrmw uinc_wrap
 // * llvm.nvvm.atomic.load.dec.32  --> atomicrmw udec_wrap
-// * llvm.nvvm.barrier0            --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
-// * llvm.nvvm.barrier.n           --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
-// * llvm.nvvm.bar.sync            --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
-// * llvm.nvvm.barrier             --> llvm.nvvm.barrier.cta.sync.aligned(x, y)
-// * llvm.nvvm.barrier.sync        --> llvm.nvvm.barrier.cta.sync.all(x)
-// * llvm.nvvm.barrier.sync.cnt    --> llvm.nvvm.barrier.cta.sync(x, y)
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
@@ -1269,6 +1263,18 @@ let TargetPrefix = "nvvm" in {
   defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3;
 
 // Bar.Sync
+
+  // The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
+  // intrinsics in this file, this one is a user-facing API.
+  def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">,
+      Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+  // Synchronize all threads in the CTA at barrier 'n'.
+  def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">,
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
+  // Synchronize 'm', a multiple of warp size, (arg 2) threads in
+  // the CTA at barrier 'n' (arg 1).
+  def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
   def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
   def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
@@ -1276,21 +1282,16 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
 
+  def int_nvvm_bar_sync : NVVMBuiltin,
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
   def int_nvvm_bar_warp_sync : NVVMBuiltin,
       Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
 
-  // barrier{.cta}.sync{.aligned} a{, b};
-  // barrier{.cta}.arrive{.aligned} a, b;
-  let IntrProperties = [IntrConvergent, IntrNoCallback] in {
-    foreach align = ["", "_aligned"] in {
-      def int_nvvm_barrier_cta_sync # align # _all :
-          Intrinsic<[], [llvm_i32_ty]>;
-      def int_nvvm_barrier_cta_sync # align :
-          Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
-      def int_nvvm_barrier_cta_arrive # align :
-          Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
-    }
-  }
+  // barrier.sync id[, cnt]
+  def int_nvvm_barrier_sync : NVVMBuiltin,
+      Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
+  def int_nvvm_barrier_sync_cnt : NVVMBuiltin,
+      Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
 
   // barrier.cluster.[wait, arrive, arrive.relaxed]
   def int_nvvm_barrier_cluster_arrive :
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 94ac22f04742..7157baf394e3 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1343,9 +1343,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       // nvvm.abs.{i,ii}
       Expand =
           Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
-    else if (Name.consume_front("fabs."))
+    else if (Name == "fabs.f" || Name == "fabs.ftz.f" || Name == "fabs.d")
       // nvvm.fabs.{f,ftz.f,d}
-      Expand = Name == "f" || Name == "ftz.f" || Name == "d";
+      Expand = true;
+    else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
+             Name == "swap.lo.hi.b64")
+      Expand = true;
     else if (Name.consume_front("max.") || Name.consume_front("min."))
       // nvvm.{min,max}.{i,ii,ui,ull}
       Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -1377,18 +1380,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
                 Name.starts_with("p."));
     else
-      Expand = StringSwitch<bool>(Name)
-                   .Case("barrier0", true)
-                   .Case("barrier.n", true)
-                   .Case("barrier.sync.cnt", true)
-                   .Case("barrier.sync", true)
-                   .Case("barrier", true)
-                   .Case("bar.sync", true)
-                   .Case("clz.ll", true)
-                   .Case("popc.ll", true)
-                   .Case("h2f", true)
-                   .Case("swap.lo.hi.b64", true)
-                   .Default(false);
+      Expand = false;
 
     if (Expand) {
       NewFn = nullptr;
@@ -2486,20 +2478,6 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
     MDNode *MD = MDNode::get(Builder.getContext(), {});
     LD->setMetadata(LLVMContext::MD_invariant_load, MD);
     return LD;
-  } else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
-    Value *Arg =
-        Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
-    Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all,
-                                  {}, {Arg});
-  } else if (Name == "barrier") {
-    Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned, {},
-                                  {CI->getArgOperand(0), CI->getArgOperand(1)});
-  } else if (Name == "barrier.sync") {
-    Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all, {},
-                                  {CI->getArgOperand(0)});
-  } else if (Name == "barrier.sync.cnt") {
-    Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync, {},
-                                  {CI->getArgOperand(0), CI->getArgOperand(1)});
   } else {
     Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
     if (IID != Intrinsic::not_intrinsic &&
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5234fb080618..444d35b3115b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -237,47 +237,6 @@ def BF16RT : RegTyInfo;
 def F16X2RT : RegTyInfo;
 def BF16X2RT : RegTyInfo;
 
-// This class provides a basic wrapper around an NVPTXInst that abstracts the
-// specific syntax of most PTX instructions. It automatically handles the
-// construction of the asm string based on the provided dag arguments.
-// For example, the following asm-strings would be computed:
-//
-// * BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
-//                       (ins Int32Regs:$a, Int32Regs:$b), (ins),
-//                       "add.s32">;
-//   ---> "add.s32 \t$dst, $a, $b;"
-//
-// * BasicFlagsNVPTXInst<(outs Int32Regs:$d),
-//                       (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
-//                       (ins PrmtMode:$mode),
-//                       "prmt.b32${mode}">;
-//   ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
-//
-class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag,
-                          string asmstr, list<dag> pattern = []>
-  : NVPTXInst<
-      outs_dag,
-      !con(ins_dag, flags_dag),
-      !strconcat(
-        asmstr,
-        !if(!and(!empty(ins_dag), !empty(outs_dag)), "",
-            !strconcat(
-              " \t",
-              !interleave(
-                !foreach(i, !range(!size(outs_dag)),
-                  "$" # !getdagname(outs_dag, i)),
-                "|"),
-              !if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
-              !interleave(
-                !foreach(i, !range(!size(ins_dag)),
-                  "$" # !getdagname(ins_dag, i)),
-                ", "))),
-        ";"),
-      pattern>;
-
-class BasicNVPTXInst<dag outs_dag, dag ins_dag, string asmstr, list<dag> pattern = []>
-  : BasicFlagsNVPTXInst<outs_dag, ins_dag, (ins), asmstr, pattern>;
-
 multiclass I3Inst requires = []> {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f7b8aca0f77d..193418ca391e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -67,6 +67,15 @@ class THREADMASK_INFO<bit sync> {
 // Synchronization and shuffle functions
 //-----------------------------------
 let isConvergent = true in {
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+                  "bar.sync \t0;",
+                  [(int_nvvm_barrier0)]>;
+def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
+                  "bar.sync \t$src1;",
+                  [(int_nvvm_barrier_n i32:$src1)]>;
+def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
+                  "bar.sync \t$src1, $src2;",
+                  [(int_nvvm_barrier i32:$src1, i32:$src2)]>;
 def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
                   !strconcat("{{ \n\t",
                              ".reg .pred \t%p1; \n\t",
@@ -93,6 +102,9 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
                              "}}"),
       [(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
 
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
+                   [(int_nvvm_bar_sync imm:$i)]>;
+
 def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
                    [(int_nvvm_bar_warp_sync imm:$i)]>,
              Requires<[hasPTX<60>, hasSM<30>]>;
@@ -100,44 +112,29 @@ def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \
                    [(int_nvvm_bar_warp_sync i32:$i)]>,
              Requires<[hasPTX<60>, hasSM<30>]>;
 
-multiclass BARRIER1<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
-  def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
-                          [(intrinsic imm:$i)]>,
-           Requires<requires>;
+def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;",
+                         [(int_nvvm_barrier_sync imm:$i)]>,
+                         Requires<[hasPTX<60>, hasSM<30>]>;
+def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;",
+                         [(int_nvvm_barrier_sync i32:$i)]>,
+                         Requires<[hasPTX<60>, hasSM<30>]>;
 
-  def _r : BasicNVPTXInst<(outs), (ins Int32Regs:$i), asmstr,
-                          [(intrinsic i32:$i)]>,
-           Requires<requires>;
-}
-
-multiclass BARRIER2<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
-  def _rr : BasicNVPTXInst<(outs), (ins Int32Regs:$i, Int32Regs:$j), asmstr,
-                           [(intrinsic i32:$i, i32:$j)]>,
-            Requires<requires>;
-
-  def _ri : BasicNVPTXInst<(outs), (ins Int32Regs:$i, i32imm:$j), asmstr,
-                           [(intrinsic i32:$i, imm:$j)]>,
-            Requires<requires>;
-
-  def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, Int32Regs:$j), asmstr,
-                           [(intrinsic imm:$i, i32:$j)]>,
-            Requires<requires>;
-
-  def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
-                           [(intrinsic imm:$i, imm:$j)]>,
-            Requires<requires>;
-}
-
-// Note the "bar.sync" variants could be renamed to the equivalent corresponding
-// "barrier.*.aligned" variants. We use the older syntax for compatibility with
-// older versions of the PTX ISA.
-defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER1<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
-defm BARRIER_CTA_SYNC_ALIGNED : BARRIER2<"bar.sync", int_nvvm_barrier_cta_sync_aligned>;
-defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER2<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned>;
-
-defm BARRIER_CTA_SYNC_ALL : BARRIER1<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
-defm BARRIER_CTA_SYNC : BARRIER2<"barrier.sync", int_nvvm_barrier_cta_sync, [hasPTX<60>]>;
-defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive, [hasPTX<60>]>;
+def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt),
+                  "barrier.sync \t$id, $cnt;",
+                  [(int_nvvm_barrier_sync_cnt i32:$id, i32:$cnt)]>,
+        Requires<[hasPTX<60>, hasSM<30>]>;
+def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt),
+                  "barrier.sync \t$id, $cnt;",
+                  [(int_nvvm_barrier_sync_cnt i32:$id, imm:$cnt)]>,
+        Requires<[hasPTX<60>, hasSM<30>]>;
+def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt),
+                  "barrier.sync \t$id, $cnt;",
+                  [(int_nvvm_barrier_sync_cnt imm:$id, i32:$cnt)]>,
+        Requires<[hasPTX<60>, hasSM<30>]>;
+def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
+                  "barrier.sync \t$id, $cnt;",
+                  [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
+        Requires<[hasPTX<60>, hasSM<30>]>;
 
 class INT_BARRIER_CLUSTER Preds = [hasPTX<78>, hasSM<90>]>:
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 79d9b3da054b..8b843634600b 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -2150,8 +2150,7 @@ struct AANoUnwindCallSite final
 
 bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
   switch (CB.getIntrinsicID()) {
-  case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
-  case Intrinsic::nvvm_barrier_cta_sync_aligned:
+  case Intrinsic::nvvm_barrier0:
   case Intrinsic::nvvm_barrier0_and:
   case Intrinsic::nvvm_barrier0_or:
   case Intrinsic::nvvm_barrier0_popc:
diff --git a/llvm/test/Analysis/GlobalsModRef/functions_without_nosync.ll b/llvm/test/Analysis/GlobalsModRef/functions_without_nosync.ll
index 7019694439bb..e92a45807ed9 100644
--- a/llvm/test/Analysis/GlobalsModRef/functions_without_nosync.ll
+++ b/llvm/test/Analysis/GlobalsModRef/functions_without_nosync.ll
@@ -11,15 +11,28 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; CHECK-LABEL: @bar_sync
 ; CHECK: store
-; CHECK: tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+; CHECK: tail call void @llvm.nvvm.bar.sync(i32 0)
 ; CHECK: load
 define dso_local i32 @bar_sync(i32 %0) local_unnamed_addr {
   store i32 %0, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
-  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  tail call void @llvm.nvvm.bar.sync(i32 0)
   %2 = load i32, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
   ret i32 %2
 }
 
-declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #0
+declare void @llvm.nvvm.bar.sync(i32) #0
+
+; CHECK-LABEL: @barrier0
+; CHECK: store
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: load
+define dso_local i32 @barrier0(i32 %0) local_unnamed_addr {
+  store i32 %0, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
+  tail call void @llvm.nvvm.barrier0()
+  %2 = load i32, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
+  ret i32 %2
+}
+
+declare void @llvm.nvvm.barrier0() #0
 
 attributes #0 = { convergent nounwind }
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index e362ad88a8c0..2bfa1c2dfba7 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -78,13 +78,6 @@ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d,
 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
 
-declare void @llvm.nvvm.barrier0()
-declare void @llvm.nvvm.barrier.n(i32)
-declare void @llvm.nvvm.bar.sync(i32)
-declare void @llvm.nvvm.barrier(i32, i32)
-declare void @llvm.nvvm.barrier.sync(i32)
-declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
-
 ; CHECK-LABEL: @simple_upgrade
 define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
   ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -331,18 +324,3 @@ define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspa
   ret void
 }
 
-define void @cta_barriers(i32 %x, i32 %y) {
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %x)
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %x)
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned(i32 %x, i32 %y)
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32 %x)
-; CHECK: call void @llvm.nvvm.barrier.cta.sync(i32 %x, i32 %y)
-  call void @llvm.nvvm.barrier0()
-  call void @llvm.nvvm.barrier.n(i32 %x)
-  call void @llvm.nvvm.bar.sync(i32 %x)
-  call void @llvm.nvvm.barrier(i32 %x, i32 %y)
-  call void @llvm.nvvm.barrier.sync(i32 %x)
-  call void @llvm.nvvm.barrier.sync.cnt(i32 %x, i32 %y)
-  ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll
index 75db99b7f49d..05bdc9087f57 100644
--- a/llvm/test/CodeGen/NVPTX/barrier.ll
+++ b/llvm/test/CodeGen/NVPTX/barrier.ll
@@ -1,136 +1,33 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %}
 
 declare void @llvm.nvvm.bar.warp.sync(i32)
-declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32)
-declare void @llvm.nvvm.barrier.cta.sync.aligned(i32, i32)
-declare void @llvm.nvvm.barrier.cta.sync.all(i32)
-declare void @llvm.nvvm.barrier.cta.sync(i32, i32)
-declare void @llvm.nvvm.barrier.cta.arrive(i32, i32)
-declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32, i32)
+declare void @llvm.nvvm.barrier.sync(i32)
+declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
 
-define void @barrier_warp_sync(i32 %id) {
-; CHECK-LABEL: barrier_warp_sync(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_warp_sync_param_0];
-; CHECK-NEXT:    bar.warp.sync %r1;
-; CHECK-NEXT:    bar.warp.sync 6;
-; CHECK-NEXT:    ret;
+; CHECK-LABEL: .func{{.*}}barrier_sync
+define void @barrier_sync(i32 %id, i32 %cnt) {
+  ; CHECK: ld.param.b32 [[ID:%r[0-9]+]], [barrier_sync_param_0];
+  ; CHECK: ld.param.b32 [[CNT:%r[0-9]+]], [barrier_sync_param_1];
+
+  ; CHECK: barrier.sync [[ID]], [[CNT]];
+  call void @llvm.nvvm.barrier.sync.cnt(i32 %id, i32 %cnt)
+  ; CHECK: barrier.sync [[ID]], 32;
+  call void @llvm.nvvm.barrier.sync.cnt(i32 %id, i32 32)
+  ; CHECK: barrier.sync 3, [[CNT]];
+  call void @llvm.nvvm.barrier.sync.cnt(i32 3, i32 %cnt)
+  ; CHECK: barrier.sync 4, 64;
+  call void @llvm.nvvm.barrier.sync.cnt(i32 4, i32 64)
+
+  ; CHECK: barrier.sync [[ID]];
+  call void @llvm.nvvm.barrier.sync(i32 %id)
+  ; CHECK: barrier.sync 1;
+  call void @llvm.nvvm.barrier.sync(i32 1)
+
+  ; CHECK: bar.warp.sync [[ID]];
   call void @llvm.nvvm.bar.warp.sync(i32 %id)
+  ; CHECK: bar.warp.sync 6;
   call void @llvm.nvvm.bar.warp.sync(i32 6)
-  ret void
+  ret void;
 }
 
-define void @barrier_cta_sync_aligned_all(i32 %id) {
-; CHECK-LABEL: barrier_cta_sync_aligned_all(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_sync_aligned_all_param_0];
-; CHECK-NEXT:    bar.sync %r1;
-; CHECK-NEXT:    bar.sync 3;
-; CHECK-NEXT:    ret;
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 3)
-  ret void
-}
-
-define void @barrier_cta_sync_aligned(i32 %id, i32 %cnt) {
-; CHECK-LABEL: barrier_cta_sync_aligned(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_sync_aligned_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_sync_aligned_param_1];
-; CHECK-NEXT:    bar.sync %r1, %r2;
-; CHECK-NEXT:    bar.sync 3, %r2;
-; CHECK-NEXT:    bar.sync %r1, 64;
-; CHECK-NEXT:    bar.sync 4, 64;
-; CHECK-NEXT:    ret;
-  call void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.sync.aligned(i32 3, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 64)
-  call void @llvm.nvvm.barrier.cta.sync.aligned(i32 4, i32 64)
-  ret void
-}
-
-define void @barrier_cta_arrive_aligned(i32 %id, i32 %cnt) {
-; CHECK-LABEL: barrier_cta_arrive_aligned(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_arrive_aligned_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_arrive_aligned_param_1];
-; CHECK-NEXT:    bar.arrive %r1, %r2;
-; CHECK-NEXT:    bar.arrive 3, %r2;
-; CHECK-NEXT:    bar.arrive %r1, 64;
-; CHECK-NEXT:    bar.arrive 4, 64;
-; CHECK-NEXT:    ret;
-  call void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.arrive.aligned(i32 3, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 64)
-  call void @llvm.nvvm.barrier.cta.arrive.aligned(i32 4, i32 64)
-  ret void
-}
-
-define void @barrier_cta_sync_all(i32 %id) {
-; CHECK-LABEL: barrier_cta_sync_all(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_sync_all_param_0];
-; CHECK-NEXT:    barrier.sync %r1;
-; CHECK-NEXT:    barrier.sync 3;
-; CHECK-NEXT:    ret;
-  call void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
-  call void @llvm.nvvm.barrier.cta.sync.all(i32 3)
-  ret void
-}
-
-define void @barrier_cta_sync(i32 %id, i32 %cnt) {
-; CHECK-LABEL: barrier_cta_sync(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_sync_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_sync_param_1];
-; CHECK-NEXT:    barrier.sync %r1, %r2;
-; CHECK-NEXT:    barrier.sync 3, %r2;
-; CHECK-NEXT:    barrier.sync %r1, 64;
-; CHECK-NEXT:    barrier.sync 4, 64;
-; CHECK-NEXT:    ret;
-  call void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.sync(i32 3, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 64)
-  call void @llvm.nvvm.barrier.cta.sync(i32 4, i32 64)
-  ret void
-}
-
-define void @barrier_cta_arrive(i32 %id, i32 %cnt) {
-; CHECK-LABEL: barrier_cta_arrive(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_arrive_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_arrive_param_1];
-; CHECK-NEXT:    barrier.arrive %r1, %r2;
-; CHECK-NEXT:    barrier.arrive 3, %r2;
-; CHECK-NEXT:    barrier.arrive %r1, 64;
-; CHECK-NEXT:    barrier.arrive 4, 64;
-; CHECK-NEXT:    ret;
-  call void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.arrive(i32 3, i32 %cnt)
-  call void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 64)
-  call void @llvm.nvvm.barrier.cta.arrive(i32 4, i32 64)
-  ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/named-barriers.ll b/llvm/test/CodeGen/NVPTX/named-barriers.ll
new file mode 100644
index 000000000000..34e93cef6aaa
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/named-barriers.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+; Use bar.sync to arrive at a pre-computed barrier number and
+; wait for all threads in CTA to also arrive:
+define ptx_device void @test_barrier_named_cta() {
+; CHECK: mov.b32 %r[[REG0:[0-9]+]], 0;
+; CHECK: bar.sync %r[[REG0]];
+; CHECK: mov.b32 %r[[REG1:[0-9]+]], 10;
+; CHECK: bar.sync %r[[REG1]];
+; CHECK: mov.b32 %r[[REG2:[0-9]+]], 15;
+; CHECK: bar.sync %r[[REG2]];
+; CHECK: ret;
+  call void @llvm.nvvm.barrier.n(i32 0)
+  call void @llvm.nvvm.barrier.n(i32 10)
+  call void @llvm.nvvm.barrier.n(i32 15)
+  ret void
+}
+
+; Use bar.sync to arrive at a pre-computed barrier number and
+; wait for fixed number of cooperating threads to arrive:
+define ptx_device void @test_barrier_named() {
+; CHECK: mov.b32 %r[[REG0A:[0-9]+]], 32;
+; CHECK: mov.b32 %r[[REG0B:[0-9]+]], 0;
+; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]];
+; CHECK: mov.b32 %r[[REG1A:[0-9]+]], 352;
+; CHECK: mov.b32 %r[[REG1B:[0-9]+]], 10;
+; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]];
+; CHECK: mov.b32 %r[[REG2A:[0-9]+]], 992;
+; CHECK: mov.b32 %r[[REG2B:[0-9]+]], 15;
+; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]];
+; CHECK: ret;
+  call void @llvm.nvvm.barrier(i32 0, i32 32)
+  call void @llvm.nvvm.barrier(i32 10, i32 352)
+  call void @llvm.nvvm.barrier(i32 15, i32 992)
+  ret void
+}
+
+declare void @llvm.nvvm.barrier(i32, i32)
+declare void @llvm.nvvm.barrier.n(i32)
diff --git a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 02abae0c8f9c..2a0c5ab7299b 100644
--- a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -3,8 +3,8 @@
 ; Make sure the call to syncthreads is not duplicate here by the LLVM
 ; optimizations, because it has the noduplicate attribute set.
 
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all
-; CHECK-NOT: call void @llvm.nvvm.barrier.cta.sync.aligned.all
+; CHECK: call void @llvm.nvvm.barrier0
+; CHECK-NOT: call void @llvm.nvvm.barrier0
 
 ; Function Attrs: nounwind
 define void @foo(ptr %output) #1 {
@@ -36,7 +36,7 @@ if.else:                                          ; preds = %entry
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   %6 = load ptr, ptr %output.addr, align 8
   %7 = load float, ptr %6, align 4
   %conv7 = fpext float %7 to double
diff --git a/llvm/test/Feature/intrinsic-noduplicate.ll b/llvm/test/Feature/intrinsic-noduplicate.ll
index 42264ef909e8..ecdb381b7920 100644
--- a/llvm/test/Feature/intrinsic-noduplicate.ll
+++ b/llvm/test/Feature/intrinsic-noduplicate.ll
@@ -2,9 +2,9 @@
 ; REQUIRES: nvptx-registered-target
 
 ; Make sure LLVM knows about the convergent attribute on the
-; llvm.nvvm.barrier.cta.sync.aligned.all intrinsic.
+; llvm.nvvm.barrier0 intrinsic.
 
-declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32)
+declare void @llvm.nvvm.barrier0()
 
-; CHECK: declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #[[ATTRNUM:[0-9]+]]
+; CHECK: declare void @llvm.nvvm.barrier0() #[[ATTRNUM:[0-9]+]]
 ; CHECK: attributes #[[ATTRNUM]] = { convergent nocallback nounwind }
diff --git a/llvm/test/Transforms/FunctionAttrs/convergent.ll b/llvm/test/Transforms/FunctionAttrs/convergent.ll
index e2581b2b418f..49c357bd6bc8 100644
--- a/llvm/test/Transforms/FunctionAttrs/convergent.ll
+++ b/llvm/test/Transforms/FunctionAttrs/convergent.ll
@@ -70,17 +70,17 @@ define i32 @indirect_non_convergent_call(ptr %f) convergent norecurse {
   ret i32 %a
 }
 
-declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) convergent
+declare void @llvm.nvvm.barrier0() convergent
 
 define i32 @intrinsic() convergent {
   ; Implicitly convergent, because the intrinsic is convergent.
 ; CHECK: Function Attrs: convergent norecurse nounwind
 ; CHECK-LABEL: define {{[^@]+}}@intrinsic
 ; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
-; CHECK-NEXT:    call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+; CHECK-NEXT:    call void @llvm.nvvm.barrier0()
 ; CHECK-NEXT:    ret i32 0
 ;
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   ret i32 0
 }
 
diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs-cuda.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs-cuda.ll
index 1671baaaa087..8a9e6f728936 100644
--- a/llvm/test/Transforms/JumpThreading/thread-two-bbs-cuda.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs-cuda.ll
@@ -12,7 +12,7 @@ define i32 @wrapped_tid() #0 comdat align 32 {
   ret i32 %1
 }
 
-declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #1
+declare void @llvm.nvvm.barrier0() #1
 
 ; We had a bug where we duplicated basic blocks containing convergent
 ; functions like @llvm.nvvm.barrier0 below. Verify that we don't do
@@ -32,9 +32,9 @@ define void @foo() local_unnamed_addr #2 comdat align 32 {
   br label %6
 
 6:
-; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-; CHECK-NOT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+; CHECK: call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
+  call void @llvm.nvvm.barrier0()
   %7 = icmp eq i32 %2, 0
   br i1 %7, label %11, label %8
 
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index f662d5dd85b2..5b7544b1a796 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -8,7 +8,7 @@ target triple = "amdgcn-amd-amdhsa"
 declare void @useI32(i32)
 declare void @unknown()
 declare void @aligned_barrier() "llvm.assume"="ompx_aligned_barrier"
-declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32)
+declare void @llvm.nvvm.barrier0()
 declare i32 @llvm.nvvm.barrier0.and(i32)
 declare i32 @llvm.nvvm.barrier0.or(i32)
 declare i32 @llvm.nvvm.barrier0.popc(i32)
@@ -58,7 +58,7 @@ define amdgpu_kernel void @pos_empty_3() "kernel" {
 ; CHECK-SAME: () #[[ATTR4]] {
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   ret void
 }
 define amdgpu_kernel void @pos_empty_4() "kernel" {
@@ -393,12 +393,12 @@ define amdgpu_kernel void @pos_multiple() "kernel" {
 ; CHECK-SAME: () #[[ATTR4]] {
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   call void @aligned_barrier()
   call void @aligned_barrier()
   call void @llvm.amdgcn.s.barrier()
   call void @aligned_barrier()
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   call void @aligned_barrier()
   call void @aligned_barrier()
   ret void
@@ -422,7 +422,7 @@ define amdgpu_kernel void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" {
 ; CHECK-NEXT:    ret void
 ;
   fence acquire
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   fence release
   call void @aligned_barrier()
   fence seq_cst
@@ -441,7 +441,7 @@ f0:
   fence release
   call void @aligned_barrier()
   fence acquire
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   fence acquire
   br i1 %c1, label %t1, label %f1
 t1:
@@ -473,7 +473,7 @@ define amdgpu_kernel void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "ker
 ; CHECK-NEXT:    br label [[M:%.*]]
 ; CHECK:       f0:
 ; CHECK-NEXT:    store i32 4, ptr [[P]], align 4
-; CHECK-NEXT:    call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+; CHECK-NEXT:    call void @llvm.nvvm.barrier0()
 ; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
 ; CHECK:       t1:
 ; CHECK-NEXT:    br label [[M]]
@@ -483,7 +483,7 @@ define amdgpu_kernel void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "ker
 ; CHECK-NEXT:    store i32 4, ptr [[P]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   store i32 4, ptr %p
   call void @aligned_barrier()
   br i1 %c0, label %t0, label %f0
@@ -496,7 +496,7 @@ t0b:
 f0:
   call void @aligned_barrier()
   store i32 4, ptr %p
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   br i1 %c1, label %t1, label %f1
 t1:
   call void @aligned_barrier()
@@ -527,7 +527,7 @@ define void @multiple_blocks_non_kernel_1(i1 %c0, i1 %c1) "kernel" {
 ; CHECK:       m:
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   call void @aligned_barrier()
   br i1 %c0, label %t0, label %f0
 t0:
@@ -538,7 +538,7 @@ t0b:
   br label %m
 f0:
   call void @aligned_barrier()
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   br i1 %c1, label %t1, label %f1
 t1:
   call void @aligned_barrier()
@@ -577,7 +577,7 @@ t0b:
   br label %m
 f0:
   call void @aligned_barrier()
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   br i1 %c1, label %t1, label %f1
 t1:
   call void @aligned_barrier()
@@ -614,7 +614,7 @@ t0b:
   br label %m
 f0:
   call void @aligned_barrier()
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   br i1 %c1, label %t1, label %f1
 t1:
   call void @aligned_barrier()
@@ -665,7 +665,7 @@ t0b:
   br label %m
 f0:
   call void @aligned_barrier()
-  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  call void @llvm.nvvm.barrier0()
   store i32 2, ptr %p
   br i1 %c1, label %t1, label %f1
 t1:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 0c5c87cfe002..e4a44f698b62 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -535,13 +535,8 @@ def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">,
 // NVVM synchronization op definitions
 //===----------------------------------------------------------------------===//
 
-def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
+def NVVM_Barrier0Op : NVVM_IntrOp<"barrier0"> {
   let assemblyFormat = "attr-dict";
-  string llvmBuilder = [{
-    createIntrinsicCall(
-        builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all,
-        {builder.getInt32(0)});
-  }];
 }
 
 def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
@@ -549,14 +544,15 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
                        Optional<I32>:$barrierId,
                        Optional<I32>:$numberOfThreads);
   string llvmBuilder = [{
-    llvm::Value *id = $barrierId ? $barrierId : builder.getInt32(0);
-    if ($numberOfThreads)
-      createIntrinsicCall(
-          builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned,
-          {id, $numberOfThreads});
-    else
-      createIntrinsicCall(
-          builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all, {id});
+    if ($numberOfThreads && $barrierId) {
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier,
+                    {$barrierId, $numberOfThreads});
+    } else if($barrierId) {
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_n,
+                    {$barrierId});
+    } else {
+      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0);
+    }
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
index 2da0b0ceb2cf..c8b7b82f47fd 100644
--- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll
+++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll
@@ -73,11 +73,12 @@ define float @nvvm_rcp(float %0) {
 
 ; CHECK-LABEL: @llvm_nvvm_barrier0()
 define void @llvm_nvvm_barrier0() {
-  ; CHECK: llvm.nvvm.barrier.cta.sync.aligned.all
+  ; CHECK: nvvm.barrier0
   call void @llvm.nvvm.barrier0()
   ret void
 }
 
+
 ; TODO: Support the intrinsics below once they derive from NVVM_IntrOp rather than from NVVM_Op.
 ;
 ; define i32 @nvvm_shfl(i32 %0, i32 %1, i32 %2, i32 %3, float %4) {
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 90519a940262..894b72733a46 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -162,7 +162,7 @@ llvm.func @nvvm_rcp(%0: f32) -> f32 {
 
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  // CHECK: call void @llvm.nvvm.barrier0()
   nvvm.barrier0
   llvm.return
 }
@@ -170,11 +170,11 @@ llvm.func @llvm_nvvm_barrier0() {
 // CHECK-LABEL: @llvm_nvvm_barrier(
 // CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]])
 llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32) {
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-  nvvm.barrier
+  // CHECK: call void @llvm.nvvm.barrier0()
+  nvvm.barrier
   // CHECK: call void @llvm.nvvm.barrier.n(i32 %[[barId]])
   nvvm.barrier id = %barID
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned(i32 %[[barId]], i32 %[[numThreads]])
+  // CHECK: call void @llvm.nvvm.barrier(i32 %[[barId]], i32 %[[numThreads]])
   nvvm.barrier id = %barID number_of_threads = %numberOfThreads
   llvm.return
 }