[MLIR][NVVM] Update mbarrier.arrive.expect_tx Op (#169922)

This patch updates the mbarrier.arrive.expect_tx Op.
It also adds an Op for its arrive_drop version.

* No change in the existing inline-asm lowering.
   This functionality continues to work as is.
* An optional return value is added for shared_cta space.
* The scope and semantics are added as attributes.
* Inline-PTX lowering is available when `predicate` is provided.
  Otherwise, the Op lowers to intrinsics.
* lit tests are added to verify the lowering to intrinsics.
* Specific negative tests are added to check the invalid cases for
inline-ptx lowering.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
This commit is contained in:
Durgadoss R
2025-12-01 22:19:34 +05:30
committed by GitHub
parent f3cce97ba7
commit fddf7b0510
7 changed files with 401 additions and 19 deletions

View File

@@ -889,10 +889,7 @@ def NVVM_MBarrierArriveDropNocompleteOp : NVVM_Op<"mbarrier.arrive_drop.nocomple
}];
}
def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
Arguments<(ins
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
I32:$txcount, PtxPredicate:$predicate)> {
def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx"> {
let summary = "MBarrier Arrive with Expected Transaction Count";
let description = [{
The `nvvm.mbarrier.arrive.expect_tx` operation performs an expect-tx operation
@@ -903,11 +900,11 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
threads within the CTA. When other threads perform corresponding acquire operations
(like 'mbarrier.test.wait'), they synchronize with this release pattern.
This operation first performs an expect-tx operation with the specified transaction
count, then performs an arrive-on operation with an implicit count of 1. The
expect-tx operation increases the tx-count of the *mbarrier object* by the specified
expectCount value, setting the current phase to expect and tracks the completion
of additional asynchronous transactions.
This operation first performs an expect-tx operation with the specified transaction
count, then performs an arrive-on operation with an implicit count of 1. The
expect-tx operation increases the expect-count of the *mbarrier object* by the
specified value (i.e. `txcount`), setting the current phase to expect and track
the completion of additional asynchronous transactions.
The operation takes the following operands:
- `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic
@@ -915,11 +912,86 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
- `txcount`: An unsigned integer specifying the expected transaction count
for the expect-tx operation. This represents the number of asynchronous transactions
expected to complete before the barrier phase completes.
- `predicate`: Optional predicate for conditional execution.
- `scope`: This specifies the set of threads that directly observe the memory
synchronizing effect of the `mbarrier.test.wait` operation.
- `relaxed`: When set to true, the `arrive` operation has relaxed memory semantics
and does not provide any ordering or visibility guarantees.
- `predicate`: Optional predicate for conditional execution used only when lowering to
inline-ptx.
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
}];
let results = (outs Optional<I64>:$res);
let arguments = (ins
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
I32:$txcount,
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
DefaultValuedAttr<BoolAttr, "false">:$relaxed,
PtxPredicate:$predicate);
let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands) (`->` type($res)^)?";
let hasVerifier = 1;
let extraClassDeclaration = [{
bool hasIntrinsic() { return !getPredicate(); }
bool getAsmValues(RewriterBase &rewriter,
llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues);
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];
string llvmBuilder = [{
auto [id, args] = NVVM::MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
if (addrSpace != NVVM::NVVMMemorySpace::SharedCluster)
$res = createIntrinsicCall(builder, id, args);
else
createIntrinsicCall(builder, id, args);
}];
}
def NVVM_MBarrierArriveDropExpectTxOp : NVVM_Op<"mbarrier.arrive_drop.expect_tx"> {
let summary = "MBarrier arrive_drop with expected transaction count";
let description = [{
The `nvvm.mbarrier.arrive_drop.expect_tx` operation is similar to the
`nvvm.mbarrier.arrive.expect_tx` operation except that it performs an
`arrive_drop` operation instead of only an `arrive` operation.
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop)
}];
let results = (outs Optional<I64>:$res);
let arguments = (ins
AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared, LLVM_PointerSharedCluster]>:$addr,
I32:$txcount,
DefaultValuedAttr<MemScopeKindAttr, "MemScopeKind::CTA">:$scope,
DefaultValuedAttr<BoolAttr, "false">:$relaxed);
let assemblyFormat = "$addr `,` $txcount attr-dict `:` type(operands) (`->` type($res)^)?";
let hasVerifier = 1;
let extraClassDeclaration = [{
static mlir::NVVM::IDArgPair
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
llvm::IRBuilderBase& builder);
}];
string llvmBuilder = [{
auto [id, args] = NVVM::MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
*op, moduleTranslation, builder);
int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
if (addrSpace != NVVM::NVVMMemorySpace::SharedCluster)
$res = createIntrinsicCall(builder, id, args);
else
createIntrinsicCall(builder, id, args);
}];
let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
}
def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,

View File

@@ -923,7 +923,11 @@ struct NVGPUMBarrierArriveExpectTxLowering
adaptor.getMbarId(), rewriter);
Value txcount = truncToI32(b, adaptor.getTxcount());
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxOp>(
op, barrier, txcount, adaptor.getPredicate());
op, Type{}, // return-value is optional and is void by default
barrier, txcount, // barrier and txcount
NVVM::MemScopeKind::CTA, // default scope is CTA
false, // relaxed-semantics is false
adaptor.getPredicate());
return success();
}
};

View File

@@ -274,6 +274,34 @@ LogicalResult MBarrierArriveDropOp::verify() {
getRes());
}
LogicalResult MBarrierArriveExpectTxOp::verify() {
// The inline-ptx version of this Op does not support all features.
// With predicate, this Op lowers to inline-ptx. So, verify and
// error-out if there are unsupported features.
if (getPredicate()) {
if (getScope() != NVVM::MemScopeKind::CTA)
return emitError("mbarrier scope must be CTA when using predicate");
if (isPtrInSharedClusterSpace(getAddr()))
return emitError("mbarrier in shared_cluster space is not supported when "
"using predicate");
if (getRes())
return emitError("return-value is not supported when using predicate");
if (getRelaxed() == true)
return emitError("mbarrier with relaxed semantics is not supported when "
"using predicate");
}
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
getRes());
}
LogicalResult MBarrierArriveDropExpectTxOp::verify() {
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope(),
getRes());
}
LogicalResult MBarrierExpectTxOp::verify() {
return verifyMBarrierArriveLikeOp(getOperation(), getAddr(), getScope());
}
@@ -2576,6 +2604,87 @@ mlir::NVVM::IDArgPair MBarrierArriveDropOp::getIntrinsicIDAndArgs(
return {id, {mbar, count}};
}
bool MBarrierArriveExpectTxOp::getAsmValues(
RewriterBase &rewriter,
llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>>
&asmValues) {
// Add all the operands but not the attrs to the asmValues list.
// The attrs here are used to generate the right variants for
// intrinsics-lowering. So, we ignore them while generating inline-PTX.
for (auto val : getOperands())
asmValues.push_back({val, mlir::NVVM::PTXRegisterMod::Read});
return false;
}
mlir::NVVM::IDArgPair MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveExpectTxOp>(op);
bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
// bit-0: Space
// bit-1: Scope
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
// clang-format off
static constexpr llvm::Intrinsic::ID IDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_scope_cluster_space_cluster};
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_expect_tx_relaxed_scope_cluster_space_cluster};
// clang-format on
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
// Tidy-up the Intrinsic Args
llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
if (needCast)
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
return {id, {mbar, txcount}};
}
mlir::NVVM::IDArgPair MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveDropExpectTxOp>(op);
bool isClusterSpace = isPtrInSharedClusterSpace(thisOp.getAddr());
bool isClusterScope = thisOp.getScope() == NVVM::MemScopeKind::CLUSTER;
// bit-0: Space
// bit-1: Scope
size_t index = ((isClusterScope ? 1 : 0) << 1) | (isClusterSpace ? 1 : 0);
// clang-format off
static constexpr llvm::Intrinsic::ID IDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_scope_cluster_space_cluster};
static constexpr llvm::Intrinsic::ID relaxedIDs[] = {
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cta_space_cluster,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cta,
llvm::Intrinsic::nvvm_mbarrier_arrive_drop_expect_tx_relaxed_scope_cluster_space_cluster};
// clang-format on
auto id = thisOp.getRelaxed() ? relaxedIDs[index] : IDs[index];
// Tidy-up the Intrinsic Args
llvm::Value *txcount = mt.lookupValue(thisOp.getTxcount());
llvm::Value *mbar = mt.lookupValue(thisOp.getAddr());
bool needCast = isPtrInGenericSpace(thisOp.getAddr());
if (needCast)
mbar = castPtrToAddrSpace(builder, mbar, NVVMMemorySpace::Shared);
return {id, {mbar, txcount}};
}
mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op);

View File

@@ -16,8 +16,6 @@ llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %cou
// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
llvm.return
@@ -25,8 +23,6 @@ llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i3
// CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r"
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1
llvm.return
@@ -544,8 +540,8 @@ func.func @elect_one_leader_sync() {
// -----
// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
// CHECK-LABEL: @test_nvvm_prefetch
llvm.func @test_nvvm_prefetch(%desc : !llvm.ptr, %pred : i1) {
//CHECK: nvvm.prefetch tensormap, %{{.*}}
nvvm.prefetch tensormap, %desc : !llvm.ptr
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"

View File

@@ -0,0 +1,68 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
llvm.func @mbarrier_arrive_drop_expect_tx_generic(%barrier: !llvm.ptr, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_generic(ptr %0, i32 %1) {
// CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %3, i32 %1)
// CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1)
// CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %7, i32 %1)
// CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %9, i32 %1)
// CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 %1)
// CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %13, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
%0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i64
%1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr, i32 -> i64
%2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr, i32 -> i64
%3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr, i32 -> i64
%4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr, i32 -> i64
%5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr, i32 -> i64
llvm.return
}
llvm.func @mbarrier_arrive_drop_expect_tx_shared(%barrier: !llvm.ptr<3>, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared(ptr addrspace(3) %0, i32 %1) {
// CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
%0 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32 -> i64
%1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 -> i64
%2 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64
%3 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<3>, i32 -> i64
%4 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
%5 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
llvm.return
}
llvm.func @mbarrier_arrive_drop_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_drop_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) {
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.drop.expect.tx.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive_drop.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7>, i32
llvm.return
}

View File

@@ -0,0 +1,68 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
llvm.func @mbarrier_arrive_expect_tx_generic(%barrier: !llvm.ptr, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_expect_tx_generic(ptr %0, i32 %1) {
// CHECK-NEXT: %3 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %3, i32 %1)
// CHECK-NEXT: %5 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %5, i32 %1)
// CHECK-NEXT: %7 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %7, i32 %1)
// CHECK-NEXT: %9 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %10 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %9, i32 %1)
// CHECK-NEXT: %11 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %12 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %11, i32 %1)
// CHECK-NEXT: %13 = addrspacecast ptr %0 to ptr addrspace(3)
// CHECK-NEXT: %14 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %13, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
%0 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i64
%1 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr, i32 -> i64
%2 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr, i32 -> i64
%3 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr, i32 -> i64
%4 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr, i32 -> i64
%5 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr, i32 -> i64
llvm.return
}
llvm.func @mbarrier_arrive_expect_tx_shared(%barrier: !llvm.ptr<3>, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_expect_tx_shared(ptr addrspace(3) %0, i32 %1) {
// CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %4 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %5 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %6 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %7 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: %8 = call i64 @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cluster.space.cta(ptr addrspace(3) %0, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
%0 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<3>, i32 -> i64
%1 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<3>, i32 -> i64
%2 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32 -> i64
%3 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<3>, i32 -> i64
%4 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
%5 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<3>, i32 -> i64
llvm.return
}
llvm.func @mbarrier_arrive_expect_tx_shared_cluster(%barrier: !llvm.ptr<7>, %txcount : i32) {
// CHECK-LABEL: define void @mbarrier_arrive_expect_tx_shared_cluster(ptr addrspace(7) %0, i32 %1) {
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cta.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: call void @llvm.nvvm.mbarrier.arrive.expect.tx.relaxed.scope.cluster.space.cluster(ptr addrspace(7) %0, i32 %1)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {relaxed = true} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cta>, relaxed = true} : !llvm.ptr<7>, i32
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount {scope = #nvvm.mem_scope<cluster>, relaxed = true} : !llvm.ptr<7>, i32
llvm.return
}

View File

@@ -47,3 +47,68 @@ llvm.func @mbarrier_complete_tx_scope(%barrier: !llvm.ptr<3>, %tx_count: i32) {
nvvm.mbarrier.complete_tx %barrier, %tx_count {scope = #nvvm.mem_scope<sys>} : !llvm.ptr<3>, i32
llvm.return
}
// -----
llvm.func @mbarrier_arr_expect_tx(%barrier: !llvm.ptr<3>, %tx_count: i32) {
// expected-error @below {{mbarrier scope must be either CTA or Cluster}}
%1 = nvvm.mbarrier.arrive.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<gpu>} : !llvm.ptr<3>, i32 -> i64
llvm.return
}
// -----
llvm.func @mbarrier_arr_expect_tx_cluster(%barrier: !llvm.ptr<7>, %tx_count: i32) {
// expected-error @below {{mbarrier in shared_cluster space cannot return any value}}
%1 = nvvm.mbarrier.arrive.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32 -> i64
llvm.return
}
// -----
llvm.func @init_mbarrier_arrive_expect_tx_asm_ret(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
// expected-error @below {{return-value is not supported when using predicate}}
%1 = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1 -> i64
llvm.return
}
// -----
llvm.func @init_mbarrier_arrive_expect_tx_asm_relaxed(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
// expected-error @below {{mbarrier with relaxed semantics is not supported when using predicate}}
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred {relaxed = true} : !llvm.ptr<3>, i32, i1
llvm.return
}
// -----
llvm.func @init_mbarrier_arrive_expect_tx_asm_cta(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
// expected-error @below {{mbarrier scope must be CTA when using predicate}}
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred {scope = #nvvm.mem_scope<cluster>} : !llvm.ptr<3>, i32, i1
llvm.return
}
// -----
llvm.func @init_mbarrier_arrive_expect_tx_asm_cluster(%barrier : !llvm.ptr<7>, %txcount : i32, %pred : i1) {
// expected-error @below {{mbarrier in shared_cluster space is not supported when using predicate}}
nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr<7>, i32, i1
llvm.return
}
// -----
llvm.func @mbarrier_arr_drop_expect_tx(%barrier: !llvm.ptr<3>, %tx_count: i32) {
// expected-error @below {{mbarrier scope must be either CTA or Cluster}}
%1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<gpu>} : !llvm.ptr<3>, i32 -> i64
llvm.return
}
// -----
llvm.func @mbarrier_arr_drop_expect_tx_cluster(%barrier: !llvm.ptr<7>, %tx_count: i32) {
// expected-error @below {{mbarrier in shared_cluster space cannot return any value}}
%1 = nvvm.mbarrier.arrive_drop.expect_tx %barrier, %tx_count {scope = #nvvm.mem_scope<cta>} : !llvm.ptr<7>, i32 -> i64
llvm.return
}