diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index f33942b3c7c0..457451886e14 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2135,8 +2135,8 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, let summary = "declares a reduction kind"; let description = [{ - Declares an OpenMP reduction kind. This requires two mandatory and one - optional region. + Declares an OpenMP reduction kind. This requires two mandatory and two + optional regions. 1. The initializer region specifies how to initialize the thread-local reduction value. This is usually the neutral element of the reduction. @@ -2149,6 +2149,10 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, 3. The atomic reduction region is optional and specifies how two values can be combined atomically given local accumulator variables. It is expected to store the combined value in the first accumulator variable. + 4. The cleanup region is optional and specifies how to clean up any memory + allocated by the initializer region. The region has an argument that + contains the value of the thread-local reduction accumulator. This will + be executed after the reduction has completed. Note that the MLIR type system does not allow for type-polymorphic reductions. 
Separate reduction declarations should be created for different @@ -2163,12 +2167,14 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, let regions = (region AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, - AnyRegion:$atomicReductionRegion); + AnyRegion:$atomicReductionRegion, + AnyRegion:$cleanupRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " "`init` $initializerRegion " "`combiner` $reductionRegion " - "custom<AtomicReductionRegion>($atomicReductionRegion)"; + "custom<AtomicReductionRegion>($atomicReductionRegion) " + "custom<CleanupReductionRegion>($cleanupRegion)"; let extraClassDeclaration = [{ PointerLikeType getAccumulatorType() { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index bf5875071e0d..a04343154a4d 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1538,6 +1538,21 @@ static void printAtomicReductionRegion(OpAsmPrinter &printer, printer.printRegion(region); } +static ParseResult parseCleanupReductionRegion(OpAsmParser &parser, + Region &region) { + if (parser.parseOptionalKeyword("cleanup")) + return success(); + return parser.parseRegion(region); +} + +static void printCleanupReductionRegion(OpAsmPrinter &printer, + DeclareReductionOp op, Region &region) { + if (region.empty()) + return; + printer << "cleanup "; + printer.printRegion(region); +} + LogicalResult DeclareReductionOp::verifyRegions() { if (getInitializerRegion().empty()) return emitOpError() << "expects non-empty initializer region"; @@ -1571,21 +1586,29 @@ LogicalResult DeclareReductionOp::verifyRegions() { "of the reduction type"; } - if (getAtomicReductionRegion().empty()) - return success(); + if (!getAtomicReductionRegion().empty()) { + Block &atomicReductionEntryBlock = getAtomicReductionRegion().front(); + if (atomicReductionEntryBlock.getNumArguments() != 2 || + atomicReductionEntryBlock.getArgumentTypes()[0] != + atomicReductionEntryBlock.getArgumentTypes()[1]) + return 
emitOpError() << "expects atomic reduction region with two " + "arguments of the same type"; + auto ptrType = llvm::dyn_cast<PointerLikeType>( + atomicReductionEntryBlock.getArgumentTypes()[0]); + if (!ptrType || + (ptrType.getElementType() && ptrType.getElementType() != getType())) + return emitOpError() << "expects atomic reduction region arguments to " + "be accumulators containing the reduction type"; + } + + if (getCleanupRegion().empty()) + return success(); + Block &cleanupEntryBlock = getCleanupRegion().front(); + if (cleanupEntryBlock.getNumArguments() != 1 || + cleanupEntryBlock.getArgument(0).getType() != getType()) + return emitOpError() << "expects cleanup region with one argument " + "of the reduction type"; - Block &atomicReductionEntryBlock = getAtomicReductionRegion().front(); - if (atomicReductionEntryBlock.getNumArguments() != 2 || - atomicReductionEntryBlock.getArgumentTypes()[0] != - atomicReductionEntryBlock.getArgumentTypes()[1]) - return emitOpError() << "expects atomic reduction region with two " - "arguments of the same type"; - auto ptrType = llvm::dyn_cast<PointerLikeType>( - atomicReductionEntryBlock.getArgumentTypes()[0]); - if (!ptrType || - (ptrType.getElementType() && ptrType.getElementType() != getType())) - return emitOpError() << "expects atomic reduction region arguments to " - "be accumulators containing the reduction type"; return success(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index c4bf6a20ebe7..08ec57803aff 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -877,6 +877,32 @@ static void collectReductionInfo( } } +/// handling of DeclareReductionOp's cleanup region +static LogicalResult inlineReductionCleanup( + llvm::SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls, + llvm::ArrayRef<llvm::Value *> privateReductionVariables, + LLVM::ModuleTranslation &moduleTranslation, 
llvm::IRBuilderBase &builder) { + for (auto [i, reductionDecl] : llvm::enumerate(reductionDecls)) { + Region &cleanupRegion = reductionDecl.getCleanupRegion(); + if (cleanupRegion.empty()) + continue; + + // map the argument to the cleanup region + Block &entry = cleanupRegion.front(); + moduleTranslation.mapValue(entry.getArgument(0), + privateReductionVariables[i]); + + if (failed(inlineConvertOmpRegions(cleanupRegion, "omp.reduction.cleanup", + builder, moduleTranslation))) + return failure(); + + // clear block argument mapping in case it needs to be re-created with a + // different source for another use of the same reduction decl + moduleTranslation.forgetMapping(cleanupRegion); + } + return success(); +} + /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, @@ -1072,7 +1098,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, tempTerminator->eraseFromParent(); builder.restoreIP(nextInsertionPoint); - return success(); + // after the workshare loop, deallocate private reduction variables + return inlineReductionCleanup(reductionDecls, privateReductionVariables, + moduleTranslation, builder); } /// A RAII class that on construction replaces the region arguments of the @@ -1125,13 +1153,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LogicalResult bodyGenStatus = success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { - // Collect reduction declarations - SmallVector<omp::DeclareReductionOp> reductionDecls; - collectReductionDecls(opInst, reductionDecls); + // Collect reduction declarations + SmallVector<omp::DeclareReductionOp> reductionDecls; + collectReductionDecls(opInst, reductionDecls); + SmallVector<llvm::Value *> privateReductionVariables; + auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { // Allocate reduction vars - SmallVector<llvm::Value *> 
privateReductionVariables; DenseMap reductionVariableMap; if (!isByRef) { allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP, @@ -1331,7 +1359,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // TODO: Perform finalization actions for variables. This has to be // called for variables which have destructors/finalizers. - auto finiCB = [&](InsertPointTy codeGenIP) {}; + auto finiCB = [&](InsertPointTy codeGenIP) { + InsertPointTy oldIP = builder.saveIP(); + builder.restoreIP(codeGenIP); + + // if the reduction has a cleanup region, inline it here to finalize the + // reduction variables + if (failed(inlineReductionCleanup(reductionDecls, privateReductionVariables, + moduleTranslation, builder))) + bodyGenStatus = failure(); + + builder.restoreIP(oldIP); + }; llvm::Value *ifCond = nullptr; if (auto ifExprVar = opInst.getIfExprVar()) diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index a00383cf4405..1134db77d5ba 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -436,6 +436,25 @@ atomic { // ----- +// expected-error @below {{op expects cleanup region with one argument of the reduction type}} +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +cleanup { +^bb0(%arg: f64): + omp.yield +} + +// ----- + func.func @foo(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 30ce77423005..e2c255c7a3cc 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -603,6 +603,8 @@ func.func @omp_target_pretty(%if_cond : i1, %device : si32, %num_threads : i32) // CHECK: atomic // CHECK: 
^{{.+}}(%{{.+}}: !llvm.ptr, %{{.+}}: !llvm.ptr): // CHECK: omp.yield +// CHECK: cleanup +// CHECK: omp.yield omp.declare_reduction @add_f32 : f32 init { ^bb0(%arg: f32): @@ -620,6 +622,10 @@ atomic { llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 omp.yield } +cleanup { +^bb0(%arg: f32): + omp.yield +} // CHECK-LABEL: func @wsloop_reduction func.func @wsloop_reduction(%lb : index, %ub : index, %step : index) { @@ -789,6 +795,7 @@ combiner { omp.yield (%1 : f32) } // CHECK-NOT: atomic +// CHECK-NOT: cleanup // CHECK-LABEL: func @wsloop_reduction2 func.func @wsloop_reduction2(%lb : index, %ub : index, %step : index) { @@ -2088,6 +2095,7 @@ func.func @opaque_pointers_atomic_rwu(%v: !llvm.ptr, %x: !llvm.ptr) { // CHECK-LABEL: @opaque_pointers_reduction // CHECK: atomic { // CHECK-NEXT: ^{{[[:alnum:]]+}}(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr): +// CHECK-NOT: cleanup omp.declare_reduction @opaque_pointers_reduction : f32 init { ^bb0(%arg: f32): diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir new file mode 100644 index 000000000000..9ae4c4ad392b --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir @@ -0,0 +1,94 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// test a parallel reduction with a cleanup region + + omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %c4 = llvm.mlir.constant(4 : i64) : i64 + %2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr + llvm.store %0, %2 : i32, !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + %1 = llvm.load %arg1 : !llvm.ptr -> i32 + %2 = llvm.add %0, %1 : i32 + llvm.store %2, %arg0 : i32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) + } cleanup { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) 
-> () + omp.yield + } + + // CHECK-LABEL: @main + llvm.func @main() { + %0 = llvm.mlir.constant(-1 : i32) : i32 + %1 = llvm.mlir.addressof @i : !llvm.ptr + %2 = llvm.mlir.addressof @j : !llvm.ptr + omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) { + llvm.store %0, %arg0 : i32, !llvm.ptr + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.func @malloc(%arg0 : i64) -> !llvm.ptr + llvm.func @free(%arg0 : !llvm.ptr) -> () + +// CHECK: %{{.+}} = +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %tid.addr.local = alloca i32 +// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]] +// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]] + +// Call to the reduction function. 
+// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + + +// Non-atomic reduction: +// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]] +// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i +// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]] +// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]] +// CHECK: store i32 %[[SUM_I]], ptr @i +// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]] +// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j +// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]] +// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]] +// CHECK: store i32 %[[SUM_J]], ptr @j +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: br label %[[OMP_FINALIZE:.+]] + +// Cleanup region: +// CHECK: [[OMP_FINALIZE]]: +// CHECK: call void @free(ptr %[[PRIV_PTR_I]]) +// CHECK: call void @free(ptr %[[PRIV_PTR_J]]) + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32 diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir new file mode 100644 index 000000000000..a1e17afa53e2 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir @@ -0,0 +1,86 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// test a wsloop reduction with a cleanup region + + omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %c4 = llvm.mlir.constant(4 : i64) : i64 + %2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr + llvm.store %0, %2 : i32, !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + %1 = llvm.load %arg1 : !llvm.ptr -> i32 + %2 = llvm.add %0, %1 : i32 + llvm.store %2, %arg0 : i32, !llvm.ptr + omp.yield(%arg0 
: !llvm.ptr) + } cleanup { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + + // CHECK-LABEL: @main + llvm.func @main() { + %0 = llvm.mlir.constant(-1 : i32) : i32 + %1 = llvm.mlir.addressof @i : !llvm.ptr + %2 = llvm.mlir.addressof @j : !llvm.ptr + %loop_ub = llvm.mlir.constant(9 : i32) : i32 + %loop_lb = llvm.mlir.constant(0 : i32) : i32 + %loop_step = llvm.mlir.constant(1 : i32) : i32 + omp.wsloop byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) { + llvm.store %0, %arg0 : i32, !llvm.ptr + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.func @malloc(%arg0 : i64) -> !llvm.ptr + llvm.func @free(%arg0 : !llvm.ptr) -> () + +// Private reduction variable and its initialization. +// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]] +// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]] + +// Call to the reduction function. 
+// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Weirdly the finalization block is generated before the reduction blocks: +// CHECK: [[FINALIZE:.+]]: +// CHECK: call void @__kmpc_barrier +// CHECK: call void @free(ptr %[[PRIV_PTR_I]]) +// CHECK: call void @free(ptr %[[PRIV_PTR_J]]) +// CHECK: ret void + +// Non-atomic reduction: +// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]] +// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i +// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]] +// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]] +// CHECK: store i32 %[[SUM_I]], ptr @i +// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]] +// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j +// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]] +// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]] +// CHECK: store i32 %[[SUM_J]], ptr @j +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE]] + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32