[MLIR][OpenMP] Add cleanup region to omp.declare_reduction (#87377)

Currently, by-ref reductions will allocate the per-thread reduction
variable in the initialization region. Adding a cleanup region allows
that allocation to be undone. This will allow flang to support reduction
of arrays stored on the heap.

This conflation of allocation and initialization in the initialization
region should be fixed in the future to better match the OpenMP standard,
but that is beyond the scope of this patch.
This commit is contained in:
Tom Eccles
2024-04-04 11:19:42 +01:00
committed by GitHub
parent dbd6eb6779
commit cc34ad91f0
7 changed files with 300 additions and 25 deletions

View File

@@ -2135,8 +2135,8 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
let summary = "declares a reduction kind";
let description = [{
Declares an OpenMP reduction kind. This requires two mandatory and one
optional region.
Declares an OpenMP reduction kind. This requires two mandatory and two
optional regions.
1. The initializer region specifies how to initialize the thread-local
reduction value. This is usually the neutral element of the reduction.
@@ -2149,6 +2149,10 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
3. The atomic reduction region is optional and specifies how two values
can be combined atomically given local accumulator variables. It is
expected to store the combined value in the first accumulator variable.
4. The cleanup region is optional and specifies how to clean up any memory
allocated by the initializer region. The region has an argument that
contains the value of the thread-local reduction accumulator. This will
be executed after the reduction has completed.
Note that the MLIR type system does not allow for type-polymorphic
reductions. Separate reduction declarations should be created for different
@@ -2163,12 +2167,14 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
let regions = (region AnyRegion:$initializerRegion,
AnyRegion:$reductionRegion,
AnyRegion:$atomicReductionRegion);
AnyRegion:$atomicReductionRegion,
AnyRegion:$cleanupRegion);
let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword "
"`init` $initializerRegion "
"`combiner` $reductionRegion "
"custom<AtomicReductionRegion>($atomicReductionRegion)";
"custom<AtomicReductionRegion>($atomicReductionRegion) "
"custom<CleanupReductionRegion>($cleanupRegion)";
let extraClassDeclaration = [{
PointerLikeType getAccumulatorType() {

View File

@@ -1538,6 +1538,21 @@ static void printAtomicReductionRegion(OpAsmPrinter &printer,
printer.printRegion(region);
}
/// Parses the optional `cleanup` region of an omp.declare_reduction op.
/// The region is introduced by the `cleanup` keyword; if the keyword is
/// absent the region is simply left empty and parsing still succeeds.
static ParseResult parseCleanupReductionRegion(OpAsmParser &parser,
Region &region) {
// No `cleanup` keyword: the optional region was omitted, which is fine.
if (parser.parseOptionalKeyword("cleanup"))
return success();
return parser.parseRegion(region);
}
/// Prints the optional `cleanup` region of an omp.declare_reduction op,
/// prefixed with the `cleanup` keyword. Prints nothing when the region is
/// empty, matching the region's optional custom parser.
static void printCleanupReductionRegion(OpAsmPrinter &printer,
DeclareReductionOp op, Region &region) {
if (region.empty())
return;
printer << "cleanup ";
printer.printRegion(region);
}
LogicalResult DeclareReductionOp::verifyRegions() {
if (getInitializerRegion().empty())
return emitOpError() << "expects non-empty initializer region";
@@ -1571,21 +1586,29 @@ LogicalResult DeclareReductionOp::verifyRegions() {
"of the reduction type";
}
if (getAtomicReductionRegion().empty())
return success();
if (!getAtomicReductionRegion().empty()) {
Block &atomicReductionEntryBlock = getAtomicReductionRegion().front();
if (atomicReductionEntryBlock.getNumArguments() != 2 ||
atomicReductionEntryBlock.getArgumentTypes()[0] !=
atomicReductionEntryBlock.getArgumentTypes()[1])
return emitOpError() << "expects atomic reduction region with two "
"arguments of the same type";
auto ptrType = llvm::dyn_cast<PointerLikeType>(
atomicReductionEntryBlock.getArgumentTypes()[0]);
if (!ptrType ||
(ptrType.getElementType() && ptrType.getElementType() != getType()))
return emitOpError() << "expects atomic reduction region arguments to "
"be accumulators containing the reduction type";
}
if (getCleanupRegion().empty())
return success();
Block &cleanupEntryBlock = getCleanupRegion().front();
if (cleanupEntryBlock.getNumArguments() != 1 ||
cleanupEntryBlock.getArgument(0).getType() != getType())
return emitOpError() << "expects cleanup region with one argument "
"of the reduction type";
Block &atomicReductionEntryBlock = getAtomicReductionRegion().front();
if (atomicReductionEntryBlock.getNumArguments() != 2 ||
atomicReductionEntryBlock.getArgumentTypes()[0] !=
atomicReductionEntryBlock.getArgumentTypes()[1])
return emitOpError() << "expects atomic reduction region with two "
"arguments of the same type";
auto ptrType = llvm::dyn_cast<PointerLikeType>(
atomicReductionEntryBlock.getArgumentTypes()[0]);
if (!ptrType ||
(ptrType.getElementType() && ptrType.getElementType() != getType()))
return emitOpError() << "expects atomic reduction region arguments to "
"be accumulators containing the reduction type";
return success();
}

View File

@@ -877,6 +877,32 @@ static void collectReductionInfo(
}
}
/// Inlines the cleanup region (if any) of each DeclareReductionOp in
/// \p reductionDecls into the LLVM IR being built, once per private
/// reduction variable. \p privateReductionVariables holds, per reduction,
/// the thread-local accumulator that is passed as the cleanup region's
/// single block argument. Returns failure if inlining any region fails.
static LogicalResult inlineReductionCleanup(
llvm::SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
llvm::ArrayRef<llvm::Value *> privateReductionVariables,
LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder) {
for (auto [i, reductionDecl] : llvm::enumerate(reductionDecls)) {
Region &cleanupRegion = reductionDecl.getCleanupRegion();
// The cleanup region is optional; nothing to do when it was omitted.
if (cleanupRegion.empty())
continue;
// Map the region's single block argument to this thread's private
// reduction variable before inlining the region body.
Block &entry = cleanupRegion.front();
moduleTranslation.mapValue(entry.getArgument(0),
privateReductionVariables[i]);
if (failed(inlineConvertOmpRegions(cleanupRegion, "omp.reduction.cleanup",
builder, moduleTranslation)))
return failure();
// Clear block argument mapping in case it needs to be re-created with a
// different source for another use of the same reduction decl (the same
// declaration may be referenced by several reduction clauses).
moduleTranslation.forgetMapping(cleanupRegion);
}
return success();
}
/// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -1072,7 +1098,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
tempTerminator->eraseFromParent();
builder.restoreIP(nextInsertionPoint);
return success();
// after the workshare loop, deallocate private reduction variables
return inlineReductionCleanup(reductionDecls, privateReductionVariables,
moduleTranslation, builder);
}
/// A RAII class that on construction replaces the region arguments of the
@@ -1125,13 +1153,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LogicalResult bodyGenStatus = success();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
// Collect reduction declarations
SmallVector<omp::DeclareReductionOp> reductionDecls;
collectReductionDecls(opInst, reductionDecls);
// Collect reduction declarations
SmallVector<omp::DeclareReductionOp> reductionDecls;
collectReductionDecls(opInst, reductionDecls);
SmallVector<llvm::Value *> privateReductionVariables;
auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
// Allocate reduction vars
SmallVector<llvm::Value *> privateReductionVariables;
DenseMap<Value, llvm::Value *> reductionVariableMap;
if (!isByRef) {
allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
@@ -1331,7 +1359,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
// TODO: Perform finalization actions for variables. This has to be
// called for variables which have destructors/finalizers.
auto finiCB = [&](InsertPointTy codeGenIP) {};
auto finiCB = [&](InsertPointTy codeGenIP) {
InsertPointTy oldIP = builder.saveIP();
builder.restoreIP(codeGenIP);
// if the reduction has a cleanup region, inline it here to finalize the
// reduction variables
if (failed(inlineReductionCleanup(reductionDecls, privateReductionVariables,
moduleTranslation, builder)))
bodyGenStatus = failure();
builder.restoreIP(oldIP);
};
llvm::Value *ifCond = nullptr;
if (auto ifExprVar = opInst.getIfExprVar())

View File

@@ -436,6 +436,25 @@ atomic {
// -----
// expected-error @below {{op expects cleanup region with one argument of the reduction type}}
omp.declare_reduction @add_f32 : f32
init {
^bb0(%arg: f32):
%0 = arith.constant 0.0 : f32
omp.yield (%0 : f32)
}
combiner {
^bb1(%arg0: f32, %arg1: f32):
%1 = arith.addf %arg0, %arg1 : f32
omp.yield (%1 : f32)
}
cleanup {
^bb0(%arg: f64):
omp.yield
}
// -----
func.func @foo(%lb : index, %ub : index, %step : index) {
%c1 = arith.constant 1 : i32
%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr

View File

@@ -603,6 +603,8 @@ func.func @omp_target_pretty(%if_cond : i1, %device : si32, %num_threads : i32)
// CHECK: atomic
// CHECK: ^{{.+}}(%{{.+}}: !llvm.ptr, %{{.+}}: !llvm.ptr):
// CHECK: omp.yield
// CHECK: cleanup
// CHECK: omp.yield
omp.declare_reduction @add_f32 : f32
init {
^bb0(%arg: f32):
@@ -620,6 +622,10 @@ atomic {
llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
omp.yield
}
cleanup {
^bb0(%arg: f32):
omp.yield
}
// CHECK-LABEL: func @wsloop_reduction
func.func @wsloop_reduction(%lb : index, %ub : index, %step : index) {
@@ -789,6 +795,7 @@ combiner {
omp.yield (%1 : f32)
}
// CHECK-NOT: atomic
// CHECK-NOT: cleanup
// CHECK-LABEL: func @wsloop_reduction2
func.func @wsloop_reduction2(%lb : index, %ub : index, %step : index) {
@@ -2088,6 +2095,7 @@ func.func @opaque_pointers_atomic_rwu(%v: !llvm.ptr, %x: !llvm.ptr) {
// CHECK-LABEL: @opaque_pointers_reduction
// CHECK: atomic {
// CHECK-NEXT: ^{{[[:alnum:]]+}}(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr):
// CHECK-NOT: cleanup
omp.declare_reduction @opaque_pointers_reduction : f32
init {
^bb0(%arg: f32):

View File

@@ -0,0 +1,94 @@
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
// test a parallel reduction with a cleanup region
omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init {
^bb0(%arg0: !llvm.ptr):
%0 = llvm.mlir.constant(0 : i32) : i32
%c4 = llvm.mlir.constant(4 : i64) : i64
%2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr
llvm.store %0, %2 : i32, !llvm.ptr
omp.yield(%2 : !llvm.ptr)
} combiner {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%0 = llvm.load %arg0 : !llvm.ptr -> i32
%1 = llvm.load %arg1 : !llvm.ptr -> i32
%2 = llvm.add %0, %1 : i32
llvm.store %2, %arg0 : i32, !llvm.ptr
omp.yield(%arg0 : !llvm.ptr)
} cleanup {
^bb0(%arg0: !llvm.ptr):
llvm.call @free(%arg0) : (!llvm.ptr) -> ()
omp.yield
}
// CHECK-LABEL: @main
llvm.func @main() {
%0 = llvm.mlir.constant(-1 : i32) : i32
%1 = llvm.mlir.addressof @i : !llvm.ptr
%2 = llvm.mlir.addressof @j : !llvm.ptr
omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) {
llvm.store %0, %arg0 : i32, !llvm.ptr
llvm.store %0, %arg1 : i32, !llvm.ptr
omp.terminator
}
llvm.return
}
llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
llvm.return %0 : i32
}
llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
llvm.return %0 : i32
}
llvm.func @malloc(%arg0 : i64) -> !llvm.ptr
llvm.func @free(%arg0 : !llvm.ptr) -> ()
// CHECK: %{{.+}} =
// Call to the outlined function.
// CHECK: call void {{.*}} @__kmpc_fork_call
// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
// Outlined function.
// CHECK: define internal void @[[OUTLINED]]
// Private reduction variable and its initialization.
// CHECK: %tid.addr.local = alloca i32
// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4)
// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr
// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]]
// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4)
// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr
// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]]
// Call to the reduction function.
// CHECK: call i32 @__kmpc_reduce
// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
// Non-atomic reduction:
// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]]
// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i
// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]]
// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]]
// CHECK: store i32 %[[SUM_I]], ptr @i
// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]]
// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j
// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]]
// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]]
// CHECK: store i32 %[[SUM_J]], ptr @j
// CHECK: call void @__kmpc_end_reduce
// CHECK: br label %[[FINALIZE:.+]]
// CHECK: [[FINALIZE]]:
// CHECK: br label %[[OMP_FINALIZE:.+]]
// Cleanup region:
// CHECK: [[OMP_FINALIZE]]:
// CHECK: call void @free(ptr %[[PRIV_PTR_I]])
// CHECK: call void @free(ptr %[[PRIV_PTR_J]])
// Reduction function.
// CHECK: define internal void @[[REDFUNC]]
// CHECK: add i32

View File

@@ -0,0 +1,86 @@
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
// test a wsloop reduction with a cleanup region
omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init {
^bb0(%arg0: !llvm.ptr):
%0 = llvm.mlir.constant(0 : i32) : i32
%c4 = llvm.mlir.constant(4 : i64) : i64
%2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr
llvm.store %0, %2 : i32, !llvm.ptr
omp.yield(%2 : !llvm.ptr)
} combiner {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%0 = llvm.load %arg0 : !llvm.ptr -> i32
%1 = llvm.load %arg1 : !llvm.ptr -> i32
%2 = llvm.add %0, %1 : i32
llvm.store %2, %arg0 : i32, !llvm.ptr
omp.yield(%arg0 : !llvm.ptr)
} cleanup {
^bb0(%arg0: !llvm.ptr):
llvm.call @free(%arg0) : (!llvm.ptr) -> ()
omp.yield
}
// CHECK-LABEL: @main
llvm.func @main() {
%0 = llvm.mlir.constant(-1 : i32) : i32
%1 = llvm.mlir.addressof @i : !llvm.ptr
%2 = llvm.mlir.addressof @j : !llvm.ptr
%loop_ub = llvm.mlir.constant(9 : i32) : i32
%loop_lb = llvm.mlir.constant(0 : i32) : i32
%loop_step = llvm.mlir.constant(1 : i32) : i32
omp.wsloop byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
llvm.store %0, %arg0 : i32, !llvm.ptr
llvm.store %0, %arg1 : i32, !llvm.ptr
omp.terminator
}
llvm.return
}
llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
llvm.return %0 : i32
}
llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
llvm.return %0 : i32
}
llvm.func @malloc(%arg0 : i64) -> !llvm.ptr
llvm.func @free(%arg0 : !llvm.ptr) -> ()
// Private reduction variable and its initialization.
// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4)
// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr
// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]]
// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4)
// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr
// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]]
// Call to the reduction function.
// CHECK: call i32 @__kmpc_reduce
// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
// Weirdly the finalization block is generated before the reduction blocks:
// CHECK: [[FINALIZE:.+]]:
// CHECK: call void @__kmpc_barrier
// CHECK: call void @free(ptr %[[PRIV_PTR_I]])
// CHECK: call void @free(ptr %[[PRIV_PTR_J]])
// CHECK: ret void
// Non-atomic reduction:
// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]]
// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i
// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]]
// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]]
// CHECK: store i32 %[[SUM_I]], ptr @i
// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]]
// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j
// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]]
// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]]
// CHECK: store i32 %[[SUM_J]], ptr @j
// CHECK: call void @__kmpc_end_reduce
// CHECK: br label %[[FINALIZE]]
// Reduction function.
// CHECK: define internal void @[[REDFUNC]]
// CHECK: add i32