[Flang][OpenMP] Appropriately emit present/load/store in all cases in MapInfoFinalization (#150311)

Currently, we return early whenever we've already generated an
allocation for intermediate descriptor variables (required in certain
cases when we can't directly access the base address of a passes in
descriptor function argument due to HLFIR/FIR restrictions). This
unfortunately, skips over the presence check and load/store required to
set the intermediate descriptor allocations values/data. This is fine in
most cases, but if a function happens to have a series of branches with
seperate target regions capturing the same input argument, we'd emit the
present/load/store into the first branch with the first target inside of
it, the secondary (or any preceding) branches would not have the
present/load/store, this would lead to the subsequent mapped values in
that branch being empty and then leading to a memory access violation on
device.

The fix for the moment is to emit a present/load/store at the relevant
location of every target utilising the input argument, this likely will
also lead to fixing possible issues with the input argument being
manipulated inbetween target regions (primarily resizing, the data
should remain the same as we're just copying an address around, in
theory at least). There's possible optimizations/simplifications to emit
less load/stores such as by raising the load/store out of the branches
when we can, but I'm inclined to leave this sort of optimization to
lower level passes such as an LLVM pass (which very possibly already
covers it).
This commit is contained in:
agozillon
2025-07-25 16:15:54 +02:00
committed by GitHub
parent adb2421202
commit 73272d6fc6
3 changed files with 122 additions and 22 deletions

View File

@@ -137,32 +137,41 @@ class MapInfoFinalizationPass
!fir::factory::isOptionalArgument(descriptor.getDefiningOp()))
return descriptor;
mlir::Value &slot = localBoxAllocas[descriptor.getDefiningOp()];
if (slot) {
return slot;
mlir::Value &alloca = localBoxAllocas[descriptor.getDefiningOp()];
mlir::Location loc = boxMap->getLoc();
if (!alloca) {
// The fir::BoxOffsetOp only works with !fir.ref<!fir.box<...>> types, as
// allowing it to access non-reference box operations can cause some
// problematic SSA IR. However, in the case of assumed shape's the type
// is not a !fir.ref, in these cases to retrieve the appropriate
// !fir.ref<!fir.box<...>> to access the data we need to map we must
// perform an alloca and then store to it and retrieve the data from the
// new alloca.
mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
mlir::Block *allocaBlock = builder.getAllocaBlock();
assert(allocaBlock && "No alloca block found for this top level op");
builder.setInsertionPointToStart(allocaBlock);
mlir::Type allocaType = descriptor.getType();
if (fir::isBoxAddress(allocaType))
allocaType = fir::unwrapRefType(allocaType);
alloca = fir::AllocaOp::create(builder, loc, allocaType);
builder.restoreInsertionPoint(insPt);
}
// The fir::BoxOffsetOp only works with !fir.ref<!fir.box<...>> types, as
// allowing it to access non-reference box operations can cause some
// problematic SSA IR. However, in the case of assumed shape's the type
// is not a !fir.ref, in these cases to retrieve the appropriate
// !fir.ref<!fir.box<...>> to access the data we need to map we must
// perform an alloca and then store to it and retrieve the data from the new
// alloca.
mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
mlir::Block *allocaBlock = builder.getAllocaBlock();
mlir::Location loc = boxMap->getLoc();
assert(allocaBlock && "No alloca block found for this top level op");
builder.setInsertionPointToStart(allocaBlock);
mlir::Type allocaType = descriptor.getType();
if (fir::isBoxAddress(allocaType))
allocaType = fir::unwrapRefType(allocaType);
auto alloca = fir::AllocaOp::create(builder, loc, allocaType);
builder.restoreInsertionPoint(insPt);
// We should only emit a store if the passed in data is present, it is
// possible a user passes in no argument to an optional parameter, in which
// case we cannot store or we'll segfault on the emitted memcpy.
// TODO: We currently emit a present -> load/store every time we use a
// mapped value that requires a local allocation, this isn't the most
// efficient, although, it is more correct in a lot of situations. One
// such situation is emitting a this series of instructions in separate
// segments of a branch (e.g. two target regions in separate else/if branch
// mapping the same function argument), however, it would be nice to be able
// to optimize these situations e.g. raising the load/store out of the
// branch if possible. But perhaps this is best left to lower level
// optimisation passes.
auto isPresent =
fir::IsPresentOp::create(builder, loc, builder.getI1Type(), descriptor);
builder.genIfOp(loc, {}, isPresent, false)
@@ -171,7 +180,7 @@ class MapInfoFinalizationPass
fir::StoreOp::create(builder, loc, descriptor, alloca);
})
.end();
return slot = alloca;
return alloca;
}
/// Function that generates a FIR operation accessing the descriptor's

View File

@@ -0,0 +1,46 @@
!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
module mod
contains
subroutine foo(dt, switch)
implicit none
real(4), dimension(:), intent(inout) :: dt
logical, intent(in) :: switch
integer :: dim, idx
if (switch) then
!$omp target teams distribute parallel do
do idx = 1, 100
dt(idx) = 20
end do
else
!$omp target teams distribute parallel do
do idx = 1, 100
dt(idx) = 30
end do
end if
end subroutine foo
end module
! CHECK-LABEL: func.func @{{.*}}(
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "dt"},
! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope {{.*}}
! CHECK: fir.if %{{.*}} {
! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1
! CHECK: fir.if %[[VAL_2]] {
! CHECK: fir.store %[[VAL_1]]#1 to %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
! CHECK: }
! CHECK: %[[VAL_3:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>
! CHECK: %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_3]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>> {name = ""}
! CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.box<!fir.array<?xf32>>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_4]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) -> !fir.ref<!fir.array<?xf32>> {name = "dt"}
! CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}%[[VAL_5]] -> {{.*}}, %[[VAL_4]] -> {{.*}} : {{.*}}) {
! CHECK: } else {
! CHECK: %[[VAL_6:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1
! CHECK: fir.if %[[VAL_6]] {
! CHECK: fir.store %[[VAL_1]]#1 to %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
! CHECK: }
! CHECK: %[[VAL_7:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>
! CHECK: %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_7]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>> {name = ""}
! CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.box<!fir.array<?xf32>>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_8]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) -> !fir.ref<!fir.array<?xf32>> {name = "dt"}
! CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}, %[[VAL_9]] ->{{.*}}, %[[VAL_8]] -> {{.*}} : {{.*}}) {

View File

@@ -0,0 +1,45 @@
! OpenMP offloading test that checks we do not cause a segfault when mapping
! optional function arguments (present or otherwise). No results requiring
! checking other than that the program compiles and runs to completion with no
! error. This particular variation checks that we're correctly emitting the
! load/store in both branches mapping the input array.
! REQUIRES: flang, amdgpu
! RUN: %libomptarget-compile-fortran-run-and-check-generic
module reproducer_mod
contains
subroutine branching_target_call(dt, switch)
implicit none
real(4), dimension(:), intent(inout) :: dt
logical, intent(in) :: switch
integer :: dim, idx
dim = size(dt)
if (switch) then
!$omp target teams distribute parallel do
do idx = 1, dim
dt(idx) = 20
end do
else
!$omp target teams distribute parallel do
do idx = 1, dim
dt(idx) = 30
end do
end if
end subroutine branching_target_call
end module reproducer_mod
program reproducer
use reproducer_mod
implicit none
real(4), dimension(:), allocatable :: dt
integer :: n = 21312
integer :: i
allocate (dt(n))
call branching_target_call(dt, .FALSE.)
call branching_target_call(dt, .TRUE.)
print *, "PASSED"
end program reproducer
! CHECK: PASSED