[flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)

If a `do concurrent` loop is offloaded, then there should be no CUDA data transfer in it. Update semantics and lowering to take that into account. `AssignmentChecker` has to be put into a separate pass because the checkers in `SemanticsVisitor` cannot have the same `Enter/Leave` functions. The `DoForallChecker` already has `Enter/Leave` functions for the `DoConstruct`.
2026-01-22 06:19:46 +08:00 · 2025-07-08 10:52:15 -07:00
parent 1e3f6a6c4f
commit 46caad52ac
10 changed files with 109 additions and 72 deletions
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
                                            mlir::SymbolTable &symTab);

 bool isCUDADeviceContext(mlir::Operation *op);
-bool isCUDADeviceContext(mlir::Region &);
+bool isCUDADeviceContext(mlir::Region &,
+                         bool isDoConcurrentOffloadEnabled = false);
 bool isRegisteredDeviceGlobal(fir::GlobalOp op);
 bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);

--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
    SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
    IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
    ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
-    InaccessibleDeferredOverride, CudaWarpMatchFunction)
+    InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload)

 // Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4886,7 +4886,10 @@ private:
    mlir::Location loc = getCurrentLocation();
    fir::FirOpBuilder &builder = getFirOpBuilder();

-    bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
+    bool isInDeviceContext = cuf::isCUDADeviceContext(
+        builder.getRegion(),
+        getFoldingContext().languageFeatures().IsEnabled(
+            Fortran::common::LanguageFeature::DoConcurrentOffload));

    bool isCUDATransfer =
        IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {
 // for it.
 // If the insertion point is inside an OpenACC region op, it is considered
 // device context.
-bool cuf::isCUDADeviceContext(mlir::Region &region) {
+bool cuf::isCUDADeviceContext(mlir::Region &region,
+                              bool isDoConcurrentOffloadEnabled) {
  if (region.getParentOfType<cuf::KernelOp>())
    return true;
  if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region &region) {
             cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
    }
  }
+  if (isDoConcurrentOffloadEnabled &&
+      region.getParentOfType<fir::DoConcurrentLoopOp>())
+    return true;
  return false;
 }

--- a/flang/lib/Semantics/assignment.cpp
+++ b/flang/lib/Semantics/assignment.cpp
@@ -42,7 +42,6 @@ public:
  void Analyze(const parser::AssignmentStmt &);
  void Analyze(const parser::PointerAssignmentStmt &);
  void Analyze(const parser::ConcurrentControl &);
-  int deviceConstructDepth_{0};
  SemanticsContext &context() { return context_; }

 private:
@@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
    if (whereDepth_ > 0) {
      CheckShape(lhsLoc, &lhs);
    }
-    if (context_.foldingContext().languageFeatures().IsEnabled(
-            common::LanguageFeature::CUDA)) {
-      const auto &scope{context_.FindScope(lhsLoc)};
-      const Scope &progUnit{GetProgramUnitContaining(scope)};
-      if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) {
-        if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) &&
-            Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) {
-          if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 &&
-              GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 &&
-              GetNbOfCUDADeviceSymbols(rhs) == 1)
-            return; // This is a special case handled on the host.
-          context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
-        }
-      }
-    }
  }
 }

@@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) {
 void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) {
  context_.value().PopWhereContext();
 }
-void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) {
-  ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) {
-  --context_.value().deviceConstructDepth_;
-}
-static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
-  const auto &beginBlockDirective =
-      std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
-  const auto &blockDirective =
-      std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
-  if (blockDirective.v == llvm::acc::ACCD_parallel ||
-      blockDirective.v == llvm::acc::ACCD_serial ||
-      blockDirective.v == llvm::acc::ACCD_kernels) {
-    return true;
-  }
-  return false;
-}
-void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) {
-  if (IsOpenACCComputeConstruct(x)) {
-    ++context_.value().deviceConstructDepth_;
-  }
-}
-void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) {
-  if (IsOpenACCComputeConstruct(x)) {
-    --context_.value().deviceConstructDepth_;
-  }
-}
-void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) {
-  ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) {
-  --context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) {
-  ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) {
-  --context_.value().deviceConstructDepth_;
-}

 } // namespace Fortran::semantics
 template class Fortran::common::Indirection<
--- a/flang/lib/Semantics/assignment.h
+++ b/flang/lib/Semantics/assignment.h
@@ -46,14 +46,6 @@ public:
  void Leave(const parser::EndWhereStmt &);
  void Enter(const parser::MaskedElsewhereStmt &);
  void Leave(const parser::MaskedElsewhereStmt &);
-  void Enter(const parser::CUFKernelDoConstruct &);
-  void Leave(const parser::CUFKernelDoConstruct &);
-  void Enter(const parser::OpenACCBlockConstruct &);
-  void Leave(const parser::OpenACCBlockConstruct &);
-  void Enter(const parser::OpenACCCombinedConstruct &);
-  void Leave(const parser::OpenACCCombinedConstruct &);
-  void Enter(const parser::OpenACCLoopConstruct &);
-  void Leave(const parser::OpenACCLoopConstruct &);

  SemanticsContext &context();

--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
      std::get<std::list<parser::CUFReduction>>(directive.t)) {
    CheckReduce(context_, reduce);
  }
-  inCUFKernelDoConstruct_ = true;
+  ++deviceConstructDepth_;
+}
+
+static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
+  const auto &beginBlockDirective =
+      std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
+  const auto &blockDirective =
+      std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
+  if (blockDirective.v == llvm::acc::ACCD_parallel ||
+      blockDirective.v == llvm::acc::ACCD_serial ||
+      blockDirective.v == llvm::acc::ACCD_kernels) {
+    return true;
+  }
+  return false;
 }

 void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) {
-  inCUFKernelDoConstruct_ = false;
+  --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) {
+  if (IsOpenACCComputeConstruct(x)) {
+    ++deviceConstructDepth_;
+  }
+}
+void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) {
+  if (IsOpenACCComputeConstruct(x)) {
+    --deviceConstructDepth_;
+  }
+}
+void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) {
+  ++deviceConstructDepth_;
+}
+void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) {
+  --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) {
+  ++deviceConstructDepth_;
+}
+void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) {
+  --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::DoConstruct &x) {
+  if (x.IsDoConcurrent() &&
+      context_.foldingContext().languageFeatures().IsEnabled(
+          common::LanguageFeature::DoConcurrentOffload)) {
+    ++deviceConstructDepth_;
+  }
+}
+void CUDAChecker::Leave(const parser::DoConstruct &x) {
+  if (x.IsDoConcurrent() &&
+      context_.foldingContext().languageFeatures().IsEnabled(
+          common::LanguageFeature::DoConcurrentOffload)) {
+    --deviceConstructDepth_;
+  }
 }

 void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
  auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()};
  const auto &scope{context_.FindScope(lhsLoc)};
  const Scope &progUnit{GetProgramUnitContaining(scope)};
-  if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) {
+  if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) {
    return; // Data transfer with assignment is only perform on host.
  }

@@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
    context_.Say(lhsLoc,
        "More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US);
  }
+
+  if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) &&
+      Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) {
+    if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 &&
+        GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 &&
+        GetNbOfCUDADeviceSymbols(assign->rhs) == 1) {
+      return; // This is a special case handled on the host.
+    }
+    context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
+  }
 }

 } // namespace Fortran::semantics
--- a/flang/lib/Semantics/check-cuda.h
+++ b/flang/lib/Semantics/check-cuda.h
@@ -41,10 +41,18 @@ public:
  void Enter(const parser::CUFKernelDoConstruct &);
  void Leave(const parser::CUFKernelDoConstruct &);
  void Enter(const parser::AssignmentStmt &);
+  void Enter(const parser::OpenACCBlockConstruct &);
+  void Leave(const parser::OpenACCBlockConstruct &);
+  void Enter(const parser::OpenACCCombinedConstruct &);
+  void Leave(const parser::OpenACCCombinedConstruct &);
+  void Enter(const parser::OpenACCLoopConstruct &);
+  void Leave(const parser::OpenACCLoopConstruct &);
+  void Enter(const parser::DoConstruct &);
+  void Leave(const parser::DoConstruct &);

 private:
  SemanticsContext &context_;
-  bool inCUFKernelDoConstruct_ = false;
+  int deviceConstructDepth_{0};
 };

 bool CanonicalizeCUDA(parser::Program &);
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -403,3 +403,19 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPsub20()
 ! CHECK-NOT: cuf.data_transfer
 ! CHECK: hlfir.assign
+
+subroutine sub21()
+  real, allocatable,device:: a(:,:), b(:,:)
+  real:: s
+  integer:: i,j,N=16
+  allocate(a(N,N),b(N,N))
+  do concurrent(i=1:N, j=1:N) reduce(+:s)
+    b(i,j)=a(i,j)**2
+    s=s+b(i,j)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub21()
+! CHECK: fir.do_concurrent.loop
+! CHECK-NOT: cuf.data_transfer
+! CHECK: hlfir.assign
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
                                      llvm::cl::desc("enable CUDA Fortran"),
                                      llvm::cl::init(false));

+static llvm::cl::opt<bool>
+    enableDoConcurrentOffload("fdoconcurrent-offload",
+                              llvm::cl::desc("enable do concurrent offload"),
+                              llvm::cl::init(false));
+
 static llvm::cl::opt<bool>
    disableCUDAWarpFunction("fcuda-disable-warp-function",
                            llvm::cl::desc("Disable CUDA Warp Function"),
@@ -608,6 +613,11 @@ int main(int argc, char **argv) {
    options.features.Enable(Fortran::common::LanguageFeature::CUDA);
  }

+  if (enableDoConcurrentOffload) {
+    options.features.Enable(
+        Fortran::common::LanguageFeature::DoConcurrentOffload);
+  }
+
  if (disableCUDAWarpFunction) {
    options.features.Enable(
        Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);