[flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)

If a `do concurrent` loop is offloaded then there should be no CUDA data
transfer in it. Update the semantic and lowering to take that into
account.

`AssignmentChecker` has to be put into a separate pass because the
checkers in `SemanticsVisitor` cannot have the same `Enter/Leave`
functions. The `DoForallChecker` already has `Eneter/Leave` functions
for the `DoConstruct`.
This commit is contained in:
Valentin Clement (バレンタイン クレメン)
2025-07-08 10:52:15 -07:00
committed by GitHub
parent 1e3f6a6c4f
commit 46caad52ac
10 changed files with 109 additions and 72 deletions

View File

@@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);
bool isCUDADeviceContext(mlir::Operation *op);
bool isCUDADeviceContext(mlir::Region &);
bool isCUDADeviceContext(mlir::Region &,
bool isDoConcurrentOffloadEnabled = false);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);

View File

@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
InaccessibleDeferredOverride, CudaWarpMatchFunction)
InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload)
// Portability and suspicious usage warnings
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,

View File

@@ -4886,7 +4886,10 @@ private:
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
bool isInDeviceContext = cuf::isCUDADeviceContext(
builder.getRegion(),
getFoldingContext().languageFeatures().IsEnabled(
Fortran::common::LanguageFeature::DoConcurrentOffload));
bool isCUDATransfer =
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;

View File

@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {
// for it.
// If the insertion point is inside an OpenACC region op, it is considered
// device context.
bool cuf::isCUDADeviceContext(mlir::Region &region) {
bool cuf::isCUDADeviceContext(mlir::Region &region,
bool isDoConcurrentOffloadEnabled) {
if (region.getParentOfType<cuf::KernelOp>())
return true;
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region &region) {
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
if (isDoConcurrentOffloadEnabled &&
region.getParentOfType<fir::DoConcurrentLoopOp>())
return true;
return false;
}

View File

@@ -42,7 +42,6 @@ public:
void Analyze(const parser::AssignmentStmt &);
void Analyze(const parser::PointerAssignmentStmt &);
void Analyze(const parser::ConcurrentControl &);
int deviceConstructDepth_{0};
SemanticsContext &context() { return context_; }
private:
@@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
if (whereDepth_ > 0) {
CheckShape(lhsLoc, &lhs);
}
if (context_.foldingContext().languageFeatures().IsEnabled(
common::LanguageFeature::CUDA)) {
const auto &scope{context_.FindScope(lhsLoc)};
const Scope &progUnit{GetProgramUnitContaining(scope)};
if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) {
if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) &&
Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) {
if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 &&
GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 &&
GetNbOfCUDADeviceSymbols(rhs) == 1)
return; // This is a special case handled on the host.
context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
}
}
}
}
}
@@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) {
void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) {
context_.value().PopWhereContext();
}
void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) {
++context_.value().deviceConstructDepth_;
}
void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) {
--context_.value().deviceConstructDepth_;
}
static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
const auto &beginBlockDirective =
std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
const auto &blockDirective =
std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
if (blockDirective.v == llvm::acc::ACCD_parallel ||
blockDirective.v == llvm::acc::ACCD_serial ||
blockDirective.v == llvm::acc::ACCD_kernels) {
return true;
}
return false;
}
void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) {
if (IsOpenACCComputeConstruct(x)) {
++context_.value().deviceConstructDepth_;
}
}
void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) {
if (IsOpenACCComputeConstruct(x)) {
--context_.value().deviceConstructDepth_;
}
}
void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) {
++context_.value().deviceConstructDepth_;
}
void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) {
--context_.value().deviceConstructDepth_;
}
void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) {
++context_.value().deviceConstructDepth_;
}
void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) {
--context_.value().deviceConstructDepth_;
}
} // namespace Fortran::semantics
template class Fortran::common::Indirection<

View File

@@ -46,14 +46,6 @@ public:
void Leave(const parser::EndWhereStmt &);
void Enter(const parser::MaskedElsewhereStmt &);
void Leave(const parser::MaskedElsewhereStmt &);
void Enter(const parser::CUFKernelDoConstruct &);
void Leave(const parser::CUFKernelDoConstruct &);
void Enter(const parser::OpenACCBlockConstruct &);
void Leave(const parser::OpenACCBlockConstruct &);
void Enter(const parser::OpenACCCombinedConstruct &);
void Leave(const parser::OpenACCCombinedConstruct &);
void Enter(const parser::OpenACCLoopConstruct &);
void Leave(const parser::OpenACCLoopConstruct &);
SemanticsContext &context();

View File

@@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
std::get<std::list<parser::CUFReduction>>(directive.t)) {
CheckReduce(context_, reduce);
}
inCUFKernelDoConstruct_ = true;
++deviceConstructDepth_;
}
static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
const auto &beginBlockDirective =
std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
const auto &blockDirective =
std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
if (blockDirective.v == llvm::acc::ACCD_parallel ||
blockDirective.v == llvm::acc::ACCD_serial ||
blockDirective.v == llvm::acc::ACCD_kernels) {
return true;
}
return false;
}
void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) {
inCUFKernelDoConstruct_ = false;
--deviceConstructDepth_;
}
void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) {
if (IsOpenACCComputeConstruct(x)) {
++deviceConstructDepth_;
}
}
void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) {
if (IsOpenACCComputeConstruct(x)) {
--deviceConstructDepth_;
}
}
void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) {
++deviceConstructDepth_;
}
void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) {
--deviceConstructDepth_;
}
void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) {
++deviceConstructDepth_;
}
void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) {
--deviceConstructDepth_;
}
void CUDAChecker::Enter(const parser::DoConstruct &x) {
if (x.IsDoConcurrent() &&
context_.foldingContext().languageFeatures().IsEnabled(
common::LanguageFeature::DoConcurrentOffload)) {
++deviceConstructDepth_;
}
}
void CUDAChecker::Leave(const parser::DoConstruct &x) {
if (x.IsDoConcurrent() &&
context_.foldingContext().languageFeatures().IsEnabled(
common::LanguageFeature::DoConcurrentOffload)) {
--deviceConstructDepth_;
}
}
void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()};
const auto &scope{context_.FindScope(lhsLoc)};
const Scope &progUnit{GetProgramUnitContaining(scope)};
if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) {
if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) {
return; // Data transfer with assignment is only perform on host.
}
@@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
context_.Say(lhsLoc,
"More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US);
}
if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) &&
Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) {
if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 &&
GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 &&
GetNbOfCUDADeviceSymbols(assign->rhs) == 1) {
return; // This is a special case handled on the host.
}
context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
}
}
} // namespace Fortran::semantics

View File

@@ -41,10 +41,18 @@ public:
void Enter(const parser::CUFKernelDoConstruct &);
void Leave(const parser::CUFKernelDoConstruct &);
void Enter(const parser::AssignmentStmt &);
void Enter(const parser::OpenACCBlockConstruct &);
void Leave(const parser::OpenACCBlockConstruct &);
void Enter(const parser::OpenACCCombinedConstruct &);
void Leave(const parser::OpenACCCombinedConstruct &);
void Enter(const parser::OpenACCLoopConstruct &);
void Leave(const parser::OpenACCLoopConstruct &);
void Enter(const parser::DoConstruct &);
void Leave(const parser::DoConstruct &);
private:
SemanticsContext &context_;
bool inCUFKernelDoConstruct_ = false;
int deviceConstructDepth_{0};
};
bool CanonicalizeCUDA(parser::Program &);

View File

@@ -403,3 +403,19 @@ end subroutine
! CHECK-LABEL: func.func @_QPsub20()
! CHECK-NOT: cuf.data_transfer
! CHECK: hlfir.assign
subroutine sub21()
real, allocatable,device:: a(:,:), b(:,:)
real:: s
integer:: i,j,N=16
allocate(a(N,N),b(N,N))
do concurrent(i=1:N, j=1:N) reduce(+:s)
b(i,j)=a(i,j)**2
s=s+b(i,j)
end do
end subroutine
! CHECK-LABEL: func.func @_QPsub21()
! CHECK: fir.do_concurrent.loop
! CHECK-NOT: cuf.data_transfer
! CHECK: hlfir.assign

View File

@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
llvm::cl::desc("enable CUDA Fortran"),
llvm::cl::init(false));
static llvm::cl::opt<bool>
enableDoConcurrentOffload("fdoconcurrent-offload",
llvm::cl::desc("enable do concurrent offload"),
llvm::cl::init(false));
static llvm::cl::opt<bool>
disableCUDAWarpFunction("fcuda-disable-warp-function",
llvm::cl::desc("Disable CUDA Warp Function"),
@@ -608,6 +613,11 @@ int main(int argc, char **argv) {
options.features.Enable(Fortran::common::LanguageFeature::CUDA);
}
if (enableDoConcurrentOffload) {
options.features.Enable(
Fortran::common::LanguageFeature::DoConcurrentOffload);
}
if (disableCUDAWarpFunction) {
options.features.Enable(
Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);