mirror of
https://github.com/intel/llvm.git
synced 2026-01-22 06:19:46 +08:00
[flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)
If a `do concurrent` loop is offloaded, there should be no CUDA data transfer inside it. Update the semantic checks and lowering to take that into account. `AssignmentChecker` has to be put into a separate pass because the checkers in `SemanticsVisitor` cannot have the same `Enter/Leave` functions. The `DoForallChecker` already has `Enter/Leave` functions for the `DoConstruct`.
This commit is contained in:
committed by
GitHub
parent
1e3f6a6c4f
commit
46caad52ac
@@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
|
||||
mlir::SymbolTable &symTab);
|
||||
|
||||
bool isCUDADeviceContext(mlir::Operation *op);
|
||||
bool isCUDADeviceContext(mlir::Region &);
|
||||
bool isCUDADeviceContext(mlir::Region &,
|
||||
bool isDoConcurrentOffloadEnabled = false);
|
||||
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
|
||||
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
|
||||
SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
|
||||
IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
|
||||
ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
|
||||
InaccessibleDeferredOverride, CudaWarpMatchFunction)
|
||||
InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload)
|
||||
|
||||
// Portability and suspicious usage warnings
|
||||
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
|
||||
|
||||
@@ -4886,7 +4886,10 @@ private:
|
||||
mlir::Location loc = getCurrentLocation();
|
||||
fir::FirOpBuilder &builder = getFirOpBuilder();
|
||||
|
||||
bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
|
||||
bool isInDeviceContext = cuf::isCUDADeviceContext(
|
||||
builder.getRegion(),
|
||||
getFoldingContext().languageFeatures().IsEnabled(
|
||||
Fortran::common::LanguageFeature::DoConcurrentOffload));
|
||||
|
||||
bool isCUDATransfer =
|
||||
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
|
||||
|
||||
@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {
|
||||
// for it.
|
||||
// If the insertion point is inside an OpenACC region op, it is considered
|
||||
// device context.
|
||||
bool cuf::isCUDADeviceContext(mlir::Region &region) {
|
||||
bool cuf::isCUDADeviceContext(mlir::Region &region,
|
||||
bool isDoConcurrentOffloadEnabled) {
|
||||
if (region.getParentOfType<cuf::KernelOp>())
|
||||
return true;
|
||||
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
|
||||
@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region &region) {
|
||||
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
|
||||
}
|
||||
}
|
||||
if (isDoConcurrentOffloadEnabled &&
|
||||
region.getParentOfType<fir::DoConcurrentLoopOp>())
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -42,7 +42,6 @@ public:
|
||||
void Analyze(const parser::AssignmentStmt &);
|
||||
void Analyze(const parser::PointerAssignmentStmt &);
|
||||
void Analyze(const parser::ConcurrentControl &);
|
||||
int deviceConstructDepth_{0};
|
||||
SemanticsContext &context() { return context_; }
|
||||
|
||||
private:
|
||||
@@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
|
||||
if (whereDepth_ > 0) {
|
||||
CheckShape(lhsLoc, &lhs);
|
||||
}
|
||||
if (context_.foldingContext().languageFeatures().IsEnabled(
|
||||
common::LanguageFeature::CUDA)) {
|
||||
const auto &scope{context_.FindScope(lhsLoc)};
|
||||
const Scope &progUnit{GetProgramUnitContaining(scope)};
|
||||
if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) {
|
||||
if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) &&
|
||||
Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) {
|
||||
if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 &&
|
||||
GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 &&
|
||||
GetNbOfCUDADeviceSymbols(rhs) == 1)
|
||||
return; // This is a special case handled on the host.
|
||||
context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) {
|
||||
void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) {
|
||||
context_.value().PopWhereContext();
|
||||
}
|
||||
void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) {
|
||||
++context_.value().deviceConstructDepth_;
|
||||
}
|
||||
void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) {
|
||||
--context_.value().deviceConstructDepth_;
|
||||
}
|
||||
static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
|
||||
const auto &beginBlockDirective =
|
||||
std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
|
||||
const auto &blockDirective =
|
||||
std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
|
||||
if (blockDirective.v == llvm::acc::ACCD_parallel ||
|
||||
blockDirective.v == llvm::acc::ACCD_serial ||
|
||||
blockDirective.v == llvm::acc::ACCD_kernels) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) {
|
||||
if (IsOpenACCComputeConstruct(x)) {
|
||||
++context_.value().deviceConstructDepth_;
|
||||
}
|
||||
}
|
||||
void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) {
|
||||
if (IsOpenACCComputeConstruct(x)) {
|
||||
--context_.value().deviceConstructDepth_;
|
||||
}
|
||||
}
|
||||
void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) {
|
||||
++context_.value().deviceConstructDepth_;
|
||||
}
|
||||
void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) {
|
||||
--context_.value().deviceConstructDepth_;
|
||||
}
|
||||
void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) {
|
||||
++context_.value().deviceConstructDepth_;
|
||||
}
|
||||
void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) {
|
||||
--context_.value().deviceConstructDepth_;
|
||||
}
|
||||
|
||||
} // namespace Fortran::semantics
|
||||
template class Fortran::common::Indirection<
|
||||
|
||||
@@ -46,14 +46,6 @@ public:
|
||||
void Leave(const parser::EndWhereStmt &);
|
||||
void Enter(const parser::MaskedElsewhereStmt &);
|
||||
void Leave(const parser::MaskedElsewhereStmt &);
|
||||
void Enter(const parser::CUFKernelDoConstruct &);
|
||||
void Leave(const parser::CUFKernelDoConstruct &);
|
||||
void Enter(const parser::OpenACCBlockConstruct &);
|
||||
void Leave(const parser::OpenACCBlockConstruct &);
|
||||
void Enter(const parser::OpenACCCombinedConstruct &);
|
||||
void Leave(const parser::OpenACCCombinedConstruct &);
|
||||
void Enter(const parser::OpenACCLoopConstruct &);
|
||||
void Leave(const parser::OpenACCLoopConstruct &);
|
||||
|
||||
SemanticsContext &context();
|
||||
|
||||
|
||||
@@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
|
||||
std::get<std::list<parser::CUFReduction>>(directive.t)) {
|
||||
CheckReduce(context_, reduce);
|
||||
}
|
||||
inCUFKernelDoConstruct_ = true;
|
||||
++deviceConstructDepth_;
|
||||
}
|
||||
|
||||
static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
|
||||
const auto &beginBlockDirective =
|
||||
std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
|
||||
const auto &blockDirective =
|
||||
std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
|
||||
if (blockDirective.v == llvm::acc::ACCD_parallel ||
|
||||
blockDirective.v == llvm::acc::ACCD_serial ||
|
||||
blockDirective.v == llvm::acc::ACCD_kernels) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) {
|
||||
inCUFKernelDoConstruct_ = false;
|
||||
--deviceConstructDepth_;
|
||||
}
|
||||
void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) {
|
||||
if (IsOpenACCComputeConstruct(x)) {
|
||||
++deviceConstructDepth_;
|
||||
}
|
||||
}
|
||||
void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) {
|
||||
if (IsOpenACCComputeConstruct(x)) {
|
||||
--deviceConstructDepth_;
|
||||
}
|
||||
}
|
||||
void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) {
|
||||
++deviceConstructDepth_;
|
||||
}
|
||||
void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) {
|
||||
--deviceConstructDepth_;
|
||||
}
|
||||
void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) {
|
||||
++deviceConstructDepth_;
|
||||
}
|
||||
void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) {
|
||||
--deviceConstructDepth_;
|
||||
}
|
||||
void CUDAChecker::Enter(const parser::DoConstruct &x) {
|
||||
if (x.IsDoConcurrent() &&
|
||||
context_.foldingContext().languageFeatures().IsEnabled(
|
||||
common::LanguageFeature::DoConcurrentOffload)) {
|
||||
++deviceConstructDepth_;
|
||||
}
|
||||
}
|
||||
void CUDAChecker::Leave(const parser::DoConstruct &x) {
|
||||
if (x.IsDoConcurrent() &&
|
||||
context_.foldingContext().languageFeatures().IsEnabled(
|
||||
common::LanguageFeature::DoConcurrentOffload)) {
|
||||
--deviceConstructDepth_;
|
||||
}
|
||||
}
|
||||
|
||||
void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
|
||||
auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()};
|
||||
const auto &scope{context_.FindScope(lhsLoc)};
|
||||
const Scope &progUnit{GetProgramUnitContaining(scope)};
|
||||
if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) {
|
||||
if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) {
|
||||
return; // Data transfer with assignment is only perform on host.
|
||||
}
|
||||
|
||||
@@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
|
||||
context_.Say(lhsLoc,
|
||||
"More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US);
|
||||
}
|
||||
|
||||
if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) &&
|
||||
Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) {
|
||||
if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 &&
|
||||
GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 &&
|
||||
GetNbOfCUDADeviceSymbols(assign->rhs) == 1) {
|
||||
return; // This is a special case handled on the host.
|
||||
}
|
||||
context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Fortran::semantics
|
||||
|
||||
@@ -41,10 +41,18 @@ public:
|
||||
void Enter(const parser::CUFKernelDoConstruct &);
|
||||
void Leave(const parser::CUFKernelDoConstruct &);
|
||||
void Enter(const parser::AssignmentStmt &);
|
||||
void Enter(const parser::OpenACCBlockConstruct &);
|
||||
void Leave(const parser::OpenACCBlockConstruct &);
|
||||
void Enter(const parser::OpenACCCombinedConstruct &);
|
||||
void Leave(const parser::OpenACCCombinedConstruct &);
|
||||
void Enter(const parser::OpenACCLoopConstruct &);
|
||||
void Leave(const parser::OpenACCLoopConstruct &);
|
||||
void Enter(const parser::DoConstruct &);
|
||||
void Leave(const parser::DoConstruct &);
|
||||
|
||||
private:
|
||||
SemanticsContext &context_;
|
||||
bool inCUFKernelDoConstruct_ = false;
|
||||
int deviceConstructDepth_{0};
|
||||
};
|
||||
|
||||
bool CanonicalizeCUDA(parser::Program &);
|
||||
|
||||
@@ -403,3 +403,19 @@ end subroutine
|
||||
! CHECK-LABEL: func.func @_QPsub20()
|
||||
! CHECK-NOT: cuf.data_transfer
|
||||
! CHECK: hlfir.assign
|
||||
|
||||
subroutine sub21()
|
||||
real, allocatable,device:: a(:,:), b(:,:)
|
||||
real:: s
|
||||
integer:: i,j,N=16
|
||||
allocate(a(N,N),b(N,N))
|
||||
do concurrent(i=1:N, j=1:N) reduce(+:s)
|
||||
b(i,j)=a(i,j)**2
|
||||
s=s+b(i,j)
|
||||
end do
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPsub21()
|
||||
! CHECK: fir.do_concurrent.loop
|
||||
! CHECK-NOT: cuf.data_transfer
|
||||
! CHECK: hlfir.assign
|
||||
|
||||
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
|
||||
llvm::cl::desc("enable CUDA Fortran"),
|
||||
llvm::cl::init(false));
|
||||
|
||||
static llvm::cl::opt<bool>
|
||||
enableDoConcurrentOffload("fdoconcurrent-offload",
|
||||
llvm::cl::desc("enable do concurrent offload"),
|
||||
llvm::cl::init(false));
|
||||
|
||||
static llvm::cl::opt<bool>
|
||||
disableCUDAWarpFunction("fcuda-disable-warp-function",
|
||||
llvm::cl::desc("Disable CUDA Warp Function"),
|
||||
@@ -608,6 +613,11 @@ int main(int argc, char **argv) {
|
||||
options.features.Enable(Fortran::common::LanguageFeature::CUDA);
|
||||
}
|
||||
|
||||
if (enableDoConcurrentOffload) {
|
||||
options.features.Enable(
|
||||
Fortran::common::LanguageFeature::DoConcurrentOffload);
|
||||
}
|
||||
|
||||
if (disableCUDAWarpFunction) {
|
||||
options.features.Enable(
|
||||
Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);
|
||||
|
||||
Reference in New Issue
Block a user