[MLIR][OpenMP] Add MLIR Lowering Support for dist_schedule (#152736)
`dist_schedule` was previously supported in Flang/Clang but was not implemented in MLIR; instead, users got a "not yet implemented" error. This patch adds support for lowering the `dist_schedule` clause to LLVM IR when it is used on an `omp.distribute` or `omp.wsloop` operation. Some rework was required so that MLIR/LLVM emits the correct schedule type for the clause, since it uses a different schedule type from other OpenMP directives/clauses in the runtime library. This patch also ensures that the correct `llvm.loop.parallel_accesses` metadata is added when dist_schedule or a chunked schedule clause is used.
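As a rough, hypothetical sketch (editor's illustration, not code taken from this patch): a caller such as the MLIR-to-LLVM translation can forward the dist_schedule information through the two new trailing parameters of `OpenMPIRBuilder::applyWorkshareLoop`. The surrounding objects (`ompBuilder`, `dl`, `loopInfo`, `allocaIP`, `distScheduleChunk`) are assumed to be set up by the enclosing lowering code.

```c++
// Hedged sketch of a call site using the extended applyWorkshareLoop signature
// added by this patch. The variables below are assumed to exist in the caller;
// they are not defined in this snippet.
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
    ompBuilder.applyWorkshareLoop(
        dl, loopInfo, allocaIP, /*NeedsBarrier=*/true,
        llvm::omp::OMP_SCHEDULE_Static, /*ChunkSize=*/nullptr,
        /*HasSimdModifier=*/false, /*HasMonotonicModifier=*/false,
        /*HasNonmonotonicModifier=*/false, /*HasOrderedClause=*/false,
        llvm::omp::WorksharingLoopType::DistributeForStaticLoop,
        /*NoLoop=*/false,
        /*HasDistSchedule=*/true,                     // new parameter
        /*DistScheduleChunkSize=*/distScheduleChunk); // new parameter
```

When `HasDistSchedule` is set and the workshare loop is nested inside a distribute construct, the builder emits a second `__kmpc_for_static_init*` call with the distribute schedule type, which is what the new tests below check.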
@@ -42,10 +42,10 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
| target update construct | P | device clause not supported |
| declare target directive | P | |
| teams construct | Y | |
| distribute construct | P | dist_schedule clause not supported |
| distribute simd construct | P | dist_schedule and linear clauses are not supported |
| distribute parallel loop construct | P | dist_schedule clause not supported |
| distribute parallel loop simd construct | P | dist_schedule and linear clauses are not supported |
| distribute construct | P | |
| distribute simd construct | P | linear clauses are not supported |
| distribute parallel loop construct | P | |
| distribute parallel loop simd construct | P | linear clauses are not supported |
| depend clause | Y | |
| declare reduction construct | N | |
| atomic construct extensions | Y | |
@@ -53,13 +53,13 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
| cancellation point construct | Y | |
| parallel do simd construct | P | linear clause not supported |
| target teams construct | P | device clause not supported |
| teams distribute construct | P | dist_schedule clause not supported |
| teams distribute simd construct | P | dist_schedule and linear clauses are not supported |
| target teams distribute construct | P | device and dist_schedule clauses are not supported |
| teams distribute parallel loop construct | P | dist_schedule clause not supported |
| target teams distribute parallel loop construct | P | device and dist_schedule clauses are not supported |
| teams distribute parallel loop simd construct | P | dist_schedule and linear clauses are not supported |
| target teams distribute parallel loop simd construct | P | device, dist_schedule and linear clauses are not supported |
| teams distribute construct | P | |
| teams distribute simd construct | P | linear clause is not supported |
| target teams distribute construct | P | device clause is not supported |
| teams distribute parallel loop construct | P | |
| target teams distribute parallel loop construct | P | device clause is not supported |
| teams distribute parallel loop simd construct | P | linear clause is not supported |
| target teams distribute parallel loop simd construct | P | device and linear clauses are not supported |

## Extensions

### ATOMIC construct

@@ -490,7 +490,8 @@ def OMP_SCHEDULE_Dynamic : EnumVal<"dynamic", 3, 1> {}
def OMP_SCHEDULE_Guided : EnumVal<"guided", 4, 1> {}
def OMP_SCHEDULE_Auto : EnumVal<"auto", 5, 1> {}
def OMP_SCHEDULE_Runtime : EnumVal<"runtime", 6, 1> {}
def OMP_SCHEDULE_Default : EnumVal<"default", 7, 0> { let isDefault = 1; }
def OMP_SCHEDULE_Distribute : EnumVal<"distribute", 7, 1> {}
def OMP_SCHEDULE_Default : EnumVal<"default", 8, 0> { let isDefault = 1; }
def OMPC_Schedule : Clause<[Spelling<"schedule">]> {
  let clangClass = "OMPScheduleClause";
  let flangClass = "OmpScheduleClause";
@@ -501,6 +502,7 @@ def OMPC_Schedule : Clause<[Spelling<"schedule">]> {
    OMP_SCHEDULE_Guided,
    OMP_SCHEDULE_Auto,
    OMP_SCHEDULE_Runtime,
    OMP_SCHEDULE_Distribute,
    OMP_SCHEDULE_Default
  ];
}

@@ -1133,11 +1133,17 @@ private:
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
/// the loop.
/// \param LoopType Type of workshare loop.
/// \param HasDistSchedule Indicates whether the clause being lowered is
/// dist_schedule, as this is handled slightly differently.
/// \param DistScheduleSchedType Defines the Schedule Type for the Distribute
/// loop. Defaults to None if no Distribute loop is present.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointOrErrorTy applyStaticWorkshareLoop(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
omp::WorksharingLoopType LoopType, bool NeedsBarrier);
omp::WorksharingLoopType LoopType, bool NeedsBarrier,
bool HasDistSchedule = false,
omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);

/// Modifies the canonical loop to be a statically-scheduled workshare loop with a
/// user-specified chunk size.
@@ -1150,13 +1156,22 @@ private:
/// \param NeedsBarrier Indicates whether a barrier must be inserted after the
/// loop.
/// \param ChunkSize The user-specified chunk size.
/// \param SchedType Optional type of scheduling to be passed to the init
/// function.
/// \param DistScheduleChunkSize The size of the dist_schedule chunk considered
/// as a unit when scheduling. If \p nullptr, defaults to 1.
/// \param DistScheduleSchedType Defines the Schedule Type for the Distribute
/// loop. Defaults to None if no Distribute loop is present.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
bool NeedsBarrier,
Value *ChunkSize);
InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
bool NeedsBarrier, Value *ChunkSize,
omp::OMPScheduleType SchedType =
omp::OMPScheduleType::UnorderedStaticChunked,
Value *DistScheduleChunkSize = nullptr,
omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);

/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
///
@@ -1235,6 +1250,10 @@ public:
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
/// \param NoLoop If true, no-loop code is generated.
/// \param HasDistSchedule Indicates whether the clause being lowered is
/// dist_schedule, as this is handled slightly differently.
///
/// \param DistScheduleChunkSize The chunk size for the dist_schedule loop.
///
/// \returns Point where to insert code after the workshare construct.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
@@ -1246,7 +1265,8 @@ public:
bool HasOrderedClause = false,
omp::WorksharingLoopType LoopType =
omp::WorksharingLoopType::ForStaticLoop,
bool NoLoop = false);
bool NoLoop = false, bool HasDistSchedule = false,
Value *DistScheduleChunkSize = nullptr);

/// Tile a loop nest.
///

@@ -14,6 +14,7 @@

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -136,6 +137,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
case OMPScheduleType::NomergeOrderedRuntime:
case OMPScheduleType::NomergeOrderedAuto:
case OMPScheduleType::NomergeOrderedTrapezoidal:
case OMPScheduleType::OrderedDistributeChunked:
case OMPScheduleType::OrderedDistribute:
break;
default:
return false;
@@ -182,7 +185,7 @@ static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
bool HasSimdModifier) {
bool HasSimdModifier, bool HasDistScheduleChunks) {
// Currently, the default schedule is static.
switch (ClauseKind) {
case OMP_SCHEDULE_Default:
@@ -199,6 +202,9 @@ getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
case OMP_SCHEDULE_Runtime:
return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
: OMPScheduleType::BaseRuntime;
case OMP_SCHEDULE_Distribute:
return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
: OMPScheduleType::BaseDistribute;
}
llvm_unreachable("unhandled schedule clause argument");
}
@@ -267,9 +273,10 @@ getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause) {
OMPScheduleType BaseSchedule =
getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
bool HasNonmonotonicModifier, bool HasOrderedClause,
bool HasDistScheduleChunks) {
OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
OMPScheduleType OrderedSchedule =
getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
@@ -4803,7 +4810,8 @@ static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
WorksharingLoopType LoopType, bool NeedsBarrier) {
WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
OMPScheduleType DistScheduleSchedType) {
assert(CLI->isValid() && "Requires a valid canonical loop");
assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
"Require dedicated allocate IP");
@@ -4859,15 +4867,29 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(

// Call the "init" function and update the trip count of the loop with the
// value it produced.
SmallVector<Value *, 10> Args(
{SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
Value *PDistUpperBound =
Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
Args.push_back(PDistUpperBound);
auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
PUpperBound, IVTy, PStride, One, Zero, StaticInit,
this](Value *SchedulingType, auto &Builder) {
SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
PLowerBound, PUpperBound});
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
Value *PDistUpperBound =
Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
Args.push_back(PDistUpperBound);
}
Args.append({PStride, One, Zero});
createRuntimeFunctionCall(StaticInit, Args);
};
BuildInitCall(SchedulingType, Builder);
if (HasDistSchedule &&
LoopType != WorksharingLoopType::DistributeStaticLoop) {
Constant *DistScheduleSchedType = ConstantInt::get(
I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
// We want to emit a second init function call for the dist_schedule clause
// to the Distribute construct. This should only be done, however, if a
// Workshare Loop is nested within a Distribute construct.
BuildInitCall(DistScheduleSchedType, Builder);
}
Args.append({PStride, One, Zero});
createRuntimeFunctionCall(StaticInit, Args);
Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
@@ -4906,14 +4928,44 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
return AfterIP;
}

static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
LoopInfo &LI);
static void addLoopMetadata(CanonicalLoopInfo *Loop,
ArrayRef<Metadata *> Properties);

static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
LLVMContext &Ctx, Loop *Loop,
LoopInfo &LoopInfo,
SmallVector<Metadata *> &LoopMDList) {
SmallSet<BasicBlock *, 8> Reachable;

// Get the basic blocks from the loop in which memref instructions
// can be found.
// TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
// preferably without running any passes.
for (BasicBlock *Block : Loop->getBlocks()) {
if (Block == CLI->getCond() || Block == CLI->getHeader())
continue;
Reachable.insert(Block);
}

// Add access group metadata to memory-access instructions.
MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
for (BasicBlock *BB : Reachable)
addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
// TODO: If the loop has existing parallel access metadata, have
// to combine two lists.
LoopMDList.push_back(MDNode::get(
Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
bool NeedsBarrier,
Value *ChunkSize) {
OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
assert(CLI->isValid() && "Requires a valid canonical loop");
assert(ChunkSize && "Chunk size is required");
assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");

LLVMContext &Ctx = CLI->getFunction()->getContext();
Value *IV = CLI->getIndVar();
@@ -4927,6 +4979,18 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
Constant *Zero = ConstantInt::get(InternalIVTy, 0);
Constant *One = ConstantInt::get(InternalIVTy, 1);

Function *F = CLI->getFunction();
FunctionAnalysisManager FAM;
FAM.registerPass([]() { return DominatorTreeAnalysis(); });
FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
LoopAnalysis LIA;
LoopInfo &&LI = LIA.run(*F, FAM);
Loop *L = LI.getLoopFor(CLI->getHeader());
SmallVector<Metadata *> LoopMDList;
if (ChunkSize || DistScheduleChunkSize)
applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
addLoopMetadata(CLI, LoopMDList);

// Declare useful OpenMP runtime functions.
FunctionCallee StaticInit =
getKmpcForStaticInitForType(InternalIVTy, M, *this);
@@ -4949,13 +5013,18 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
Builder.SetCurrentDebugLocation(DL);

// TODO: Detect overflow in ubsan or max-out with current tripcount.
Value *CastedChunkSize =
Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
"distschedulechunksize");
Value *CastedTripCount =
Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

Constant *SchedulingType = ConstantInt::get(
I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
Constant *SchedulingType =
ConstantInt::get(I32Type, static_cast<int>(SchedType));
Constant *DistSchedulingType =
ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
Builder.CreateStore(Zero, PLowerBound);
Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
Builder.CreateStore(OrigUpperBound, PUpperBound);
@@ -4967,12 +5036,26 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
createRuntimeFunctionCall(
StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
/*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
/*plower=*/PLowerBound, /*pupper=*/PUpperBound,
/*pstride=*/PStride, /*incr=*/One,
/*chunk=*/CastedChunkSize});
auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
PUpperBound, PStride, One,
this](Value *SchedulingType, Value *ChunkSize,
auto &Builder) {
createRuntimeFunctionCall(
StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
/*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
/*plower=*/PLowerBound, /*pupper=*/PUpperBound,
/*pstride=*/PStride, /*incr=*/One,
/*chunk=*/ChunkSize});
};
BuildInitCall(SchedulingType, CastedChunkSize, Builder);
if (DistScheduleSchedType != OMPScheduleType::None &&
SchedType != OMPScheduleType::OrderedDistributeChunked &&
SchedType != OMPScheduleType::OrderedDistribute) {
// We want to emit a second init function call for the dist_schedule clause
// to the Distribute construct. This should only be done, however, if a
// Workshare Loop is nested within a Distribute construct.
BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
}

// Load values written by the "init" function.
Value *FirstChunkStart =
@@ -5299,31 +5382,47 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause,
WorksharingLoopType LoopType, bool NoLoop) {
WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
Value *DistScheduleChunkSize) {
if (Config.isTargetDevice())
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
HasNonmonotonicModifier, HasOrderedClause);
HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);

bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
OMPScheduleType::ModifierOrdered;
OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
if (HasDistSchedule) {
DistScheduleSchedType = DistScheduleChunkSize
? OMPScheduleType::OrderedDistributeChunked
: OMPScheduleType::OrderedDistribute;
}
switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
case OMPScheduleType::BaseStatic:
assert(!ChunkSize && "No chunk size with static-chunked schedule");
if (IsOrdered)
case OMPScheduleType::BaseDistribute:
assert((!ChunkSize || !DistScheduleChunkSize) &&
"No chunk size with static-chunked schedule");
if (IsOrdered && !HasDistSchedule)
return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
NeedsBarrier, ChunkSize);
// FIXME: Monotonicity ignored?
return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
if (DistScheduleChunkSize)
return applyStaticChunkedWorkshareLoop(
DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
DistScheduleChunkSize, DistScheduleSchedType);
return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
HasDistSchedule);

case OMPScheduleType::BaseStaticChunked:
if (IsOrdered)
case OMPScheduleType::BaseDistributeChunked:
if (IsOrdered && !HasDistSchedule)
return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
NeedsBarrier, ChunkSize);
// FIXME: Monotonicity ignored?
return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
ChunkSize);
return applyStaticChunkedWorkshareLoop(
DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
DistScheduleChunkSize, DistScheduleSchedType);

case OMPScheduleType::BaseRuntime:
case OMPScheduleType::BaseAuto:
@@ -5917,8 +6016,8 @@ static void addLoopMetadata(CanonicalLoopInfo *Loop,
}

/// Attach llvm.access.group metadata to the memref instructions of \p Block
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
LoopInfo &LI) {
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
LoopInfo &LI) {
for (Instruction &I : *Block) {
if (I.mayReadOrWriteMemory()) {
// TODO: This instruction may already have access group from
@@ -6108,16 +6207,8 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
// dependences of 'safelen' iterations are possible.
// If clause order(concurrent) is specified then the memory instructions
// are marked parallel even if 'safelen' is finite.
if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
// Add access group metadata to memory-access instructions.
MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
for (BasicBlock *BB : Reachable)
addSimdMetadata(BB, AccessGroup, LI);
// TODO: If the loop has existing parallel access metadata, have
// to combine two lists.
LoopMDList.push_back(MDNode::get(
Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
}
if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);

// FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
// versions so we can't add the loop attributes in that case.

@@ -617,6 +617,7 @@ parseScheduleClause(OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr,
break;
case ClauseScheduleKind::Auto:
case ClauseScheduleKind::Runtime:
case ClauseScheduleKind::Distribute:
chunkSize = std::nullopt;
}

@@ -61,6 +61,8 @@ convertToScheduleKind(std::optional<omp::ClauseScheduleKind> schedKind) {
return llvm::omp::OMP_SCHEDULE_Auto;
case omp::ClauseScheduleKind::Runtime:
return llvm::omp::OMP_SCHEDULE_Runtime;
case omp::ClauseScheduleKind::Distribute:
return llvm::omp::OMP_SCHEDULE_Distribute;
}
llvm_unreachable("unhandled schedule clause argument");
}
@@ -319,10 +321,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getDevice())
result = todo("device");
};
auto checkDistSchedule = [&todo](auto op, LogicalResult &result) {
if (op.getDistScheduleChunkSize())
result = todo("dist_schedule with chunk_size");
};
auto checkHint = [](auto op, LogicalResult &) {
if (op.getHint())
op.emitWarning("hint clause discarded");
@@ -387,7 +385,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
})
.Case([&](omp::DistributeOp op) {
checkAllocate(op, result);
checkDistSchedule(op, result);
checkOrder(op, result);
})
.Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
@@ -2548,6 +2545,19 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
}

omp::DistributeOp distributeOp = nullptr;
llvm::Value *distScheduleChunk = nullptr;
bool hasDistSchedule = false;
if (llvm::isa_and_present<omp::DistributeOp>(opInst.getParentOp())) {
distributeOp = cast<omp::DistributeOp>(opInst.getParentOp());
hasDistSchedule = distributeOp.getDistScheduleStatic();
if (distributeOp.getDistScheduleChunkSize()) {
llvm::Value *chunkVar = moduleTranslation.lookupValue(
distributeOp.getDistScheduleChunkSize());
distScheduleChunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
}
}

PrivateVarsInfo privateVarsInfo(wsloopOp);

SmallVector<omp::DeclareReductionOp> reductionDecls;
@@ -2675,7 +2685,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType, noLoopMode);
workshareLoopType, noLoopMode, hasDistSchedule, distScheduleChunk);

if (failed(handleError(wsloopIP, opInst)))
return failure();
@@ -5266,15 +5276,18 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
if (!isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper())) {
// TODO: Add support for clauses which are valid for DISTRIBUTE
// constructs. Static schedule is the default.
auto schedule = omp::ClauseScheduleKind::Static;
bool isOrdered = false;
bool hasDistSchedule = distributeOp.getDistScheduleStatic();
auto schedule = hasDistSchedule ? omp::ClauseScheduleKind::Distribute
: omp::ClauseScheduleKind::Static;
// dist_schedule clauses are ordered - otherwise this should be false
bool isOrdered = hasDistSchedule;
std::optional<omp::ScheduleModifier> scheduleMod;
bool isSimd = false;
llvm::omp::WorksharingLoopType workshareLoopType =
llvm::omp::WorksharingLoopType::DistributeStaticLoop;
bool loopNeedsBarrier = false;
llvm::Value *chunk = nullptr;

llvm::Value *chunk = moduleTranslation.lookupValue(
distributeOp.getDistScheduleChunkSize());
llvm::CanonicalLoopInfo *loopInfo =
findCurrentLoopInfo(moduleTranslation);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
@@ -5283,12 +5296,11 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
workshareLoopType);
workshareLoopType, false, hasDistSchedule, chunk);

if (!wsloopIP)
return wsloopIP.takeError();
}

if (failed(cleanupPrivateVars(builder, moduleTranslation,
distributeOp.getLoc(), privVarsInfo.llvmVars,
privVarsInfo.privatizers)))

@@ -615,3 +615,22 @@ omp.declare_mapper @my_mapper : !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)>
// CHECK: omp.declare_mapper.info map_entries(%{{.*}}, %{{.*}} : !llvm.ptr, !llvm.ptr)
omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr)
}

// CHECK-LABEL: llvm.func @omp_dist_schedule(%arg0: i32) {
func.func @omp_dist_schedule(%arg0: i32) {
  %c1_i32 = arith.constant 1 : i32
  // CHECK: %1 = llvm.mlir.constant(1024 : i32) : i32
  %c1024_i32 = arith.constant 1024 : i32
  %c16_i32 = arith.constant 16 : i32
  %c8_i32 = arith.constant 8 : i32
  omp.teams num_teams( to %c8_i32 : i32) thread_limit(%c16_i32 : i32) {
    // CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size(%1 : i32) {
    omp.distribute dist_schedule_static dist_schedule_chunk_size(%c1024_i32 : i32) {
      omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%arg0) inclusive step (%c1_i32) {
        omp.terminator
      }
    }
    omp.terminator
  }
  return
}

mlir/test/Target/LLVMIR/openmp-dist_schedule.mlir (new file, 34 lines)
@@ -0,0 +1,34 @@
// Test that dist_schedule is translated with the correct schedule type and chunk size where appropriate.

// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

llvm.func @distribute_dist_schedule_chunk_size(%lb : i32, %ub : i32, %step : i32, %x : i32) {
  // CHECK: call void @[[RUNTIME_FUNC:__kmpc_for_static_init_4u]](ptr @1, i32 %omp_global_thread_num, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 1024)
  // We want to make sure that the next call is not another init builder.
  // CHECK-NOT: call void @[[RUNTIME_FUNC]]
  %1 = llvm.mlir.constant(1024: i32) : i32
  omp.distribute dist_schedule_static dist_schedule_chunk_size(%1 : i32) {
    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
      omp.yield
    }
  }
  llvm.return
}

// When a chunk size is present, we need to make sure the correct parallel accesses metadata is added
// CHECK: !2 = !{!"llvm.loop.parallel_accesses", !3}
// CHECK-NEXT: !3 = distinct !{}

// -----

llvm.func @distribute_dist_schedule(%lb : i32, %ub : i32, %step : i32, %x : i32) {
  // CHECK: call void @[[RUNTIME_FUNC:__kmpc_for_static_init_4u]](ptr @1, i32 %omp_global_thread_num, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
  // We want to make sure that the next call is not another init builder.
  // CHECK-NOT: call void @[[RUNTIME_FUNC]]
  omp.distribute dist_schedule_static {
    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
      omp.yield
    }
  }
  llvm.return
}

mlir/test/Target/LLVMIR/openmp-dist_schedule_with_wsloop.mlir (new file, 205 lines)
@@ -0,0 +1,205 @@
// Test that dist_schedule is translated with the correct schedule type and chunk size where appropriate when used with workshare loops.

// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

llvm.func @distribute_wsloop_dist_schedule_chunked_schedule_chunked(%n: i32, %teams: i32, %threads: i32, %dcs: i32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %1 = llvm.mlir.constant(1 : i32) : i32
  %scs = llvm.mlir.constant(64 : i32) : i32

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i32) {
        omp.wsloop schedule(static = %scs : i32) {
          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked_schedule_chunked..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 64)
// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 %3)

llvm.func @distribute_wsloop_dist_schedule_chunked_schedule_chunked_i64(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i64) : i64
  %1 = llvm.mlir.constant(1 : i64) : i64
  %dcs = llvm.mlir.constant(1024 : i64) : i64
  %scs = llvm.mlir.constant(64 : i64) : i64
  %n64 = llvm.zext %n : i32 to i64

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i64) {
        omp.wsloop schedule(static = %scs : i64) {
          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked_schedule_chunked_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 64)
// call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 1024)

// -----

llvm.func @distribute_wsloop_dist_schedule_chunked(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %1 = llvm.mlir.constant(1 : i32) : i32
  %dcs = llvm.mlir.constant(1024 : i32) : i32

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i32) {
        omp.wsloop schedule(static) {
          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 1024)

llvm.func @distribute_wsloop_dist_schedule_chunked_i64(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i64) : i64
  %1 = llvm.mlir.constant(1 : i64) : i64
  %dcs = llvm.mlir.constant(1024 : i64) : i64
  %n64 = llvm.zext %n : i32 to i64

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static dist_schedule_chunk_size(%dcs : i64) {
        omp.wsloop schedule(static) {
          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_dist_schedule_chunked_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 0)
// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 91, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 1024)

// -----

llvm.func @distribute_wsloop_schedule_chunked(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %1 = llvm.mlir.constant(1 : i32) : i32
  %scs = llvm.mlir.constant(64 : i32) : i32

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static {
        omp.wsloop schedule(static = %scs : i32) {
          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_schedule_chunked..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 64)
// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)

llvm.func @distribute_wsloop_schedule_chunked_i64(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i64) : i64
  %1 = llvm.mlir.constant(1 : i64) : i64
  %scs = llvm.mlir.constant(64 : i64) : i64
  %n64 = llvm.zext %n : i32 to i64

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static {
        omp.wsloop schedule(static = %scs : i64) {
          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}

// CHECK: define internal void @distribute_wsloop_schedule_chunked_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 33, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 64)
// CHECK: call void @__kmpc_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i64 1, i64 0)

// -----

llvm.func @distribute_wsloop_no_chunks(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %1 = llvm.mlir.constant(1 : i32) : i32

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static {
        omp.wsloop schedule(static) {
          omp.loop_nest (%i) : i32 = (%0) to (%n) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_no_chunks..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_dist_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound, ptr %p.stride, i32 1, i32 0)
// CHECK: call void @__kmpc_dist_for_static_init_4u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound10, ptr %p.stride, i32 1, i32 0)

llvm.func @distribute_wsloop_no_chunks_i64(%n: i32, %teams: i32, %threads: i32) {
  %0 = llvm.mlir.constant(0 : i64) : i64
  %1 = llvm.mlir.constant(1 : i64) : i64
  %n64 = llvm.zext %n : i32 to i64

  omp.teams num_teams(to %teams : i32) thread_limit(%threads : i32) {
    omp.parallel {
      omp.distribute dist_schedule_static {
        omp.wsloop schedule(static) {
          omp.loop_nest (%i) : i64 = (%0) to (%n64) step (%1) {
            omp.yield
          }
        } {omp.composite}
      } {omp.composite}
      omp.terminator
    } {omp.composite}
    omp.terminator
  }
  llvm.return
}
// CHECK: define internal void @distribute_wsloop_no_chunks_i64..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0) #0 {
// CHECK: call void @__kmpc_dist_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound, ptr %p.stride, i64 1, i64 0)
// CHECK: call void @__kmpc_dist_for_static_init_8u(ptr @1, i32 %omp_global_thread_num9, i32 92, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.distupperbound10, ptr %p.stride, i64 1, i64 0)

@@ -39,19 +39,6 @@ llvm.func @distribute_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr

// -----

llvm.func @distribute_dist_schedule(%lb : i32, %ub : i32, %step : i32, %x : i32) {
  // expected-error@below {{not yet implemented: Unhandled clause dist_schedule with chunk_size in omp.distribute operation}}
  // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
  omp.distribute dist_schedule_static dist_schedule_chunk_size(%x : i32) {
    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
      omp.yield
    }
  }
  llvm.return
}

// -----

llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {
  // expected-error@below {{not yet implemented: Unhandled clause order in omp.distribute operation}}
  // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}