[mlir][sparse] support parallel for/reduction in sparsification.
This patch fixes and re-lands D135927 (which was reverted because it caused a Windows build failure) to re-enable parallel for/reduction. It also fixes a warning caused by D137442.

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D137565
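For context, a minimal sketch of the two loop forms involved (not taken verbatim from the patch; the function name, buffer shapes, and the f32 add reduction are illustrative): a sequential reduction is emitted as scf.for carrying the running value through iter_args, while the parallel form re-enabled here uses scf.parallel with an init value and an scf.reduce region that combines partial results, as in the new test below.

func.func @sum(%vals: memref<?xf32>, %n: index, %init: f32) -> f32 {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Sequential form: the running value is threaded through iter_args.
  %seq = scf.for %i = %c0 to %n step %c1 iter_args(%acc = %init) -> (f32) {
    %v = memref.load %vals[%i] : memref<?xf32>
    %new = arith.addf %acc, %v : f32
    scf.yield %new : f32
  }
  // Parallel form: partial results are combined in an scf.reduce region.
  %par = scf.parallel (%i) = (%c0) to (%n) step (%c1) init (%init) -> f32 {
    %v = memref.load %vals[%i] : memref<?xf32>
    scf.reduce(%v) : f32 {
    ^bb0(%lhs: f32, %rhs: f32):
      %r = arith.addf %lhs, %rhs : f32
      scf.reduce.return %r : f32
    }
    scf.yield
  }
  // Combine the two (identical) sums just so both results are used.
  %total = arith.addf %seq, %par : f32
  return %total : f32
}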
@@ -219,9 +219,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
    OpBuilder &builder, Location loc, size_t tid, size_t dim,
    MutableArrayRef<Value> reduc, bool isParallel, ArrayRef<size_t> extraTids,
    ArrayRef<size_t> extraDims) {

  assert(dimTypes[tid].size() > dim);
  // We can not re-enter the same level.
  assert(!coord[tid][dim]);
  // TODO: support multiple return on parallel for?
  assert(!isParallel || reduc.size() <= 1);

  Value step = constantIndex(builder, loc, 1);
  auto dimType = dimTypes[tid][dim];
@@ -232,11 +235,38 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
  Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                           : loopSeqStack.back(); // universal tid
  Value hi = highs[tid][dim];
  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
    assert(parOp.getNumReductions() == reduc.size());
    iv = parOp.getInductionVars()[0];

    // In-place update on the reduction variable vector.
    // Note that the init vals are not the actual reduction variables but are
    // instead used as a `special handle` to (temporarily) represent them. The
    // expression on the init vals will be moved into scf.reduce and replaced
    // with the block arguments when exiting the loop (see exitForLoop). This is
    // needed because we can not build the actual reduction block and get the
    // actual reduction variable before users fill the parallel loop body.
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = parOp.getInitVals()[i];
    loop = parOp;
  } else {
    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(forOp.getBody());
    iv = forOp.getInductionVar();

    // In-place update on the reduction variable vector.
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
  builder.setInsertionPointToStart(forOp.getBody());
  Value iv = forOp.getInductionVar();
  assert(iv);
  if (isSparseInput) {
    pidxs[tid][dim] = iv;
    // Generating a load on the indices array yields the coordinate.
@@ -253,16 +283,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(

  // NOTE: we can also prepare for the next dim here in advance
  // Push the loop onto the stack.
  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                         coord[tid][dim]);
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);

  // In-place update on the reduction variable vector.
  assert(forOp.getNumRegionIterArgs() == reduc.size());
  for (int i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = forOp.getRegionIterArg(i);
  return forOp;
  return loop;
}

Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
@@ -434,17 +460,73 @@ void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims(
  }
}

SmallVector<Value, 2>
SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
                                     ArrayRef<Value> reduc) {
void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                                          MutableArrayRef<Value> reduc) {
  LoopLevelInfo &loopInfo = loopStack.back();
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
  auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
  if (!reduc.empty()) {
    assert(reduc.size() == forOp.getNumResults());
    builder.setInsertionPointToEnd(forOp.getBody());
    builder.create<scf::YieldOp>(loc, reduc);
  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
  if (forOp) {
    if (!reduc.empty()) {
      assert(reduc.size() == forOp.getNumResults());
      rewriter.setInsertionPointToEnd(forOp.getBody());
      rewriter.create<scf::YieldOp>(loc, reduc);
    }
    // Exit the loop.
    rewriter.setInsertionPointAfter(forOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
      reduc[i] = forOp.getResult(i);
  } else {
    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
    if (!reduc.empty()) {
      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
      Operation *redExp = reduc.front().getDefiningOp();
      // The reduction expression should have no uses.
      assert(redExp->getUses().empty());
      // This must be a binary operation.
      // NOTE: It is the users' responsibility to ensure the operation is
      // commutative.
      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);

      Value redVal = parOp.getInitVals().front();
      Value curVal;
      if (redExp->getOperand(0) == redVal)
        curVal = redExp->getOperand(1);
      else if (redExp->getOperand(1) == redVal)
        curVal = redExp->getOperand(0);
      // One of the operands must be the init value (which is also the
      // previous reduction value).
      assert(curVal);
      // The reduction expression should be the only user of the reduction val
      // inside the parallel for.
      unsigned numUsers = 0;
      for (Operation *op : redVal.getUsers()) {
        if (op->getParentOp() == parOp)
          numUsers++;
      }
      assert(numUsers == 1);
      (void)numUsers; // to silence unused variable warning in release build

      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getRegion().getBlocks().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replaces arguments of the reduction expression with the block
      // arguments from scf.reduce.
      rewriter.updateRootInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erases the out-dated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor, clean up
@@ -458,14 +540,10 @@ SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
    if (!isDenseDLT(dimTypes[tid][dim]))
      highs[tid][dim] = Value();
  }
  // exit the loop
  builder.setInsertionPointAfter(forOp);
  return forOp.getResults();
}

SmallVector<Value, 2>
SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
                                             ArrayRef<Value> reduc) {
void SparseTensorLoopEmitter::exitCoIterationLoop(
    OpBuilder &builder, Location loc, MutableArrayRef<Value> reduc) {
  auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
@@ -499,10 +577,10 @@ SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
  }

  // Reduction value from users.
  SmallVector<Value, 2> ret;
  for (auto red : reduc) {
    operands.push_back(red);
    ret.push_back(whileOp->getResult(o++));
  for (unsigned i = 0, e = reduc.size(); i < e; i++) {
    operands.push_back(reduc[i]);
    // In-place update on the reduction variable.
    reduc[i] = whileOp->getResult(o++);
  }

  // An (optional) universal index.
@@ -517,26 +595,24 @@ SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
  return ret;
}

SmallVector<Value, 2>
SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
                                         ArrayRef<Value> reduc) {
void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter,
                                              Location loc,
                                              MutableArrayRef<Value> reduc) {
  // Clean up the values; this helps us discover potential bugs at an earlier
  // stage (instead of silently using a wrong value).
  LoopLevelInfo &loopInfo = loopStack.back();
  assert(loopInfo.tids.size() == loopInfo.dims.size());
  SmallVector<Value, 2> red;
  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    red = exitCoiterationLoop(builder, loc, reduc);
    exitCoIterationLoop(rewriter, loc, reduc);
  } else {
    red = exitForLoop(builder, loc, reduc);
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
  return red;
}

//===----------------------------------------------------------------------===//
@@ -380,8 +380,8 @@ public:
      ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc = {},
      ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});

  SmallVector<Value, 2> exitCurrentLoop(OpBuilder &builder, Location loc,
                                        ArrayRef<Value> reduc = {});
  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
                       MutableArrayRef<Value> reduc = {});

  /// Returns the array of coordinates for all the loops generated so far.
  void getCoordinateArray(SmallVectorImpl<Value> &coords) const {
@@ -452,17 +452,35 @@ private:
                                   ArrayRef<size_t> dims);

  /// Exits a for loop, returns the reduction results, e.g.,
  /// For sequential for loops:
  /// %ret = for () {
  ///   ...
  ///   %val = addi %args, %c
  ///   yield %val
  /// }
  /// Returns %ret to the user, while %val is provided by users (`reduc`)
  SmallVector<Value, 2> exitForLoop(OpBuilder &builder, Location loc,
                                    ArrayRef<Value> reduc);
  /// For parallel loops, the following code generated by users:
  /// %ret = parallel () init(%args) {
  ///   ...
  ///   %val = op %args, %c
  /// }
  /// will be transformed into
  /// %ret = parallel () init(%args) {
  ///   ...
  ///   scf.reduce(%c) bb0(%0, %1) {
  ///     %val = op %0, %1
  ///     scf.reduce.return %val
  ///   }
  /// }
  /// NOTE: only one instruction will be moved into the reduce block; the
  /// transformation will fail if multiple instructions are used to compute
  /// the reduction value.
  /// Returns %ret to the user, while %val is provided by users (`reduc`).
  void exitForLoop(RewriterBase &rewriter, Location loc,
                   MutableArrayRef<Value> reduc);

  /// Exits a while loop, returns the reduction results.
  SmallVector<Value, 2> exitCoiterationLoop(OpBuilder &builder, Location loc,
                                            ArrayRef<Value> reduc);
  void exitCoIterationLoop(OpBuilder &builder, Location loc,
                           MutableArrayRef<Value> reduc);

  // Whether the loop emitter needs to treat the last tensor as the output
  // tensor.
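To make the transformation described in the exitForLoop comment above concrete, a hedged example assuming a simple f32 addition as the reduction `op` (all value names are illustrative, and %lb, %ub, %one, %buf, and %arg are assumed to be defined by surrounding code). The first snippet is the transient form that exists while the caller fills the loop body against the init-val handle; it is not yet verifier-clean, which is exactly why exitForLoop rewrites it into the second form.

// Body as built by the caller, with %arg standing in for the reduction value:
%ret = scf.parallel (%i) = (%lb) to (%ub) step (%one) init (%arg) -> f32 {
  %c = memref.load %buf[%i] : memref<?xf32>
  %val = arith.addf %arg, %c : f32
  scf.yield
}

// After exitForLoop: the expression is cloned into an scf.reduce block and
// its operands are replaced with the reduce block arguments.
%ret = scf.parallel (%i) = (%lb) to (%ub) step (%one) init (%arg) -> f32 {
  %c = memref.load %buf[%i] : memref<?xf32>
  scf.reduce(%c) : f32 {
  ^bb0(%lhs: f32, %rhs: f32):
    %val = arith.addf %lhs, %rhs : f32
    scf.reduce.return %val : f32
  }
  scf.yield
}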
@@ -410,6 +410,34 @@ static Value getCustomRedId(Operation *op) {
// Sparse compiler synthesis methods (statements and expressions).
//===----------------------------------------------------------------------===//

/// Generates loop boundary statements (entering/exiting loops). The function
/// passes and updates the reduction value.
static Optional<Operation *> genLoopBoundary(
    CodeGen &codegen, Merger &merger,
    function_ref<Optional<Operation *>(MutableArrayRef<Value> reduc)>
        callback) {
  SmallVector<Value, 4> reduc;
  if (codegen.redVal)
    reduc.push_back(codegen.redVal);
  if (codegen.expValues)
    reduc.push_back(codegen.expCount);
  if (codegen.insChain)
    reduc.push_back(codegen.insChain);

  auto r = callback(reduc);

  // Callback should do in-place update on reduction value vector.
  unsigned i = 0;
  if (codegen.redVal)
    updateReduc(merger, codegen, reduc[i++]);
  if (codegen.expValues)
    codegen.expCount = reduc[i++];
  if (codegen.insChain)
    codegen.insChain = reduc[i];

  return r;
}

/// Local bufferization of all dense and sparse data structures.
static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op) {
@@ -869,23 +897,25 @@ static void genExpansion(Merger &merger, CodeGen &codegen, OpBuilder &builder,
/// Returns parallelization strategy. Any implicit loop in the Linalg
/// operation that is marked "parallel" is a candidate. Whether it is actually
/// converted to a parallel operation depends on the requested strategy.
static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
                          bool isSparse) {
static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isSparse) {
  // Reject parallelization of sparse output.
  if (codegen.sparseOut)
    return false;
  // Parallel loops on tensor expansion can cause data races.
  if (codegen.expCount)
    return false;
  // Inspect strategy.
  switch (codegen.options.parallelizationStrategy) {
  case SparseParallelizationStrategy::kNone:
    return false;
  case SparseParallelizationStrategy::kDenseOuterLoop:
    return isOuter && !isSparse && !isReduction;
    return isOuter && !isSparse;
  case SparseParallelizationStrategy::kAnyStorageOuterLoop:
    return isOuter && !isReduction;
    return isOuter;
  case SparseParallelizationStrategy::kDenseAnyLoop:
    return !isSparse && !isReduction;
    return !isSparse;
  case SparseParallelizationStrategy::kAnyStorageAnyLoop:
    return !isReduction;
    return true;
  }
  llvm_unreachable("unexpected parallelization strategy");
}
@@ -898,33 +928,16 @@ static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         ArrayRef<size_t> extraDims) {
  Location loc = op.getLoc();
  auto iteratorTypes = op.getIteratorTypesArray();
  bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
  bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) ||
                  isSingletonDLT(merger.getDimLevelType(tid, idx));
  bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
  assert(!isParallel);

  // Emit a sequential for loop.
  SmallVector<Value, 4> operands;
  if (codegen.redVal)
    operands.push_back(codegen.redVal);
  if (codegen.expValues)
    operands.push_back(codegen.expCount);
  if (codegen.insChain)
    operands.push_back(codegen.insChain);

  Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
      builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);

  unsigned o = 0;
  if (codegen.redVal)
    updateReduc(merger, codegen, operands[o++]);
  if (codegen.expValues)
    codegen.expCount = operands[o++];
  if (codegen.insChain)
    codegen.insChain = operands[o++];
  assert(o == operands.size());
  bool isParallel = isParallelFor(codegen, isOuter, isSparse);

  Operation *loop =
      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
        return codegen.loopEmitter.enterLoopOverTensorAtDim(
            builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
      }).value();
  assert(loop);
  return loop;
}
@@ -934,29 +947,15 @@ static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           ArrayRef<size_t> condTids, ArrayRef<size_t> condDims,
                           ArrayRef<size_t> extraTids,
                           ArrayRef<size_t> extraDims) {
  SmallVector<Value, 4> operands;

  // Construct the while-loop with a parameter for each index.
  if (codegen.redVal)
    operands.push_back(codegen.redVal);
  if (codegen.expValues)
    operands.push_back(codegen.expCount);
  if (codegen.insChain)
    operands.push_back(codegen.insChain);

  Operation *loop = codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
      builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids,
      extraDims);

  unsigned o = 0;
  if (codegen.redVal)
    updateReduc(merger, codegen, operands[o++]);
  if (codegen.expValues)
    codegen.expCount = operands[o++];
  if (codegen.insChain)
    codegen.insChain = operands[o++];
  assert(o == operands.size());

  Operation *loop =
      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
        // Construct the while-loop with a parameter for each index.
        return codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
            builder, op.getLoc(), condTids, condDims, needsUniv, reduc,
            extraTids, extraDims);
      }).value();
  assert(loop);
  return loop;
}
@@ -1186,37 +1185,21 @@ static Operation *startLoop(Merger &merger, CodeGen &codegen,
}

/// Ends a single loop in the current sequence. Returns the new value for needsUniv.
static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                    linalg::GenericOp op, Operation *loop, unsigned idx,
                    unsigned li, bool needsUniv) {
  // End a while-loop.
  if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
                    merger.lat(li).bits, whileOp);
  } else {
    needsUniv = false;
  }

  SmallVector<Value, 2> reduc;
  if (codegen.redVal)
    reduc.push_back(codegen.redVal);
  if (codegen.expValues)
    reduc.push_back(codegen.expCount);
  if (codegen.insChain)
    reduc.push_back(codegen.insChain);

  auto loopRet =
      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
  assert(reduc.size() == loopRet.size());

  unsigned o = 0;
  if (codegen.redVal)
    updateReduc(merger, codegen, loopRet[o++]);
  if (codegen.expValues)
    codegen.expCount = loopRet[o++];
  if (codegen.insChain)
    codegen.insChain = loopRet[o++];
  assert(o == loopRet.size());
  genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
    codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
    return llvm::None;
  });

  return needsUniv;
}
@@ -1,14 +1,13 @@
// RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR0
// FIXME: we do not support vectorization/parallel loops in loop emitter right now
// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
// R_U_N: FileCheck %s --check-prefix=CHECK-PAR1
// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
// R_U_N: FileCheck %s --check-prefix=CHECK-PAR2
// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
// R_U_N: FileCheck %s --check-prefix=CHECK-PAR3
// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
// R_U_N: FileCheck %s --check-prefix=CHECK-PAR4
// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR1
// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR2
// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR3
// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR4

#DenseMatrix = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "dense" ]

@@ -151,7 +150,8 @@ func.func @scale_ss(%scale: f32,
//
// CHECK-PAR4-LABEL: func @matvec
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: scf.for
// CHECK-PAR4: scf.parallel
// CHECK-PAR4: scf.reduce
// CHECK-PAR4: return
//
func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir (new file, 63 lines)
@@ -0,0 +1,63 @@
// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
// RUN: FileCheck %s

#CSR = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ]
}>

#trait_matvec = {
  indexing_maps = [
    affine_map<(i,j) -> (i,j)>, // A
    affine_map<(i,j) -> (j)>,   // b
    affine_map<(i,j) -> (i)>    // x (out)
  ],
  iterator_types = ["parallel", "reduction"],
  doc = "x(i) += A(i,j) * b(j)"
}
// CHECK-LABEL: func.func @matvec(
// CHECK-SAME: %[[TMP_arg0:.*]]: tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>>,
// CHECK-SAME: %[[TMP_arg1:.*]]: tensor<32xf32>,
// CHECK-SAME: %[[TMP_arg2:.*]]: tensor<16xf32>) -> tensor<16xf32> {
// CHECK-DAG: %[[TMP_c16:.*]] = arith.constant 16 : index
// CHECK-DAG: %[[TMP_c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[TMP_c1:.*]] = arith.constant 1 : index
// CHECK: %[[TMP_0:.*]] = sparse_tensor.pointers %[[TMP_arg0]] {dimension = 1 : index}
// CHECK: %[[TMP_1:.*]] = sparse_tensor.indices %[[TMP_arg0]] {dimension = 1 : index}
// CHECK: %[[TMP_2:.*]] = sparse_tensor.values %[[TMP_arg0]]
// CHECK: %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : memref<32xf32>
// CHECK: %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : memref<16xf32>
// CHECK: scf.parallel (%[[TMP_arg3:.*]]) = (%[[TMP_c0]]) to (%[[TMP_c16]]) step (%[[TMP_c1]]) {
// CHECK: %[[TMP_6:.*]] = memref.load %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
// CHECK: %[[TMP_7:.*]] = memref.load %[[TMP_0]][%[[TMP_arg3]]] : memref<?xindex>
// CHECK: %[[TMP_8:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_9:.*]] = memref.load %[[TMP_0]][%[[TMP_8]]] : memref<?xindex>
// CHECK: %[[TMP_10:.*]] = scf.parallel (%[[TMP_arg4:.*]]) = (%[[TMP_7]]) to (%[[TMP_9]]) step (%[[TMP_c1]]) init (%[[TMP_6]]) -> f32 {
// CHECK: %[[TMP_11:.*]] = memref.load %[[TMP_1]][%[[TMP_arg4]]] : memref<?xindex>
// CHECK: %[[TMP_12:.*]] = memref.load %[[TMP_2]][%[[TMP_arg4]]] : memref<?xf32>
// CHECK: %[[TMP_13:.*]] = memref.load %[[TMP_3]][%[[TMP_11]]] : memref<32xf32>
// CHECK: %[[TMP_14:.*]] = arith.mulf %[[TMP_12]], %[[TMP_13]] : f32
// CHECK: scf.reduce(%[[TMP_14]]) : f32 {
// CHECK: ^bb0(%[[TMP_arg5:.*]]: f32, %[[TMP_arg6:.*]]: f32):
// CHECK: %[[TMP_15:.*]] = arith.addf %[[TMP_arg5]], %[[TMP_arg6]] : f32
// CHECK: scf.reduce.return %[[TMP_15]] : f32
// CHECK: }
// CHECK: scf.yield
// CHECK: }
// CHECK: memref.store %[[TMP_10]], %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
// CHECK: scf.yield
// CHECK: }
// CHECK: %[[TMP_5:.*]] = bufferization.to_tensor %[[TMP_4]] : memref<16xf32>
// CHECK: return %[[TMP_5]] : tensor<16xf32>
func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
                  %argb: tensor<32xf32>,
                  %argx: tensor<16xf32>) -> tensor<16xf32> {
  %0 = linalg.generic #trait_matvec
    ins(%arga, %argb : tensor<16x32xf32, #CSR>, tensor<32xf32>)
    outs(%argx: tensor<16xf32>) {
    ^bb(%A: f32, %b: f32, %x: f32):
      %0 = arith.mulf %A, %b : f32
      %1 = arith.addf %0, %x : f32
      linalg.yield %1 : f32
  } -> tensor<16xf32>
  return %0 : tensor<16xf32>
}
@@ -2,6 +2,14 @@
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with parallelization.
//
// RUN: mlir-opt %s --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

#CSR = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
@@ -4,6 +4,16 @@
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with parallelization.
//
// RUN: mlir-opt %s \
// RUN: --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
// RUN: mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

!Filename = !llvm.ptr<i8>