diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 723a262f24ac..d143954b78fc 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -299,53 +299,8 @@ LogicalResult separateFullTiles(MutableArrayRef<AffineForOp> nest,
                                 SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
 
-/// Walk either an scf.for or an affine.for to find a band to coalesce.
-template <typename LoopOpTy>
-LogicalResult coalescePerfectlyNestedLoops(LoopOpTy op) {
-  LogicalResult result(failure());
-  SmallVector<LoopOpTy> loops;
-  getPerfectlyNestedLoops(loops, op);
-
-  // Look for a band of loops that can be coalesced, i.e. perfectly nested
-  // loops with bounds defined above some loop.
-  // 1. For each loop, find above which parent loop its operands are
-  // defined.
-  SmallVector<unsigned> operandsDefinedAbove(loops.size());
-  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
-    operandsDefinedAbove[i] = i;
-    for (unsigned j = 0; j < i; ++j) {
-      if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
-        operandsDefinedAbove[i] = j;
-        break;
-      }
-    }
-  }
-
-  // 2. Identify bands of loops such that the operands of all of them are
-  // defined above the first loop in the band. Traverse the nest bottom-up
-  // so that modifications don't invalidate the inner loops.
-  for (unsigned end = loops.size(); end > 0; --end) {
-    unsigned start = 0;
-    for (; start < end - 1; ++start) {
-      auto maxPos =
-          *std::max_element(std::next(operandsDefinedAbove.begin(), start),
-                            std::next(operandsDefinedAbove.begin(), end));
-      if (maxPos > start)
-        continue;
-      assert(maxPos == start &&
-             "expected loop bounds to be known at the start of the band");
-      auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
-      if (succeeded(coalesceLoops(band)))
-        result = success();
-      break;
-    }
-    // If a band was found and transformed, keep looking at the loops above
-    // the outermost transformed loop.
-    if (start != end - 1)
-      end = start + 1;
-  }
-  return result;
-}
+/// Walk an affine.for to find a band to coalesce.
+LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);
 
 } // namespace affine
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index 883d11bcc4df..bc09cc7f7fa5 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -100,11 +100,16 @@ getSCFMinMaxExpr(Value value, SmallVectorImpl<Value> &dims,
 /// `loops` contains a list of perfectly nested loops with bounds and steps
 /// independent of any loop induction variable involved in the nest.
 LogicalResult coalesceLoops(MutableArrayRef<scf::ForOp> loops);
+LogicalResult coalesceLoops(RewriterBase &rewriter,
+                            MutableArrayRef<scf::ForOp>);
+
+/// Walk an scf.for to find a band to coalesce.
+LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op);
 
 /// Take the ParallelLoop and for each set of dimension indices, combine them
 /// into a single dimension. combinedDimensions must contain each index into
 /// loops exactly once.
-void collapseParallelLoops(scf::ParallelOp loops,
+void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops,
                            ArrayRef<std::vector<unsigned>> combinedDimensions);
 
 /// Unrolls this for operation by the specified unroll factor.
Returns failure diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 15b1c3892948..2562301e499d 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -15,6 +15,7 @@ #include "llvm/Support/TypeName.h" #include +using llvm::SmallPtrSetImpl; namespace mlir { class PatternRewriter; @@ -704,6 +705,8 @@ public: return user != exceptedUser; }); } + void replaceAllUsesExcept(Value from, Value to, + const SmallPtrSetImpl &preservedUsers); /// Used to notify the listener that the IR failed to be rewritten because of /// a match failure, and provide a callback to populate a diagnostic with the diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp index 1dc69ab493d4..05c77070a70c 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp @@ -39,9 +39,9 @@ struct LoopCoalescingPass func::FuncOp func = getOperation(); func.walk([](Operation *op) { if (auto scfForOp = dyn_cast(op)) - (void)coalescePerfectlyNestedLoops(scfForOp); + (void)coalescePerfectlyNestedSCFForLoops(scfForOp); else if (auto affineForOp = dyn_cast(op)) - (void)coalescePerfectlyNestedLoops(affineForOp); + (void)coalescePerfectlyNestedAffineLoops(affineForOp); }); } }; diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index af59973d7a92..268050a30e00 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -2765,3 +2765,51 @@ mlir::affine::separateFullTiles(MutableArrayRef inputNest, return success(); } + +LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) { + LogicalResult result(failure()); + SmallVector loops; + getPerfectlyNestedLoops(loops, op); + if (loops.size() <= 1) + return success(); + + // Look for a band of loops that can be coalesced, i.e. perfectly nested + // loops with bounds defined above some loop. + // 1. For each loop, find above which parent loop its operands are + // defined. + SmallVector operandsDefinedAbove(loops.size()); + for (unsigned i = 0, e = loops.size(); i < e; ++i) { + operandsDefinedAbove[i] = i; + for (unsigned j = 0; j < i; ++j) { + if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) { + operandsDefinedAbove[i] = j; + break; + } + } + } + + // 2. Identify bands of loops such that the operands of all of them are + // defined above the first loop in the band. Traverse the nest bottom-up + // so that modifications don't invalidate the inner loops. + for (unsigned end = loops.size(); end > 0; --end) { + unsigned start = 0; + for (; start < end - 1; ++start) { + auto maxPos = + *std::max_element(std::next(operandsDefinedAbove.begin(), start), + std::next(operandsDefinedAbove.begin(), end)); + if (maxPos > start) + continue; + assert(maxPos == start && + "expected loop bounds to be known at the start of the band"); + auto band = llvm::MutableArrayRef(loops.data() + start, end - start); + if (succeeded(coalesceLoops(band))) + result = success(); + break; + } + // If a band was found and transformed, keep looking at the loops above + // the outermost transformed loop. 
+ if (start != end - 1) + end = start + 1; + } + return result; +} diff --git a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp index c09184148208..7e4faf8b73af 100644 --- a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp +++ b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp @@ -332,9 +332,9 @@ transform::LoopCoalesceOp::applyToOne(transform::TransformRewriter &rewriter, transform::TransformState &state) { LogicalResult result(failure()); if (scf::ForOp scfForOp = dyn_cast(op)) - result = coalescePerfectlyNestedLoops(scfForOp); + result = coalescePerfectlyNestedSCFForLoops(scfForOp); else if (AffineForOp affineForOp = dyn_cast(op)) - result = coalescePerfectlyNestedLoops(affineForOp); + result = coalescePerfectlyNestedAffineLoops(affineForOp); results.push_back(op); if (failed(result)) { diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp index a69df025bcba..6ba7020e86fa 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp @@ -28,6 +28,7 @@ namespace { struct TestSCFParallelLoopCollapsing : public impl::TestSCFParallelLoopCollapsingBase< TestSCFParallelLoopCollapsing> { + void runOnOperation() override { Operation *module = getOperation(); @@ -88,6 +89,7 @@ struct TestSCFParallelLoopCollapsing // Only apply the transformation on parallel loops where the specified // transformation is valid, but do NOT early abort in the case of invalid // loops. + IRRewriter rewriter(&getContext()); module->walk([&](scf::ParallelOp op) { if (flattenedCombinedLoops.size() != op.getNumLoops()) { op.emitOpError("has ") @@ -97,7 +99,7 @@ struct TestSCFParallelLoopCollapsing << flattenedCombinedLoops.size() << " iter args."; return; } - collapseParallelLoops(op, combinedLoops); + collapseParallelLoops(rewriter, op, combinedLoops); }); } }; diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 914aeb4fa79f..9279081cfd45 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinOps.h" @@ -472,61 +473,6 @@ LogicalResult mlir::loopUnrollByFactor( return success(); } -/// Return the new lower bound, upper bound, and step in that order. Insert any -/// additional bounds calculations before the given builder and any additional -/// conversion back to the original loop induction value inside the given Block. -static LoopParams normalizeLoop(OpBuilder &boundsBuilder, - OpBuilder &insideLoopBuilder, Location loc, - Value lowerBound, Value upperBound, Value step, - Value inductionVar) { - // Check if the loop is already known to have a constant zero lower bound or - // a constant one step. - bool isZeroBased = false; - if (auto ubCst = getConstantIntValue(lowerBound)) - isZeroBased = ubCst.value() == 0; - - bool isStepOne = false; - if (auto stepCst = getConstantIntValue(step)) - isStepOne = stepCst.value() == 1; - - // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) - // assuming the step is strictly positive. 
Update the bounds and the step - // of the loop to go from 0 to the number of iterations, if necessary. - if (isZeroBased && isStepOne) - return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound, - /*step=*/step}; - - Value diff = boundsBuilder.create(loc, upperBound, lowerBound); - Value newUpperBound = - boundsBuilder.create(loc, diff, step); - - Value newLowerBound = - isZeroBased ? lowerBound - : boundsBuilder.create( - loc, boundsBuilder.getZeroAttr(lowerBound.getType())); - Value newStep = - isStepOne ? step - : boundsBuilder.create( - loc, boundsBuilder.getIntegerAttr(step.getType(), 1)); - - // Insert code computing the value of the original loop induction variable - // from the "normalized" one. - Value scaled = - isStepOne - ? inductionVar - : insideLoopBuilder.create(loc, inductionVar, step); - Value shifted = - isZeroBased - ? scaled - : insideLoopBuilder.create(loc, scaled, lowerBound); - - SmallPtrSet preserve{scaled.getDefiningOp(), - shifted.getDefiningOp()}; - inductionVar.replaceAllUsesExcept(shifted, preserve); - return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound, - /*step=*/newStep}; -} - /// Transform a loop with a strictly positive step /// for %i = %lb to %ub step %s /// into a 0-based loop with step 1 @@ -536,19 +482,107 @@ static LoopParams normalizeLoop(OpBuilder &boundsBuilder, /// expected to be either `loop` or another loop perfectly nested under `loop`. /// Insert the definition of new bounds immediate before `outer`, which is /// expected to be either `loop` or its parent in the loop nest. -static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) { - OpBuilder builder(outer); - OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody()); - auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(), - loop.getLowerBound(), loop.getUpperBound(), - loop.getStep(), loop.getInductionVar()); +static LoopParams emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc, + Value lb, Value ub, Value step) { + // For non-index types, generate `arith` instructions + // Check if the loop is already known to have a constant zero lower bound or + // a constant one step. + bool isZeroBased = false; + if (auto lbCst = getConstantIntValue(lb)) + isZeroBased = lbCst.value() == 0; - loop.setLowerBound(loopPieces.lowerBound); - loop.setUpperBound(loopPieces.upperBound); - loop.setStep(loopPieces.step); + bool isStepOne = false; + if (auto stepCst = getConstantIntValue(step)) + isStepOne = stepCst.value() == 1; + + // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) + // assuming the step is strictly positive. Update the bounds and the step + // of the loop to go from 0 to the number of iterations, if necessary. + if (isZeroBased && isStepOne) + return {lb, ub, step}; + + Value diff = isZeroBased ? ub : rewriter.create(loc, ub, lb); + Value newUpperBound = + isStepOne ? diff : rewriter.create(loc, diff, step); + + Value newLowerBound = isZeroBased + ? lb + : rewriter.create( + loc, rewriter.getZeroAttr(lb.getType())); + Value newStep = isStepOne + ? 
step + : rewriter.create( + loc, rewriter.getIntegerAttr(step.getType(), 1)); + + return {newLowerBound, newUpperBound, newStep}; } -LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { +/// Get back the original induction variable values after loop normalization +static void denormalizeInductionVariable(RewriterBase &rewriter, Location loc, + Value normalizedIv, Value origLb, + Value origStep) { + Value denormalizedIv; + SmallPtrSet preserve; + bool isStepOne = isConstantIntValue(origStep, 1); + bool isZeroBased = isConstantIntValue(origLb, 0); + + Value scaled = normalizedIv; + if (!isStepOne) { + scaled = rewriter.create(loc, normalizedIv, origStep); + preserve.insert(scaled.getDefiningOp()); + } + denormalizedIv = scaled; + if (!isZeroBased) { + denormalizedIv = rewriter.create(loc, scaled, origLb); + preserve.insert(denormalizedIv.getDefiningOp()); + } + + rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve); +} + +/// Helper function to multiply a sequence of values. +static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc, + ArrayRef values) { + assert(!values.empty() && "unexpected empty list"); + Value productOf = values.front(); + for (auto v : values.drop_front()) { + productOf = rewriter.create(loc, productOf, v); + } + return productOf; +} + +/// For each original loop, the value of the +/// induction variable can be obtained by dividing the induction variable of +/// the linearized loop by the total number of iterations of the loops nested +/// in it modulo the number of iterations in this loop (remove the values +/// related to the outer loops): +/// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. +/// Compute these iteratively from the innermost loop by creating a "running +/// quotient" of division by the range. +static std::pair, SmallPtrSet> +delinearizeInductionVariable(RewriterBase &rewriter, Location loc, + Value linearizedIv, ArrayRef ubs) { + Value previous = linearizedIv; + SmallVector delinearizedIvs(ubs.size()); + SmallPtrSet preservedUsers; + for (unsigned i = 0, e = ubs.size(); i < e; ++i) { + unsigned idx = ubs.size() - i - 1; + if (i != 0) { + previous = rewriter.create(loc, previous, ubs[idx + 1]); + preservedUsers.insert(previous.getDefiningOp()); + } + Value iv = previous; + if (i != e - 1) { + iv = rewriter.create(loc, previous, ubs[idx]); + preservedUsers.insert(iv.getDefiningOp()); + } + delinearizedIvs[idx] = iv; + } + return {delinearizedIvs, preservedUsers}; +} + +LogicalResult mlir::coalesceLoops(RewriterBase &rewriter, + MutableArrayRef loops) { if (loops.size() < 2) return failure(); @@ -557,57 +591,148 @@ LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { // 1. Make sure all loops iterate from 0 to upperBound with step 1. This // allows the following code to assume upperBound is the number of iterations. 
- for (auto loop : loops) - normalizeLoop(loop, outermost, innermost); + for (auto loop : loops) { + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(outermost); + Value lb = loop.getLowerBound(); + Value ub = loop.getUpperBound(); + Value step = loop.getStep(); + auto newLoopParams = + emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step); + + rewriter.modifyOpInPlace(loop, [&]() { + loop.setLowerBound(newLoopParams.lowerBound); + loop.setUpperBound(newLoopParams.upperBound); + loop.setStep(newLoopParams.step); + }); + + rewriter.setInsertionPointToStart(innermost.getBody()); + denormalizeInductionVariable(rewriter, loop.getLoc(), + loop.getInductionVar(), lb, step); + } // 2. Emit code computing the upper bound of the coalesced loop as product // of the number of iterations of all loops. - OpBuilder builder(outermost); + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(outermost); Location loc = outermost.getLoc(); - Value upperBound = outermost.getUpperBound(); - for (auto loop : loops.drop_front()) - upperBound = - builder.create(loc, upperBound, loop.getUpperBound()); + SmallVector upperBounds = llvm::map_to_vector( + loops, [](auto loop) { return loop.getUpperBound(); }); + Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds); outermost.setUpperBound(upperBound); - builder.setInsertionPointToStart(outermost.getBody()); + rewriter.setInsertionPointToStart(innermost.getBody()); + auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable( + rewriter, loc, outermost.getInductionVar(), upperBounds); + rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0], + preservedUsers); - // 3. Remap induction variables. For each original loop, the value of the - // induction variable can be obtained by dividing the induction variable of - // the linearized loop by the total number of iterations of the loops nested - // in it modulo the number of iterations in this loop (remove the values - // related to the outer loops): - // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. - // Compute these iteratively from the innermost loop by creating a "running - // quotient" of division by the range. - Value previous = outermost.getInductionVar(); - for (unsigned i = 0, e = loops.size(); i < e; ++i) { - unsigned idx = loops.size() - i - 1; - if (i != 0) - previous = builder.create(loc, previous, - loops[idx + 1].getUpperBound()); + for (int i = loops.size() - 1; i > 0; --i) { + auto outerLoop = loops[i - 1]; + auto innerLoop = loops[i]; - Value iv = (i == e - 1) ? previous - : builder.create( - loc, previous, loops[idx].getUpperBound()); - replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv, - loops.back().getRegion()); + Operation *innerTerminator = innerLoop.getBody()->getTerminator(); + auto yieldedVals = llvm::to_vector(innerTerminator->getOperands()); + rewriter.eraseOp(innerTerminator); + + SmallVector innerBlockArgs; + innerBlockArgs.push_back(delinearizeIvs[i]); + llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs()); + rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(), + Block::iterator(innerLoop), innerBlockArgs); + rewriter.replaceOp(innerLoop, yieldedVals); } - - // 4. Move the operations from the innermost just above the second-outermost - // loop, delete the extra terminator and the second-outermost loop. 
- scf::ForOp second = loops[1]; - innermost.getBody()->back().erase(); - outermost.getBody()->getOperations().splice( - Block::iterator(second.getOperation()), - innermost.getBody()->getOperations()); - second.erase(); return success(); } +LogicalResult mlir::coalesceLoops(MutableArrayRef loops) { + if (loops.empty()) { + return failure(); + } + IRRewriter rewriter(loops.front().getContext()); + return coalesceLoops(rewriter, loops); +} + +LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) { + LogicalResult result(failure()); + SmallVector loops; + getPerfectlyNestedLoops(loops, op); + + // Look for a band of loops that can be coalesced, i.e. perfectly nested + // loops with bounds defined above some loop. + + // 1. For each loop, find above which parent loop its bounds operands are + // defined. + SmallVector operandsDefinedAbove(loops.size()); + for (unsigned i = 0, e = loops.size(); i < e; ++i) { + operandsDefinedAbove[i] = i; + for (unsigned j = 0; j < i; ++j) { + SmallVector boundsOperands = {loops[i].getLowerBound(), + loops[i].getUpperBound(), + loops[i].getStep()}; + if (areValuesDefinedAbove(boundsOperands, loops[j].getRegion())) { + operandsDefinedAbove[i] = j; + break; + } + } + } + + // 2. For each inner loop check that the iter_args for the immediately outer + // loop are the init for the immediately inner loop and that the yields of the + // return of the inner loop is the yield for the immediately outer loop. Keep + // track of where the chain starts from for each loop. + SmallVector iterArgChainStart(loops.size()); + iterArgChainStart[0] = 0; + for (unsigned i = 1, e = loops.size(); i < e; ++i) { + // By default set the start of the chain to itself. + iterArgChainStart[i] = i; + auto outerloop = loops[i - 1]; + auto innerLoop = loops[i]; + if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) { + continue; + } + if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) { + continue; + } + auto outerloopTerminator = outerloop.getBody()->getTerminator(); + if (!llvm::equal(outerloopTerminator->getOperands(), + innerLoop.getResults())) { + continue; + } + iterArgChainStart[i] = iterArgChainStart[i - 1]; + } + + // 3. Identify bands of loops such that the operands of all of them are + // defined above the first loop in the band. Traverse the nest bottom-up + // so that modifications don't invalidate the inner loops. + for (unsigned end = loops.size(); end > 0; --end) { + unsigned start = 0; + for (; start < end - 1; ++start) { + auto maxPos = + *std::max_element(std::next(operandsDefinedAbove.begin(), start), + std::next(operandsDefinedAbove.begin(), end)); + if (maxPos > start) + continue; + if (iterArgChainStart[end - 1] > start) + continue; + auto band = llvm::MutableArrayRef(loops.data() + start, end - start); + if (succeeded(coalesceLoops(band))) + result = success(); + break; + } + // If a band was found and transformed, keep looking at the loops above + // the outermost transformed loop. + if (start != end - 1) + end = start + 1; + } + return result; +} + void mlir::collapseParallelLoops( - scf::ParallelOp loops, ArrayRef> combinedDimensions) { - OpBuilder outsideBuilder(loops); + RewriterBase &rewriter, scf::ParallelOp loops, + ArrayRef> combinedDimensions) { + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(loops); Location loc = loops.getLoc(); // Presort combined dimensions. 
@@ -619,25 +744,29 @@ void mlir::collapseParallelLoops( SmallVector normalizedLowerBounds, normalizedSteps, normalizedUpperBounds; for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) { - OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody()); - auto resultBounds = - normalizeLoop(outsideBuilder, insideLoopBuilder, loc, - loops.getLowerBound()[i], loops.getUpperBound()[i], - loops.getStep()[i], loops.getBody()->getArgument(i)); + OpBuilder::InsertionGuard g2(rewriter); + rewriter.setInsertionPoint(loops); + Value lb = loops.getLowerBound()[i]; + Value ub = loops.getUpperBound()[i]; + Value step = loops.getStep()[i]; + auto newLoopParams = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step); + normalizedLowerBounds.push_back(newLoopParams.lowerBound); + normalizedUpperBounds.push_back(newLoopParams.upperBound); + normalizedSteps.push_back(newLoopParams.step); - normalizedLowerBounds.push_back(resultBounds.lowerBound); - normalizedUpperBounds.push_back(resultBounds.upperBound); - normalizedSteps.push_back(resultBounds.step); + rewriter.setInsertionPointToStart(loops.getBody()); + denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb, + step); } // Combine iteration spaces. SmallVector lowerBounds, upperBounds, steps; - auto cst0 = outsideBuilder.create(loc, 0); - auto cst1 = outsideBuilder.create(loc, 1); + auto cst0 = rewriter.create(loc, 0); + auto cst1 = rewriter.create(loc, 1); for (auto &sortedDimension : sortedDimensions) { - Value newUpperBound = outsideBuilder.create(loc, 1); + Value newUpperBound = rewriter.create(loc, 1); for (auto idx : sortedDimension) { - newUpperBound = outsideBuilder.create( + newUpperBound = rewriter.create( loc, newUpperBound, normalizedUpperBounds[idx]); } lowerBounds.push_back(cst0); @@ -651,7 +780,7 @@ void mlir::collapseParallelLoops( // value. The remainders then determine based on that range, which iteration // of the original induction value this represents. This is a normalized value // that is un-normalized already by the previous logic. 
- auto newPloop = outsideBuilder.create( + auto newPloop = rewriter.create( loc, lowerBounds, upperBounds, steps, [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) { for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) { diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index 5944a0ea46a1..286f47ce6913 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -11,6 +11,7 @@ #include "mlir/IR/IRMapping.h" #include "mlir/IR/Iterators.h" #include "mlir/IR/RegionKindInterface.h" +#include "llvm/ADT/SmallPtrSet.h" using namespace mlir; @@ -250,6 +251,14 @@ void RewriterBase::finalizeOpModification(Operation *op) { rewriteListener->notifyOperationModified(op); } +void RewriterBase::replaceAllUsesExcept( + Value from, Value to, const SmallPtrSetImpl &preservedUsers) { + return replaceUsesWithIf(from, to, [&](OpOperand &use) { + Operation *user = use.getOwner(); + return !preservedUsers.contains(user); + }); +} + void RewriterBase::replaceUsesWithIf(Value from, Value to, function_ref functor, bool *allUsesReplaced) { diff --git a/mlir/test/Dialect/Affine/loop-coalescing.mlir b/mlir/test/Dialect/Affine/loop-coalescing.mlir index 9c17fb24be69..ae0adf5a0a02 100644 --- a/mlir/test/Dialect/Affine/loop-coalescing.mlir +++ b/mlir/test/Dialect/Affine/loop-coalescing.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing %s | FileCheck %s +// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse %s | FileCheck %s // CHECK-LABEL: @one_3d_nest func.func @one_3d_nest() { @@ -239,19 +239,15 @@ func.func @coalesce_affine_for(%arg0: memref) { } return } -// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]] -// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]] -// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]] -// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]] -// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]] -// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]] -// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]] +// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]] +// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]] +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T0]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T2]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T0]]] +// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T0]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T0]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T0]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // CHECK-NEXT: return @@ -277,18 +273,16 @@ func.func @coalesce_affine_for(%arg0: memref) { } return } -// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK-DAG: %[[T2:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]] -// CHECK-DAG: 
%[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]] -// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]] -// CHECK-DAG: %[[T5:.*]] = affine.apply #[[SIXTY_FOUR]]() -// CHECK-DAG: %[[T6:.*]] = affine.apply #[[PRODUCT]](%[[T4]])[%[[T5]]] -// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T5]]] -// CHECK-DAG: %[[T8:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T5]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T8]])[%[[T3]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T8]])[%[[T3]]] +// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]] +// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]] +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[SIXTY_FOUR]]() +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T2]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T2]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T2]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T0]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T0]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // CHECK-NEXT: return @@ -316,19 +310,16 @@ func.func @coalesce_affine_for(%arg0: memref) { } return } -// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref -// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]] -// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]] -// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]] -// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]] -// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]] -// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]] -// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]] -// CHECK-DAG: %[[T9:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T6]]] -// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]] -// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T9]])[%[[T4]]] +// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref +// CHECK-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[DIM]]] +// CHECK-DAG: %[[T1:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]] +// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]] +// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T1]]] +// CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]] +// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T1]]] +// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T1]]] +// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T1]]] +// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T1]]] // CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]]) // CHECK-NEXT: } // CHECK-NEXT: return @@ -342,12 +333,14 @@ func.func @coalesce_affine_for(%arg0: memref) { func.func @test_loops_do_not_get_coalesced() { affine.for %i = 0 to 7 { affine.for %j = #map0(%i) to min #map1(%i) { + "use"(%i, %j) : (index, index) -> () } } return } // CHECK: affine.for %[[IV0:.*]] = 0 to 7 // CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]]) +// CHECK-NEXT: "use"(%[[IV0]], %[[IV1]]) // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: return diff --git a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir 
b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir index 2d59331b72cf..4dc3e4ea0ef4 100644 --- a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir +++ b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse | FileCheck %s func.func @coalesce_inner() { %c0 = arith.constant 0 : index @@ -14,7 +14,7 @@ func.func @coalesce_inner() { scf.for %k = %i to %j step %c1 { // Inner loop must have been removed. scf.for %l = %i to %j step %c1 { - arith.addi %i, %j : index + "use"(%i, %j) : (index, index) -> () } } {coalesce} } @@ -33,13 +33,19 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-DAG: #[[MAP:.+]] = affine_map<() -> (64)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 mod s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 floordiv s0)> func.func @coalesce_outer(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64xf32, 1>, %arg3: memref<64x64xf32, 1>) attributes {} { + // CHECK: %[[T0:.+]] = affine.apply #[[MAP]]() + // CHECK: %[[UB:.+]] = affine.apply #[[MAP1]](%[[T0]])[%[[T0]]] // CHECK: affine.for %[[IV1:.+]] = 0 to %[[UB:.+]] { // CHECK-NOT: affine.for %[[IV2:.+]] affine.for %arg4 = 0 to 64 { affine.for %arg5 = 0 to 64 { - // CHECK: %[[IDX0:.+]] = affine.apply #[[MAP0:.+]](%[[IV1]])[%{{.+}}] - // CHECK: %[[IDX1:.+]] = affine.apply #[[MAP1:.+]](%[[IV1]])[%{{.+}}] + // CHECK: %[[IDX0:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%{{.+}}] + // CHECK: %[[IDX1:.+]] = affine.apply #[[MAP3]](%[[IV1]])[%{{.+}}] // CHECK-NEXT: %{{.+}} = affine.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1> %0 = affine.load %arg1[%arg4, %arg5] : memref<64x64xf32, 1> %1 = affine.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1> @@ -96,3 +102,200 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func @tensor_loops(%arg0 : tensor, %lb0 : index, %ub0 : index, %step0 : index, + %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> tensor { + %0 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg1 = %arg0) -> tensor { + %1 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg2 = %arg1) -> tensor { + %2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg3 = %arg2) -> tensor { + %3 = "use"(%arg3, %i, %j, %k) : (tensor, index, index, index) -> (tensor) + scf.yield %3 : tensor + } + scf.yield %2 : tensor + } + scf.yield %1 : tensor + } {coalesce} + return %0 : tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> + %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) + transform.yield + } +} +// CHECK: func.func @tensor_loops( +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: 
index +// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index +// CHECK: %[[NEWUB0_DIFF:.+]] = arith.subi %[[UB0]], %[[LB0]] +// CHECK-DAG: %[[NEWUB0:.+]] = arith.ceildivsi %[[NEWUB0_DIFF]], %[[STEP0]] +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 +// CHECK: %[[NEWUB1_DIFF:.+]] = arith.subi %[[UB1]], %[[LB1]] +// CHECK-DAG: %[[NEWUB1:.+]] = arith.ceildivsi %[[NEWUB1_DIFF]], %[[STEP1]] +// CHECK: %[[NEWUB2_DIFF:.+]] = arith.subi %[[UB2]], %[[LB2]] +// CHECK-DAG: %[[NEWUB2:.+]] = arith.ceildivsi %[[NEWUB2_DIFF]], %[[STEP2]] +// CHECK: %[[PROD1:.+]] = arith.muli %[[NEWUB0]], %[[NEWUB1]] +// CHECK: %[[NEWUB:.+]] = arith.muli %[[PROD1]], %[[NEWUB2]] +// CHECK: %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NEWUB]] step %[[C1]] iter_args(%[[ITER_ARG:.+]] = %[[ARG0]]) +// CHECK: %[[IV2:.+]] = arith.remsi %[[IV]], %[[NEWUB2]] +// CHECK: %[[PREVIOUS:.+]] = arith.divsi %[[IV]], %[[NEWUB2]] +// CHECK: %[[IV1:.+]] = arith.remsi %[[PREVIOUS]], %[[NEWUB1]] +// CHECK: %[[IV0:.+]] = arith.divsi %[[PREVIOUS]], %[[NEWUB1]] +// CHECK: %[[K_STEP:.+]] = arith.muli %[[IV2]], %[[STEP2]] +// CHECK: %[[K:.+]] = arith.addi %[[K_STEP]], %[[LB2]] +// CHECK: %[[J_STEP:.+]] = arith.muli %[[IV1]], %[[STEP1]] +// CHECK: %[[J:.+]] = arith.addi %[[J_STEP]], %[[LB1]] +// CHECK: %[[I_STEP:.+]] = arith.muli %[[IV0]], %[[STEP0]] +// CHECK: %[[I:.+]] = arith.addi %[[I_STEP]], %[[LB0]] +// CHECK: %[[USE:.+]] = "use"(%[[ITER_ARG]], %[[I]], %[[J]], %[[K]]) +// CHECK: scf.yield %[[USE]] +// CHECK: return %[[RESULT]] + +// ----- + +// Coalesce only first two loops, but not the last since the iter_args dont line up +func.func @tensor_loops_first_two(%arg0 : tensor, %arg1 : tensor, %lb0 : index, %ub0 : index, %step0 : index, + %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor, tensor) { + %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor, tensor) { + %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor, tensor) { + %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg5, %arg7 = %arg4) -> (tensor, tensor) { + %3:2 = "use"(%arg3, %i, %j, %k) : (tensor, index, index, index) -> (tensor, tensor) + scf.yield %3#0, %3#1 : tensor, tensor + } + scf.yield %2#0, %2#1 : tensor, tensor + } + scf.yield %1#0, %1#1 : tensor, tensor + } {coalesce} + return %0#0, %0#1 : tensor, tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> + %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) + transform.yield + } +} +// CHECK: func.func @tensor_loops_first_two( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index +// CHECK: 
scf.for +// CHECK: arith.remsi +// CHECK: arith.divsi +// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]] +// CHECK-NOT: scf.for +// CHECK: transform.named_sequence + +// ----- + +// Coalesce only first two loops, but not the last since the yields dont match up +func.func @tensor_loops_first_two_2(%arg0 : tensor, %arg1 : tensor, %lb0 : index, %ub0 : index, %step0 : index, + %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor, tensor) { + %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor, tensor) { + %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor, tensor) { + %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg4, %arg7 = %arg5) -> (tensor, tensor) { + %3:2 = "use"(%arg3, %i, %j, %k) : (tensor, index, index, index) -> (tensor, tensor) + scf.yield %3#0, %3#1 : tensor, tensor + } + scf.yield %2#1, %2#0 : tensor, tensor + } + scf.yield %1#0, %1#1 : tensor, tensor + } {coalesce} + return %0#0, %0#1 : tensor, tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for"> + %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) + transform.yield + } +} +// CHECK: func.func @tensor_loops_first_two_2( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index +// CHECK: scf.for +// CHECK: arith.remsi +// CHECK: arith.divsi +// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]] +// CHECK-NOT: scf.for +// CHECK: transform.named_sequence + +// ----- + +// Coalesce only last two loops, but not the first since the yields dont match up +func.func @tensor_loops_last_two(%arg0 : tensor, %arg1 : tensor, %lb0 : index, %ub0 : index, %step0 : index, + %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor, tensor) { + %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor, tensor) { + %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor, tensor) { + %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg4, %arg7 = %arg5) -> (tensor, tensor) { + %3:2 = "use"(%arg3, %i, %j, %k) : (tensor, index, index, index) -> (tensor, tensor) + scf.yield %3#0, %3#1 : tensor, tensor + } + scf.yield %2#0, %2#1 : tensor, tensor + } + scf.yield %1#1, %1#0 : tensor, tensor + } {coalesce} + return %0#0, %0#1 : tensor, tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.cast %0 : 
!transform.any_op to !transform.op<"scf.for"> + %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">) + transform.yield + } +} +// CHECK: func.func @tensor_loops_last_two( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index +// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index +// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB0]] to %[[UB0]] step %[[STEP0]] +// CHECK: arith.subi +// CHECK: arith.ceildivsi +// CHECK: arith.subi +// CHECK: arith.ceildivsi +// CHECK: scf.for +// CHECK: arith.remsi +// CHECK: arith.divsi +// CHECK-NOT: scf.for +// CHECK: transform.named_sequence + diff --git a/mlir/test/Transforms/parallel-loop-collapsing.mlir b/mlir/test/Transforms/parallel-loop-collapsing.mlir index 660d7edb2fbb..d1c23d584f92 100644 --- a/mlir/test/Transforms/parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/parallel-loop-collapsing.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s -// CHECK-LABEL: func @parallel_many_dims() { +// CHECK: func @parallel_many_dims() { func.func @parallel_many_dims() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -28,19 +28,19 @@ func.func @parallel_many_dims() { return } -// CHECK-DAG: [[C12:%.*]] = arith.constant 12 : index -// CHECK-DAG: [[C10:%.*]] = arith.constant 10 : index -// CHECK-DAG: [[C9:%.*]] = arith.constant 9 : index -// CHECK-DAG: [[C6:%.*]] = arith.constant 6 : index -// CHECK-DAG: [[C4:%.*]] = arith.constant 4 : index -// CHECK-DAG: [[C3:%.*]] = arith.constant 3 : index -// CHECK-DAG: [[C2:%.*]] = arith.constant 2 : index -// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index -// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index -// CHECK: scf.parallel ([[NEW_I0:%.*]]) = ([[C0]]) to ([[C4]]) step ([[C1]]) { -// CHECK: [[V0:%.*]] = arith.remsi [[NEW_I0]], [[C2]] : index -// CHECK: [[I0:%.*]] = arith.divsi [[NEW_I0]], [[C2]] : index -// CHECK: [[V2:%.*]] = arith.muli [[V0]], [[C10]] : index -// CHECK: [[I3:%.*]] = arith.addi [[V2]], [[C9]] : index -// CHECK: "magic.op"([[I0]], [[C3]], [[C6]], [[I3]], [[C12]]) : (index, index, index, index, index) -> index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index +// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK: scf.parallel (%[[NEW_I0:.*]]) = (%[[C0]]) to (%[[C4]]) step (%[[C1]]) { +// CHECK: %[[V0:.*]] = arith.remsi %[[NEW_I0]], %[[C2]] : index +// CHECK: %[[I0:.*]] = arith.divsi %[[NEW_I0]], %[[C2]] : index +// CHECK: %[[V2:.*]] = arith.muli %[[V0]], %[[C10]] +// CHECK: %[[I3:.*]] = arith.addi %[[V2]], %[[C9]] +// CHECK: "magic.op"(%[[I0]], %[[C3]], %[[C6]], %[[I3]], 
%[[C12]]) : (index, index, index, index, index) -> index // CHECK: scf.reduce diff --git a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir index 542786b5fa5e..4eed61a65aa4 100644 --- a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir @@ -13,22 +13,22 @@ func.func @collapse_to_single() { return } -// CHECK-LABEL: func @collapse_to_single() { -// CHECK-DAG: [[C18:%.*]] = arith.constant 18 : index -// CHECK-DAG: [[C6:%.*]] = arith.constant 6 : index -// CHECK-DAG: [[C3:%.*]] = arith.constant 3 : index -// CHECK-DAG: [[C7:%.*]] = arith.constant 7 : index -// CHECK-DAG: [[C4:%.*]] = arith.constant 4 : index -// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index -// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index -// CHECK: scf.parallel ([[NEW_I:%.*]]) = ([[C0]]) to ([[C18]]) step ([[C1]]) { -// CHECK: [[I0_COUNT:%.*]] = arith.remsi [[NEW_I]], [[C6]] : index -// CHECK: [[I1_COUNT:%.*]] = arith.divsi [[NEW_I]], [[C6]] : index -// CHECK: [[V0:%.*]] = arith.muli [[I0_COUNT]], [[C4]] : index -// CHECK: [[I1:%.*]] = arith.addi [[V0]], [[C7]] : index -// CHECK: [[V1:%.*]] = arith.muli [[I1_COUNT]], [[C3]] : index -// CHECK: [[I0:%.*]] = arith.addi [[V1]], [[C3]] : index -// CHECK: "magic.op"([[I0]], [[I1]]) : (index, index) -> index +// CHECK: func @collapse_to_single() { +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index +// CHECK: scf.parallel (%[[NEW_I:.*]]) = (%[[C0]]) to (%[[C18]]) step (%[[C1]]) { +// CHECK: %[[I0_COUNT:.*]] = arith.remsi %[[NEW_I]], %[[C6]] : index +// CHECK: %[[I1_COUNT:.*]] = arith.divsi %[[NEW_I]], %[[C6]] : index +// CHECK: %[[V0:.*]] = arith.muli %[[I0_COUNT]], %[[C4]] +// CHECK: %[[I1:.*]] = arith.addi %[[V0]], %[[C7]] +// CHECK: %[[V1:.*]] = arith.muli %[[I1_COUNT]], %[[C3]] +// CHECK: %[[I0:.*]] = arith.addi %[[V1]], %[[C3]] +// CHECK: "magic.op"(%[[I0]], %[[I1]]) : (index, index) -> index // CHECK: scf.reduce // CHECK-NEXT: } // CHECK-NEXT: return
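
For reference, a minimal sketch of how the renamed, dialect-specific entry points introduced by this patch are driven from a pass, mirroring the LoopCoalescingPass change above. The helper name coalesceAllBands and the exact include list are illustrative only, not part of the patch.

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"

using namespace mlir;

// Coalesce every perfectly nested band rooted at an scf.for or affine.for in
// `func`, using the per-dialect entry points that replace the removed
// coalescePerfectlyNestedLoops<LoopOpTy> template. Failures are ignored, as in
// the pass above: a loop with no coalescable band is simply left untouched.
static void coalesceAllBands(func::FuncOp func) {
  func.walk([](Operation *op) {
    if (auto scfForOp = dyn_cast<scf::ForOp>(op))
      (void)coalescePerfectlyNestedSCFForLoops(scfForOp);
    else if (auto affineForOp = dyn_cast<affine::AffineForOp>(op))
      (void)affine::coalescePerfectlyNestedAffineLoops(affineForOp);
  });
}

A likely motivation for threading a RewriterBase through coalesceLoops and collapseParallelLoops is that drivers with rewrite listeners (for example the transform interpreter behind transform.loop.coalesce) can then observe the block inlining and op replacements, instead of having the IR mutated behind a plain OpBuilder.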