[mlir][Linalg] Drop usage of tileWithLinalgTilingOptions in the structured.tile transform
This is on a path to deprecation.
Context: https://discourse.llvm.org/t/psa-retire-tileandfuselinalgops-method/63850

As the interface-based transformation is more generic, some additional folding of AffineMin/MaxOp and some extra canonicalizations are needed. This can be further evolved.

Differential Revision: https://reviews.llvm.org/D137195
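For downstream users migrating off `tileWithLinalgTilingOptions`, the sketch below condenses the replacement pattern this patch applies in `ScalarizeOp::applyToOne` and `TileOp::apply` (see the hunks that follow): build `scf::SCFTilingOptions` instead of `LinalgTilingOptions`, tile through the `TilingInterface` entry point `tileUsingSCFForOp`, and consume `SCFTilingResult::tiledOp`/`loops`. It is an illustration assembled from the diff, not part of the commit; the wrapper name, includes, namespace qualification, and error handling are simplified.

// Hypothetical helper, assembled from the hunks below. Only the MLIR entry
// points that appear in the diff are taken from the commit; the wrapper
// itself is a sketch.
static FailureOr<scf::SCFTilingResult>
tileWithTilingInterface(RewriterBase &rewriter, TilingInterface op,
                        ArrayRef<int64_t> tileSizes,
                        ArrayRef<int64_t> interchange) {
  Operation *target = op.getOperation();

  // 1. Describe the tiling with scf::SCFTilingOptions instead of the
  //    deprecated LinalgTilingOptions. Tile sizes are produced lazily by a
  //    callback; a zero size means "do not tile this loop".
  scf::SCFTilingOptions tilingOptions;
  tilingOptions.setTileSizeComputationFunction(
      [sizes = llvm::to_vector(tileSizes)](OpBuilder &b, Operation *current) {
        SmallVector<Value> tileSizeValues;
        for (int64_t ts : sizes)
          tileSizeValues.push_back(
              b.create<arith::ConstantIndexOp>(current->getLoc(), ts));
        return tileSizeValues;
      });
  tilingOptions.setInterchange(interchange);

  // 2. Tile through the TilingInterface-based implementation.
  rewriter.setInsertionPoint(target);
  FailureOr<scf::SCFTilingResult> tilingResult =
      tileUsingSCFForOp(rewriter, op, tilingOptions);
  if (failed(tilingResult))
    return failure();

  // 3. The interface-based path leaves the original op in place, so the
  //    caller finishes the rewrite: erase it when it has no results (the
  //    diff checks hasBufferSemantics() on the LinalgOp), otherwise replace
  //    its results with those of the outermost generated loop.
  if (target->getNumResults() == 0)
    rewriter.eraseOp(target);
  else
    rewriter.replaceOp(target, tilingResult->loops.front()->getResults());
  return tilingResult;
}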
@@ -189,17 +189,15 @@ struct TiledLinalgOp {
FailureOr<TiledLinalgOp> tileLinalgOp(RewriterBase &b, LinalgOp op,
const LinalgTilingOptions &options);

/// Try to peel and canonicalize loop `op` and return the new result.
// TODO: Add support for scf.parallel and affine.for loops.
SmallVector<Value> peelLoop(RewriterBase &rewriter, Operation *op);
/// Peel and canonicalize 'loops'.
void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);

/// Peel the loops of a TiledLinalgOp.
void peelTiledLinalgOp(RewriterBase &rewriter, TiledLinalgOp &res,
ArrayRef<int64_t> peeledLoops,
LinalgTilingLoopType loopType);

/// Interchange the `iterator_types` and `iterator_maps` dimensions and adapts
/// the index accesses of `op`. This is an in-place transformation controlled by
/// `interchangeVector`. An empty vector is interpreted as the identity
/// the index accesses of `op`. This is an in-place transformation controlled
/// by `interchangeVector`. An empty vector is interpreted as the identity
/// permutation and the transformation returns early.
///
/// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with

@@ -232,10 +230,10 @@ using AllocBufferCallbackFn = std::function<Optional<Value>(
using DeallocBufferCallbackFn =
std::function<LogicalResult(OpBuilder &b, Value buffer)>;

/// Callback function type used to insert copy from original subview to subview
/// of the promoted region for the read operands/subview of promoted region to
/// original subview for the results. The copy has to happen from `src` to
/// `dst`.
/// Callback function type used to insert copy from original subview to
/// subview of the promoted region for the read operands/subview of promoted
/// region to original subview for the results. The copy has to happen from
/// `src` to `dst`.
using CopyCallbackFn =
std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;

@@ -247,12 +245,11 @@ struct LinalgPromotionOptions {
operandsToPromote->insert(operands.begin(), operands.end());
return *this;
}
/// If ith element of `useFullTiles` is true the full view should be used for
/// the promoted buffer of the ith operand in `operandsToPromote`. Otherwise
/// the partial view will be used.
/// The decision is defaulted to `useFullTileBuffersDefault` when
/// `useFullTileBuffers` is None and for operands missing from
/// `useFullTileBuffers`.
/// If ith element of `useFullTiles` is true the full view should be used
/// for the promoted buffer of the ith operand in `operandsToPromote`.
/// Otherwise the partial view will be used. The decision is defaulted to
/// `useFullTileBuffersDefault` when `useFullTileBuffers` is None and for
/// operands missing from `useFullTileBuffers`.
Optional<llvm::SmallBitVector> useFullTileBuffers = None;
LinalgPromotionOptions &setUseFullTileBuffers(ArrayRef<bool> useFullTiles) {
unsigned size = useFullTiles.size();

@@ -262,8 +259,8 @@ struct LinalgPromotionOptions {
useFullTileBuffers = tmp;
return *this;
}
/// If true all operands unspecified by `useFullTileBuffers` will use the full
/// view, otherwise the partial view.
/// If true all operands unspecified by `useFullTileBuffers` will use the
/// full view, otherwise the partial view.
bool useFullTileBuffersDefault = false;
LinalgPromotionOptions &setUseFullTileBuffersByDefault(bool use) {
useFullTileBuffersDefault = use;

@@ -306,10 +303,10 @@ struct LinalgPromotionOptions {
};

/// Create a new buffer using the `allocationFn` provided. The size of this
/// buffer is the smallest constant bounding size along each dimension that can
/// be computed for the size of the result of `subView`. Returns the allocated
/// buffer as `fullLocalView` and the view that matches the size of the result
/// of subview operation as `partialLocalView`.
/// buffer is the smallest constant bounding size along each dimension that
/// can be computed for the size of the result of `subView`. Returns the
/// allocated buffer as `fullLocalView` and the view that matches the size of
/// the result of subview operation as `partialLocalView`.
struct PromotionInfo {
Value fullLocalView;
Value partialLocalView;

@@ -321,7 +318,8 @@ promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,

/// Promote the `subViews` into a new buffer allocated at the insertion point
/// `b`. Promotion occurs in 3 steps:
/// 1. Create a new buffer for a full tile (i.e. not clipped at the boundary).
/// 1. Create a new buffer for a full tile (i.e. not clipped at the
/// boundary).
/// 2. Take a full view on the buffer.
/// 3. Take a partial slice of the full view in step 2. and copy into it.
///
@@ -369,12 +367,12 @@ using TileSizeComputationFunction =
/// Creates a number of ranges equal to the number of non-zero in `tileSizes`.
/// One for each loop of the LinalgOp that is tiled. The `tileSizes` argument
/// has one entry per surrounding loop. It uses zero as the convention that a
/// particular loop is not tiled. This convention simplifies implementations by
/// avoiding affine map manipulations.
/// The returned ranges correspond to the loop ranges, in the proper order, that
/// are tiled and for which new loops will be created. Also the function returns
/// a map from loop indices of the LinalgOp to the corresponding non-empty range
/// indices of newly created loops.
/// particular loop is not tiled. This convention simplifies implementations
/// by avoiding affine map manipulations. The returned ranges correspond to
/// the loop ranges, in the proper order, that are tiled and for which new
/// loops will be created. Also the function returns a map from loop indices
/// of the LinalgOp to the corresponding non-empty range indices of newly
/// created loops.
using LoopIndexToRangeIndexMap = DenseMap<int, int>;
std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map,

@@ -392,9 +390,9 @@ struct MultiSizeSpecification {
};

/// Emits the IR computing the multi-sized tiling specification with two tile
/// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such that
/// there exist numbers of tiles with these sizes that fully cover the given
/// iteration space `dimension` of the structured `op`.
/// sizes not exceeding `targetSize`, each divisible by `sizeDivisor`, such
/// that there exist numbers of tiles with these sizes that fully cover the
/// given iteration space `dimension` of the structured `op`.
///
/// The computation is as follows:
///
@@ -427,11 +425,10 @@ computeMultiTileSizes(OpBuilder &builder, LinalgOp op, unsigned dimension,
/// tiling by `numThreads`.
/// If non-empty, the `threadDimMapping` is added as an attribute to the
/// resulting `scf.foreach_thread`.
/// Zero tile sizes indicate that the dimension is not tiled, and can be thought
/// of as tiling by the full size of data.
/// It is the user's responsibility to ensure that `numThreads` is a
/// valid tiling specification (i.e. that only tiles parallel
/// dimensions, e.g. in the Linalg case).
/// Zero tile sizes indicate that the dimension is not tiled, and can be
/// thought of as tiling by the full size of data. It is the user's
/// responsibility to ensure that `numThreads` is a valid tiling specification
/// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case).
struct ForeachThreadTilingResult {
Operation *tileOp;
Operation *tiledOp;

@@ -448,10 +445,10 @@ tileToForeachThreadOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
ArrayRef<OpFoldResult> tileSizes,
ArrayRef<int64_t> threadDimMapping = {});

/// All indices returned by IndexOp should be invariant with respect to tiling.
/// Therefore, if an operation is tiled, we have to transform the indices
/// accordingly, i.e. offset them by the values of the corresponding induction
/// variables that are captured implicitly in the body of the op.
/// All indices returned by IndexOp should be invariant with respect to
/// tiling. Therefore, if an operation is tiled, we have to transform the
/// indices accordingly, i.e. offset them by the values of the corresponding
/// induction variables that are captured implicitly in the body of the op.
///
/// Example. `linalg.generic` before tiling:
///
@@ -491,8 +488,9 @@ tileToForeachThreadOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
/// %transformed_i = arith.addi %i, %k : index // index `i` is offset by
/// %k %transformed_j = arith.addi %j, %l : index // index `j` is offset
/// by %l
/// // Every use of %i, %j is replaced with %transformed_i, %transformed_j
/// <some operations that use %transformed_i, %transformed_j>
/// // Every use of %i, %j is replaced with %transformed_i,
/// %transformed_j <some operations that use %transformed_i,
/// %transformed_j>
/// }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided>
/// }
/// }

@@ -516,8 +514,8 @@ struct LinalgPaddingOptions {
paddingDimensions.assign(pd.begin(), pd.end());
return *this;
}
/// A flag for every operand to mark the PadOp as nofold which enables packing
/// for statically shaped operands.
/// A flag for every operand to mark the PadOp as nofold which enables
/// packing for statically shaped operands.
SmallVector<bool> packPaddings;
LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) {
packPaddings.assign(pp.begin(), pp.end());

@@ -529,8 +527,8 @@ struct LinalgPaddingOptions {
hoistPaddings.assign(hp.begin(), hp.end());
return *this;
}
/// A permutation vector for every operand used to transpose the packed PadOp
/// results.
/// A permutation vector for every operand used to transpose the packed
/// PadOp results.
SmallVector<SmallVector<int64_t>> transposePaddings;
LinalgPaddingOptions &
setTransposePaddings(ArrayRef<SmallVector<int64_t>> tp) {
@@ -629,20 +627,12 @@ struct LinalgTilingOptions {
}
};

/// Canonicalization patterns relevant to apply after tiling patterns. These are
/// applied automatically by the tiling pass but need to be applied manually
/// when tiling is called programmatically.
/// Canonicalization patterns relevant to apply after tiling patterns. These
/// are applied automatically by the tiling pass but need to be applied
/// manually when tiling is called programmatically.
RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx);
void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns);

/// Perform tiling using LinalgTilingOptions.
/// Note: this is on a path to deprecation that only works on LinalgOp.
/// Clients should favor using `tileUsingSCFForOp` that more generally works on
/// TilingInterface.
FailureOr<TiledLinalgOp>
tileWithLinalgTilingOptions(RewriterBase &rewriter, LinalgOp op,
const LinalgTilingOptions &options);

///
/// Linalg padding pattern.
///

@@ -713,14 +703,14 @@ struct DownscaleDepthwiseConv2DNhwcHwcOp final
/// Apply the `generalization` transformation as a pattern.
/// See `generalization` for more details.
//
// TODO: Automatic default pattern class that just unwraps a function returning
// FailureOr<GenericOp>.
// TODO: Automatic default pattern class that just unwraps a function
// returning FailureOr<GenericOp>.
struct LinalgGeneralizationPattern
: public OpInterfaceRewritePattern<LinalgOp> {
using OpInterfaceRewritePattern<LinalgOp>::OpInterfaceRewritePattern;

/// `matchAndRewrite` implementation that returns the significant transformed
/// pieces of IR.
/// `matchAndRewrite` implementation that returns the significant
/// transformed pieces of IR.
FailureOr<GenericOp>
returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const {
return generalizeNamedOp(rewriter, op);
@@ -765,8 +755,8 @@ void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
// Op-specific patterns.
//===----------------------------------------------------------------------===//

/// tensor::PadOp is not canonicalized away yet, so we provide a transformation
/// to `linalg.generic`.
/// tensor::PadOp is not canonicalized away yet, so we provide a
/// transformation to `linalg.generic`.
struct PadOpTransformationPattern : public OpRewritePattern<tensor::PadOp> {
using OpRewritePattern<tensor::PadOp>::OpRewritePattern;

@@ -774,12 +764,12 @@ struct PadOpTransformationPattern : public OpRewritePattern<tensor::PadOp> {
PatternRewriter &rewriter) const override;
};

/// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands to
/// a static bounding box. Use `paddingValues` and `packPaddings` to set padding
/// value and nofold attribute of the created tensor::PadOps, respectively.
/// Update `paddedOp` to the cloned operation with statically shaped
/// `paddingDimensions` and return the extracted dynamically shaped results.
/// If padding fails, return failure.
/// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands
/// to a static bounding box. Use `paddingValues` and `packPaddings` to set
/// padding value and nofold attribute of the created tensor::PadOps,
/// respectively. Update `paddedOp` to the cloned operation with statically
/// shaped `paddingDimensions` and return the extracted dynamically shaped
/// results. If padding fails, return failure.
FailureOr<SmallVector<Value>>
rewriteAsPaddedOp(OpBuilder &b, LinalgOp opToPad,
ArrayRef<int64_t> paddingDimensions,

@@ -866,7 +856,8 @@ struct LinalgCopyVTRForwardingPattern
/// vector.transfer_write %..., %out[...]
/// ```
/// Where there is no interleaved use between transfer_write and memref.copy.
/// This is a custom rewrite to forward partial writes to vector.transfer_write.
/// This is a custom rewrite to forward partial writes to
/// vector.transfer_write.
struct LinalgCopyVTWForwardingPattern
: public OpRewritePattern<vector::TransferWriteOp> {
using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
@@ -903,10 +894,11 @@ private:

/// Split Reduction options.
struct SplitReductionOptions {
// Ratio used to split the reduction dimension. If the ratio is <= 1, nothing
// will be done.
// Ratio used to split the reduction dimension. If the ratio is <= 1,
// nothing will be done.
int64_t ratio = 0;
// Index where the extra dimension is added to the intermediate tensor shape.
// Index where the extra dimension is added to the intermediate tensor
// shape.
unsigned index = 0;
// If the inner dimension after splitting is parallel or reduction.
bool innerParallel = false;

@@ -924,11 +916,10 @@ void populateSplitReductionPattern(
const ControlSplitReductionFn &controlSplitReductionFn,
bool useAlloc = false);

/// Apply transformation to split the single linalg op reduction into a parallel
/// and reduction dimension. Then create a new linalg.generic op doing the rest
/// of the reduction.
/// Return the new linalg op with an extra parallel dimension or failure if the
/// transformation didn't happen.
/// Apply transformation to split the single linalg op reduction into a
/// parallel and reduction dimension. Then create a new linalg.generic op
/// doing the rest of the reduction. Return the new linalg op with an extra
/// parallel dimension or failure if the transformation didn't happen.
///
/// Example:
/// ```

@@ -945,10 +936,10 @@ void populateSplitReductionPattern(
/// To:
/// ```
/// %cst = arith.constant 0.000000e+00 : f32
/// %0 = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
/// %1 = tensor.empty [4] : tensor<4xf32>
/// %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>
/// %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
/// %0 = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into
/// tensor<4x8xf32> %1 = tensor.empty [4] : tensor<4xf32> %2 = linalg.fill
/// ins(%cst : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32> %3 =
/// linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
/// affine_map<(d0, d1) -> (d0)>],
/// iterator_types = ["parallel", "reduction"]}
/// ins(%0 : tensor<4x8xf32>) outs(%2 : tensor<4xf32>) {
@@ -977,8 +968,8 @@ splitReduction(PatternRewriter &b, LinalgOp op,
bool useAlloc = false);

/// Scaling-based implementation of the split reduction transformation.
/// Instead of introducing an ExpandShapeOp, this rewrites a reduction dimension
/// `k` into `k * scale + kk`.
/// Instead of introducing an ExpandShapeOp, this rewrites a reduction
/// dimension `k` into `k * scale + kk`.
///
/// Example:
/// ```

@@ -1003,7 +994,8 @@ splitReduction(PatternRewriter &b, LinalgOp op,
///
/// %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
/// iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
/// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
/// ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>,
/// tensor<64x4xi1>)
/// outs(%1 : tensor<16x32x64xf32>) {
/// ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
/// %5 = arith.mulf %arg3, %arg4 : f32

@@ -885,19 +885,37 @@ DiagnosedSilenceableFailure
transform::ScalarizeOp::applyToOne(linalg::LinalgOp target,
SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
LinalgTilingOptions tilingOptions;
tilingOptions.scalarizeDynamicDims();
// Tiling with "scalarize_dyn_dims" actually sets the same lambda as the
// tile sizes and asserts that it is not already set.
scf::SCFTilingOptions tilingOptions;
tilingOptions.setTileSizeComputationFunction([&](OpBuilder &b, Operation *) {
SmallVector<Value, 4> tileSizes;
Location loc = target.getLoc();
SmallVector<OpFoldResult> allShapeSizes =
target.createFlatListOfOperandDims(b, loc);
AffineMap map = target.getShapesToLoopsMap();
if (!map)
return tileSizes;
IRRewriter rewriter(b);
SmallVector<OpFoldResult> shapeSizes =
makeComposedFoldedMultiResultAffineApply(rewriter, loc, map,
allShapeSizes);
// If the shape size is dynamic, tile by 1.
// Otherwise, do not tile (i.e. tile size 0).
for (OpFoldResult shapeSize : shapeSizes) {
tileSizes.push_back(getConstantIntValue(shapeSize)
? b.create<arith::ConstantIndexOp>(loc, 0)
: b.create<arith::ConstantIndexOp>(loc, 1));
}
return tileSizes;
});
SmallVector<int64_t> emptyTileSizes;
SimpleRewriter rewriter(getContext());
rewriter.setInsertionPoint(target);
FailureOr<TiledLinalgOp> result =
tileWithLinalgTilingOptions(rewriter, target, tilingOptions);
if (failed(result))
FailureOr<scf::SCFTilingResult> maybeTilingResult = tileUsingSCFForOp(
rewriter, cast<TilingInterface>(target.getOperation()), tilingOptions);
if (failed(maybeTilingResult))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));

results.push_back(result->op);
results.push_back(maybeTilingResult->tiledOp);
return DiagnosedSilenceableFailure(success());
}

@@ -1127,7 +1145,7 @@ transform::TileOp::apply(TransformResults &transformResults,
return diag;
}

LinalgTilingOptions tilingOptions;
scf::SCFTilingOptions tilingOptions;
unsigned index = en.index();
if (!tileSizes.empty()) {
tilingOptions.setTileSizeComputationFunction(

@@ -1148,15 +1166,22 @@ transform::TileOp::apply(TransformResults &transformResults,
});
}

tilingOptions.setInterchange(extractUIntArray(getInterchange()));
tilingOptions.setInterchange(extractI64Array(getInterchange()));
SimpleRewriter rewriter(linalgOp.getContext());
FailureOr<TiledLinalgOp> tiledOp =
tileWithLinalgTilingOptions(rewriter, linalgOp, tilingOptions);
if (failed(tiledOp))
FailureOr<scf::SCFTilingResult> maybeTilingResult = tileUsingSCFForOp(
rewriter, cast<TilingInterface>(linalgOp.getOperation()),
tilingOptions);
if (failed(maybeTilingResult))
return DiagnosedSilenceableFailure::definiteFailure();

tiled.push_back(tiledOp->op);
for (const auto &en2 : llvm::enumerate(tiledOp->loops))
if (linalgOp.hasBufferSemantics())
rewriter.eraseOp(linalgOp);
else
rewriter.replaceOp(linalgOp,
maybeTilingResult->loops.front()->getResults());

tiled.push_back(maybeTilingResult->tiledOp);
for (const auto &en2 : llvm::enumerate(maybeTilingResult->loops))
loops[en2.index()].push_back(en2.value());
}

@@ -64,34 +64,6 @@ mlir::linalg::LinalgTilingOptions::setTileSizes(ArrayRef<int64_t> ts) {
return *this;
}

LinalgTilingOptions &mlir::linalg::LinalgTilingOptions::scalarizeDynamicDims() {
assert(!tileSizeComputationFunction && "tile sizes already set");
tileSizeComputationFunction = [](OpBuilder &b, Operation *op) {
SmallVector<Value, 4> tileSizes;
auto linalgOp = dyn_cast<LinalgOp>(op);
if (!linalgOp)
return tileSizes;
Location loc = linalgOp.getLoc();
SmallVector<OpFoldResult> allShapeSizes =
linalgOp.createFlatListOfOperandDims(b, loc);
AffineMap map = linalgOp.getShapesToLoopsMap();
if (!map)
return tileSizes;
IRRewriter rewriter(b);
SmallVector<OpFoldResult> shapeSizes =
makeComposedFoldedMultiResultAffineApply(rewriter, loc, map,
allShapeSizes);
// If the shape size is dynamic, tile by 1. Otherwise, do not tile (tile
// size 0).
for (OpFoldResult shapeSize : shapeSizes)
tileSizes.push_back(getConstantIntValue(shapeSize)
? b.create<arith::ConstantIndexOp>(loc, 0)
: b.create<arith::ConstantIndexOp>(loc, 1));
return tileSizes;
};
return *this;
}

/// Pad the `opOperand` in the `paddingDimensions` using the padding value and
/// the nofold flag found in `paddingValues` and `packPaddings`, respectively.
/// Exit early and return the `opOperand` value if the shape dimensions that

@@ -246,7 +218,8 @@ linalg::rewriteAsPaddedOp(OpBuilder &b, LinalgOp opToPad,

/// Try to peel a loop `op` and return the new result.
// TODO: Add support for scf.parallel and affine.for loops.
static SmallVector<Value, 4> peelLoop(RewriterBase &rewriter, Operation *op) {
SmallVector<Value> mlir::linalg::peelLoop(RewriterBase &rewriter,
Operation *op) {
return llvm::TypeSwitch<Operation *, SmallVector<Value, 4>>(op)
.Case<scf::ForOp>([&](scf::ForOp forOp) {
scf::ForOp partialIteration;
@@ -262,47 +235,8 @@ static SmallVector<Value, 4> peelLoop(RewriterBase &rewriter, Operation *op) {
/// Peel and canonicalize 'loops'.
void mlir::linalg::peelLoops(RewriterBase &rewriter,
ArrayRef<scf::ForOp> loops) {
for (auto loopOp : loops) {
SmallVector<Value, 4> loopResults;
loopResults = peelLoop(rewriter, loopOp);
}
}

/// Peel loops after tiling.
void mlir::linalg::peelTiledLinalgOp(RewriterBase &rewriter, TiledLinalgOp &res,
ArrayRef<int64_t> peeledLoops,
LinalgTilingLoopType loopType) {
for (int64_t loop : peeledLoops) {
assert(loop < static_cast<int64_t>(res.loops.size()) &&
"requested peeling of non-existing loop");
SmallVector<Value, 4> loopResults;
Operation *loopOp = res.loops[loop];
loopResults = peelLoop(rewriter, loopOp);

// The result of the loop nest may change with peeling.
if (res.tensorResults.size() == loopOp->getNumResults() &&
std::equal(res.tensorResults.begin(), res.tensorResults.end(),
loopOp->getResults().begin()))
res.tensorResults = loopResults;
}
}

FailureOr<TiledLinalgOp>
mlir::linalg::tileWithLinalgTilingOptions(RewriterBase &rewriter, LinalgOp op,
const LinalgTilingOptions &options) {
FailureOr<TiledLinalgOp> res = tileLinalgOp(rewriter, op, options);
if (failed(res))
return failure();

// Peel the loops of the TiledLinalgOp.
peelTiledLinalgOp(rewriter, *res, options.peeledLoops, options.loopType);

if (res->tensorResults.empty())
rewriter.eraseOp(op);
else
rewriter.replaceOp(op, res->tensorResults);

return res;
for (auto loopOp : loops)
peelLoop(rewriter, loopOp);
}

/// Linalg padding pattern.

@@ -126,8 +126,8 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
bindSymbols(b.getContext(), s0, s1);
AffineMap minMap = AffineMap::get(1, 2, {s0, s1 - d0}, b.getContext());
Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size);
return b.create<AffineMinOp>(loc, minMap, ValueRange{iv, tileSize, size})
.getResult();
return makeComposedFoldedAffineMin(
b, loc, minMap, SmallVector<OpFoldResult>{iv, tileSize, size});
}

/// Generate an empty loop nest that represents the tiled loop nest shell.

@@ -1,4 +1,4 @@
// RUN: mlir-opt --test-transform-dialect-interpreter --canonicalize %s | FileCheck %s
// RUN: mlir-opt --test-transform-dialect-interpreter --scf-for-loop-canonicalization --canonicalize %s | FileCheck %s

// This implements a 2D multisize tiling with target sizes [3, 10].
transform.sequence failures(propagate) {

@@ -15,25 +15,28 @@ func.func @gemm(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>
// CHECK: scf.for
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[T7:.+]] = memref.subview %[[ARG0]]
// CHECK: %[[T12:.+]] = memref.subview %[[ARG1]]
// CHECK: %[[T17:.+]] = memref.subview %[[ARG2]]
// CHECK: %[[A0:.*]] = memref.alloc() : memref<1024xi8>
// CHECK: %[[V0:.*]] = memref.view %[[A0]][%[[C0]]][] : memref<1024xi8> to memref<16x16xf32>
// CHECK: %[[T19:.+]] = memref.subview %[[V0]]
// CHECK: %[[A1:.*]] = memref.alloc() : memref<1024xi8>
// CHECK: %[[V1:.*]] = memref.view %[[A1]][%[[C0]]][] : memref<1024xi8> to memref<16x16xf32>
// CHECK: %[[T21:.+]] = memref.subview %[[V1]]
// CHECK: memref.copy %[[T7]], %[[T19]]
// CHECK: memref.copy %[[T17]], %[[T21]]
// CHECK: linalg.matmul ins(%[[T19]], %[[T12]]{{.*}} outs(%[[T21]]
// CHECK: memref.copy %[[T21]], %[[T17]]
// CHECK: memref.dealloc %[[A0]]
// CHECK: memref.dealloc %[[A1]]
// CHECK: %[[svA:.+]] = memref.subview %[[ARG0]]
// CHECK: %[[svB:.+]] = memref.subview %[[ARG1]]
// CHECK: %[[svC:.+]] = memref.subview %[[ARG2]]

// CHECK: %[[tmpA:.*]] = memref.alloc() : memref<1024xi8>
// CHECK: %[[VA:.*]] = memref.view %[[tmpA]][%[[C0]]][] : memref<1024xi8> to memref<16x16xf32>
// CHECK: %[[svAA:.+]] = memref.subview %[[VA]]

// CHECK: %[[tmpC:.*]] = memref.alloc() : memref<1024xi8>
// CHECK: %[[VC:.*]] = memref.view %[[tmpC]][%[[C0]]][] : memref<1024xi8> to memref<16x16xf32>
// CHECK: %[[svCC:.+]] = memref.subview %[[VC]]

// CHECK: memref.copy %[[svA]], %[[svAA]]
// CHECK: memref.copy %[[svC]], %[[svCC]]
// CHECK: linalg.matmul ins(%[[VA]], %[[svB]]{{.*}} outs(%[[VC]]
// CHECK: memref.copy %[[svCC]], %[[svC]]
// CHECK: memref.dealloc %[[tmpA]]
// CHECK: memref.dealloc %[[tmpC]]

transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
%1, %loops:3 = transform.structured.tile %0 [16, 16, 16]
%2 = transform.structured.promote %1 { operands_to_promote = [0, 2], force_full_tiles = [false, false] }
%2 = transform.structured.promote %1 { operands_to_promote = [0, 2], force_full_tiles = [false, false], use_full_tiles_by_default }
}

@@ -1,9 +1,8 @@
// RUN: mlir-opt %s -test-transform-dialect-interpreter -canonicalize | FileCheck %s

// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s1 + 1)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s1 + 2)>
// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 3)>
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 3)>
// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 + s0 - 1)>

func.func @conv(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>, %arg2 : memref<?x?xf32>) {
linalg.conv_2d ins(%arg0, %arg1 : memref<?x?xf32>, memref<?x?xf32>) outs(%arg2 : memref<?x?xf32>)

@@ -24,18 +23,19 @@ transform.sequence failures(propagate) {
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[T0:.*]] = memref.dim %[[ARG1]], %[[C0]]
// CHECK-DAG: %[[T1:.*]] = memref.dim %[[ARG1]], %[[C1]]
// CHECK-DAG: %[[T2:.*]] = memref.dim %[[ARG2]], %[[C0]]
// CHECK-DAG: %[[T3:.*]] = memref.dim %[[ARG2]], %[[C1]]
// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[T2]] step %[[C2]]
// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[T3]] step %[[C3]]
// CHECK: %[[T4:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T2]], %[[T0]]]
// CHECK: %[[T5:.*]] = affine.min #[[MAP1]](%[[ARG4]])[%[[T3]], %[[T1]]]
// CHECK: %[[T6:.*]] = affine.min #[[MAP2]](%[[ARG3]])[%[[T2]]
// CHECK: %[[T7:.*]] = affine.min #[[MAP3]](%[[ARG4]])[%[[T3]]]
// CHECK: %[[SV1:.*]] = memref.subview %[[ARG0]][%[[ARG3]], %[[ARG4]]] [%[[T4]], %[[T5]]]
// CHECK: %[[SV2:.*]] = memref.subview %[[ARG2]][%[[ARG3]], %[[ARG4]]] [%[[T6]], %[[T7]]]
// CHECK-DAG: %[[KH:.*]] = memref.dim %[[ARG1]], %[[C0]]
// CHECK-DAG: %[[KW:.*]] = memref.dim %[[ARG1]], %[[C1]]
// CHECK-DAG: %[[H:.*]] = memref.dim %[[ARG2]], %[[C0]]
// CHECK-DAG: %[[W:.*]] = memref.dim %[[ARG2]], %[[C1]]
// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[H]] step %[[C2]]
// CHECK: %[[T4:.*]] = affine.min #[[MAP0]](%[[I]])[%[[H]]]
// CHECK: scf.for %[[J:.*]] = %[[C0]] to %[[W]] step %[[C3]]
// CHECK-DAG: %[[T5:.*]] = affine.min #[[MAP1]](%[[J]])[%[[W]]]
// CHECK-DAG: %[[T6:.*]] = affine.apply #[[MAP2]](%[[T4]])[%[[KH]]]
// CHECK-DAG: %[[T7:.*]] = affine.apply #[[MAP2]](%[[T5]])[%[[KW]]]
// CHECK-DAG: %[[SVIN:.*]] = memref.subview %[[ARG0]][%[[I]], %[[J]]] [%[[T6]], %[[T7]]]
// CHECK-DAG: %[[SVKER:.*]] = memref.subview %[[ARG1]][0, 0] [%[[KH]], %[[KW]]]
// CHECK-DAG: %[[SVOUT:.*]] = memref.subview %[[ARG2]][%[[I]], %[[J]]] [%[[T4]], %[[T5]]]
// CHECK: linalg.conv_2d
// CHECK-SAME: ins(%[[SV1]], %[[ARG1]]
// CHECK-SAME: outs(%[[SV2]]
// CHECK-SAME: ins(%[[SVIN]], %[[SVKER]]
// CHECK-SAME: outs(%[[SVOUT]]

@@ -14,7 +14,7 @@ func.func @indexed_vector(%arg0: memref<50xindex>) {
transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1
%1, %loop:2 = transform.structured.tile %0 [10, 25]
%1, %loop = transform.structured.tile %0 [10]
}

// TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)>

@@ -88,7 +88,7 @@ transform.sequence failures(propagate) {

// -----

// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>

// CHECK: fold_extract_slice
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x128xf32>

@@ -106,10 +106,10 @@ func.func @fold_extract_slice(
// CHECK: %[[E:.*]] = tensor.extract_slice %[[ARG0]][3, 4] [%[[DIM]], 42] [1, 1] : tensor<?x128xf32> to tensor<?x42xf32>

// CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
// CHECK: %[[SIZE0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[DIM]]
// CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =

// Fold the existing extract slice op into the one created by the tiling.
// CHECK: %[[SIZE0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[DIM]]
// CHECK: %[[T0:.*]] = tensor.extract_slice %[[E]]
// CHECK-SAME: %[[IV0]], %[[IV1]]
// CHECK-SAME: %[[SIZE0]], 3

@@ -1,4 +1,4 @@
// RUN: mlir-opt %s --test-transform-dialect-interpreter --split-input-file | FileCheck %s
// RUN: mlir-opt %s --test-transform-dialect-interpreter --split-input-file -canonicalize | FileCheck %s

// CHECK-LABEL: func.func @fuse_unary
func.func @fuse_unary(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {

@@ -61,15 +61,12 @@ func.func @interchange_reduction(%input: tensor<12x7x25xf32>) -> tensor<12x25xf3
// CHECK-DAG: %[[INIT:.+]] = tensor.empty()
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
// CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK: %[[RES:.*]] = scf.for %[[IV0:.+]] = %{{.+}} to %{{.+}} step %[[C5]] iter_args(%[[FOR_ARG0:.+]] = %[[INIT]])
// CHECK: scf.for %[[IV1:.+]] = %{{.+}} to %{{.+}} step %[[C7]] iter_args(%[[FOR_ARG1:.+]] = %[[FOR_ARG0]])
// CHECK: %[[OUT_SLICE0:.+]] = tensor.extract_slice %[[INPUT]][%[[IV0]], 0, %[[IV1]]]
// CHECK: %[[OUT_SLICE1:.+]] = tensor.extract_slice %[[FOR_ARG1]][%[[IV0]], %[[IV1]]]
// CHECK: %[[FILL:.+]] = linalg.fill {{.+}} outs(%[[OUT_SLICE1]] : tensor<?x?xf32>)
//
// Extra 4 constant is introduced, discard it.
// CHECK: arith.constant 4 : index
// CHECK: %[[C4:.+]] = arith.constant 4 : index
// CHECK: scf.for %[[IV2:.+]] = %{{.+}} to %{{.+}} step %[[C4]] iter_args(%[[FOR_ARG2:.+]] = %[[FILL]])
// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[OUT_SLICE0]]
// CHECK: %[[OUT_SLICE2:.+]] = tensor.extract_slice %[[FOR_ARG2]][0, 0]

@@ -364,7 +364,7 @@ func.func @matmul_sequence_fusion(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>
return %2 : tensor<?x?xf32>
}

// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
// CHECK: func @matmul_sequence_fusion(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>

@@ -384,7 +384,7 @@ func.func @matmul_sequence_fusion(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>
// CHECK-DAG: %[[N3:.+]] = tensor.dim %[[ARG5]], %[[C1]]
// CHECK: %[[R0:.+]] = scf.for %[[IV:[a-zA-Z0-9_]+]] =
// CHECK-SAME: iter_args(%[[ARG8:.+]] = %[[ARG6]]) -> (tensor<?x?xf32>) {
// CHECK-DAG: %[[TILE_M:.+]] = affine.min #[[MAP]](%[[IV]])[%{{.+}}, %[[M]]]
// CHECK-DAG: %[[TILE_M:.+]] = affine.min #[[MAP]](%[[IV]])[%[[M]]]
// CHECK-DAG: %[[SLICE_ARG0:.+]] = tensor.extract_slice %[[ARG0]][%[[IV]], 0] [%[[TILE_M]], %[[N0]]]
// CHECK-DAG: %[[SLICE_ARG1:.+]] = tensor.extract_slice %[[ARG1]][0, 0] [%[[N0]], %[[N1]]]
// CHECK-DAG: %[[SLICE_ARG2:.+]] = tensor.extract_slice %[[ARG2]][%[[IV]], 0] [%[[TILE_M]], %[[N1]]]

@@ -7,9 +7,9 @@ func.func @simple_matmul(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
// CHECK: func.func @simple_matmul(
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
// CHECK-LABEL: func.func @simple_matmul(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>

@@ -22,10 +22,10 @@ func.func @simple_matmul(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
// CHECK: %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[ARG2]])
// CHECK: %[[TS_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[M]]]
// CHECK: %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]]
// CHECK: %[[INNER:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
// CHECK: %[[TS_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[N]]]
// CHECK: %[[TS_X:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[N]]]
// CHECK-DAG: %[[LHS_TILE:.+]] = tensor.extract_slice %[[ARG0]]
// CHECK-SAME: [%[[IV0]], 0] [%[[TS_Y]], %[[K]]] [1, 1]
// CHECK-DAG: %[[RHS_TILE:.+]] = tensor.extract_slice %[[ARG1]]
@@ -50,10 +50,10 @@ func.func @simple_matmul_memref(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>
outs(%arg2 : memref<?x?xf32>)
return
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
// CHECK: func.func @simple_matmul_memref(
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
// CHECK-LABEL: func.func @simple_matmul_memref(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<?x?xf32>

@@ -66,11 +66,11 @@ func.func @simple_matmul_memref(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>
// CHECK-DAG: %[[K:.+]] = memref.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[N:.+]] = memref.dim %[[ARG1]], %[[C1]]
// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
// CHECK: %[[TS_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[M]]]
// CHECK: %[[TS_M:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]]
// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
// CHECK: %[[TS_N:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[N]]]
// CHECK: %[[TS_N:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[N]]]
// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]]
// CHECK: %[[TS_K:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[C30]], %[[K]]]
// CHECK: %[[TS_K:.+]] = affine.min #[[$MAP2]](%[[IV2]])[%[[K]]]
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[ARG0]]
// CHECK-SAME: [%[[IV0]], %[[IV2]]] [%[[TS_M]], %[[TS_K]]] [1, 1]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[ARG1]]
@@ -100,8 +100,8 @@ func.func @multi_result(%arg0 : tensor<128x200x300xf32>) -> (tensor<128x300x200x
} -> (tensor<128x300x200xf32>, tensor<300x128x200xf32>)
return %0#0, %0#1 : tensor<128x300x200xf32>, tensor<300x128x200xf32>
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
// CHECK: func.func @multi_result(
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)>
// CHECK-LABEL: func.func @multi_result(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>)
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index

@@ -112,7 +112,7 @@ func.func @multi_result(%arg0 : tensor<128x200x300xf32>) -> (tensor<128x300x200x
// CHECK-DAG: %[[INIT1:.+]] = tensor.empty()
// CHECK: %[[OUTER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C128]] step %[[C10]]
// CHECK-SAME: iter_args(%[[ARG1:[a-zA-Z0-9]+]] = %[[INIT0]], %[[ARG2:[a-zA-Z0-9]+]] = %[[INIT1]])
// CHECK: %[[TS_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[C128]]]
// CHECK: %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])
// CHECK: %[[INNER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[C300]] step %[[C20]]
// CHECK-SAME: iter_args(%[[ARG3:[a-zA-Z0-9]+]] = %[[ARG1]], %[[ARG4:[a-zA-Z0-9]+]] = %[[ARG2]])
// CHECK-DAG: %[[ARG_TILE:.+]] = tensor.extract_slice %[[ARG0]]
@@ -144,12 +144,12 @@ func.func @conv2D(%arg0 : tensor<?x?x?x?xf32>, %arg1 : tensor<?x?x?x?xf32>,
outs(%arg2 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
return %0 : tensor<?x?x?x?xf32>
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)>
// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)>
// CHECK: func.func @conv2D(
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)>
// CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)>
// CHECK-LABEL: func.func @conv2D(
// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
// CHECK-SAME: %[[FILTER:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>

@@ -169,15 +169,15 @@ func.func @conv2D(%arg0 : tensor<?x?x?x?xf32>, %arg1 : tensor<?x?x?x?xf32>,
// CHECK-DAG: %[[S:.+]] = tensor.dim %[[INIT]], %[[C2]]
// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[P]] step %[[C10]]
// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[INIT]])
// CHECK: %[[TS_P:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[P]]]
// CHECK: %[[TS_P:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[P]]]
// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[Q]] step %[[C20]]
// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
// CHECK: %[[TS_Q:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[Q]]]
// CHECK: %[[TS_Q:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[Q]]]
// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[C]] step %[[C30]]
// CHECK-SAME: iter_args(%[[INIT2:.+]] = %[[INIT1]])
// CHECK-DAG: %[[TS_C:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[C30]], %[[C]]]
// CHECK-DAG: %[[TS_H:.+]] = affine.apply #[[MAP3]](%[[TS_P]])[%[[R]]]
// CHECK-DAG: %[[TS_W:.+]] = affine.apply #[[MAP4]](%[[TS_Q]])[%[[S]]]
// CHECK-DAG: %[[TS_C:.+]] = affine.min #[[$MAP2]](%[[IV2]])[%[[C]]]
// CHECK-DAG: %[[TS_H:.+]] = affine.apply #[[$MAP3]](%[[TS_P]])[%[[R]]]
// CHECK-DAG: %[[TS_W:.+]] = affine.apply #[[$MAP4]](%[[TS_Q]])[%[[S]]]
// CHECK-DAG: %[[INPUT_TILE:.+]] = tensor.extract_slice %[[INPUT]]
// CHECK-SAME: [0, %[[IV0]], %[[IV1]], %[[IV2]]] [%[[N]], %[[TS_H]], %[[TS_W]], %[[TS_C]]]
// CHECK-DAG: %[[FILTER_TILE:.+]] = tensor.extract_slice %[[FILTER]]
@@ -234,10 +234,10 @@ func.func @interchange_matmul(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
// CHECK: func.func @interchange_matmul(
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
// CHECK-LABEL: func.func @interchange_matmul(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>

@@ -251,13 +251,13 @@ func.func @interchange_matmul(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
// CHECK: %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[ARG2]])
// CHECK: %[[TS_N:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C20]], %[[N]]]
// CHECK: %[[TS_N:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[N]]]
// CHECK: %[[INNER1:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]]
// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
// CHECK: %[[TS_K:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C30]], %[[K]]]
// CHECK: %[[TS_K:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[K]]]
// CHECK: %[[INNER2:[a-zA-Z0-9]+]] = scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
// CHECK-SAME: iter_args(%[[INIT2:.+]] = %[[INIT1]])
// CHECK-DAG: %[[TS_M:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[C10]], %[[M]]]
// CHECK-DAG: %[[TS_M:.+]] = affine.min #[[$MAP2]](%[[IV2]])[%[[M]]]
// CHECK-DAG: %[[LHS_TILE:.+]] = tensor.extract_slice %[[ARG0]]
// CHECK-SAME: [%[[IV2]], %[[IV1]]] [%[[TS_M]], %[[TS_K]]] [1, 1]
// CHECK-DAG: %[[RHS_TILE:.+]] = tensor.extract_slice %[[ARG1]]