Support VectorTransfer splitting on writes also.

VectorTransfer splitting previously handled only read transfer ops. This adds
the same logic for write ops. The resulting code uses two conditionals for
write ops, whereas read ops needed only one, but the created ops are built from
the same patterns, so pattern matching and expectations stay consistent apart
from the if/else ops.

Differential Revision: https://reviews.llvm.org/D102157
Tres Popp, 2021-05-07 16:19:22 +02:00
commit 88a48999d2 (parent 65e40f0b26)
2 changed files with 402 additions and 34 deletions
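
For context, here is a minimal sketch of how this split utility can be driven from C++. It is not part of this commit; the header paths, namespaces, and option type are assumptions based on the MLIR tree around this change, while `options.vectorTransferSplit`, the `VectorTransferSplit` values, and the entry point name come from the diff below.

```
// Hypothetical driver (not part of this commit): split one transfer op.
// Header paths and the options type are assumptions.
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"

using namespace mlir;

static LogicalResult splitOneTransfer(OpBuilder &b,
                                      VectorTransferOpInterface xferOp) {
  // VectorTransfer selects the vector.transfer slow path; LinalgCopy would
  // select the linalg.copy slow path exercised by the LINALG test prefix.
  vector::VectorTransformsOptions options;
  options.vectorTransferSplit = vector::VectorTransferSplit::VectorTransfer;

  // For reads, `ifOp` receives the created scf.if. With this commit, writes
  // are split as well: two scf.if ops are created and the original op erased.
  scf::IfOp ifOp;
  return vector::splitFullAndPartialTransfer(b, xferOp, options, &ifOp);
}
```

A pattern or pass would call something like this on each unmasked vector.transfer_read/write whose preconditions hold.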

View File

@@ -2379,6 +2379,7 @@ static Value createScopedSubViewIntersection(VectorTransferOpInterface xferOp,
xferOp.indices().take_front(xferOp.getLeadingShapedRank());
SmallVector<OpFoldResult, 4> sizes;
sizes.append(leadingIndices.begin(), leadingIndices.end());
auto isaWrite = isa<vector::TransferWriteOp>(xferOp);
xferOp.zipResultAndIndexing([&](int64_t resultIdx, int64_t indicesIdx) {
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
Value dimMemRef = memref_dim(xferOp.source(), indicesIdx);
@@ -2397,7 +2398,7 @@ static Value createScopedSubViewIntersection(VectorTransferOpInterface xferOp,
SmallVector<OpFoldResult, 4> indices = llvm::to_vector<4>(llvm::map_range(
xferOp.indices(), [](Value idx) -> OpFoldResult { return idx; }));
return memref_sub_view(
xferOp.source(), indices, sizes,
isaWrite ? alloc : xferOp.source(), indices, sizes,
SmallVector<OpFoldResult>(memrefRank, OpBuilder(xferOp).getIndexAttr(1)));
}
@@ -2509,14 +2510,119 @@ static scf::IfOp createScopedFullPartialVectorTransferRead(
return fullPartialIfOp;
}
/// Given an `xferOp` for which:
/// 1. `inBoundsCond` and a `compatibleMemRefType` have been computed.
/// 2. a memref of a single vector `alloc` has been allocated.
/// Produce IR resembling:
/// ```
/// %1:3 = scf.if (%inBounds) {
/// memref.cast %A: memref<A...> to compatibleMemRefType
/// scf.yield %view, ... : compatibleMemRefType, index, index
/// } else {
/// %3 = vector.type_cast %extra_alloc :
/// memref<...> to memref<vector<...>>
/// %4 = memref.cast %alloc: memref<B...> to compatibleMemRefType
/// scf.yield %4, ... : compatibleMemRefType, index, index
/// }
/// ```
static ValueRange getLocationToWriteFullVec(vector::TransferWriteOp xferOp,
TypeRange returnTypes,
Value inBoundsCond,
MemRefType compatibleMemRefType,
Value alloc) {
using namespace edsc;
using namespace edsc::intrinsics;
Value zero = std_constant_index(0);
Value memref = xferOp.source();
return conditionBuilder(
returnTypes, inBoundsCond,
[&]() -> scf::ValueVector {
Value res = memref;
if (compatibleMemRefType != xferOp.getShapedType())
res = memref_cast(memref, compatibleMemRefType);
scf::ValueVector viewAndIndices{res};
viewAndIndices.insert(viewAndIndices.end(), xferOp.indices().begin(),
xferOp.indices().end());
return viewAndIndices;
},
[&]() -> scf::ValueVector {
Value casted = memref_cast(alloc, compatibleMemRefType);
scf::ValueVector viewAndIndices{casted};
viewAndIndices.insert(viewAndIndices.end(), xferOp.getTransferRank(),
zero);
return viewAndIndices;
});
}
/// Given an `xferOp` for which:
/// 1. `inBoundsCond` has been computed.
/// 2. a memref of a single vector `alloc` has been allocated.
/// 3. it originally wrote to %view
/// Produce IR resembling:
/// ```
/// %notInBounds = xor %inBounds, %true
/// scf.if (%notInBounds) {
/// %3 = subview %alloc [...][...][...]
/// linalg.copy(%3, %view)
/// }
/// ```
static void createScopedFullPartialLinalgCopy(vector::TransferWriteOp xferOp,
Value inBoundsCond, Value alloc) {
using namespace edsc;
using namespace edsc::intrinsics;
auto &b = ScopedContext::getBuilderRef();
auto notInBounds = b.create<XOrOp>(
xferOp->getLoc(), inBoundsCond,
b.create<::mlir::ConstantIntOp>(xferOp.getLoc(), true, 1));
conditionBuilder(notInBounds, [&]() {
Value memRefSubView = createScopedSubViewIntersection(
cast<VectorTransferOpInterface>(xferOp.getOperation()), alloc);
linalg_copy(memRefSubView, xferOp.source());
});
}
/// Given an `xferOp` for which:
/// 1. `inBoundsCond` has been computed.
/// 2. a memref of a single vector `alloc` has been allocated.
/// 3. it originally wrote to %view
/// Produce IR resembling:
/// ```
/// %notInBounds = xor %inBounds, %true
/// scf.if (%notInBounds) {
/// %2 = load %alloc : memref<vector<...>>
/// vector.transfer_write %2, %view[...] : memref<A...>, vector<...>
/// }
/// ```
static void
createScopedFullPartialVectorTransferWrite(vector::TransferWriteOp xferOp,
Value inBoundsCond, Value alloc) {
using namespace edsc;
using namespace edsc::intrinsics;
auto &b = ScopedContext::getBuilderRef();
auto notInBounds = b.create<XOrOp>(
xferOp->getLoc(), inBoundsCond,
b.create<::mlir::ConstantIntOp>(xferOp.getLoc(), true, 1));
conditionBuilder(notInBounds, [&]() {
BlockAndValueMapping mapping;
Value load = memref_load(vector_type_cast(
MemRefType::get({}, xferOp.vector().getType()), alloc));
mapping.map(xferOp.vector(), load);
b.clone(*xferOp.getOperation(), mapping);
});
}
/// Split a vector.transfer operation into an in-bounds (i.e., no out-of-bounds
/// masking) fastpath and a slowpath.
///
/// For vector.transfer_read:
/// If `ifOp` is not null and the result is `success`, the `ifOp` points to the
/// newly created conditional upon function return.
/// To accommodate the fact that the original vector.transfer indexing may be
/// arbitrary and the slow path indexes @[0...0] in the temporary buffer, the
/// scf.if op returns a view and values of type index.
/// At this time, only vector.transfer_read case is implemented.
///
/// Example (a 2-D vector.transfer_read):
/// ```
@@ -2537,6 +2643,32 @@ static scf::IfOp createScopedFullPartialVectorTransferRead(
/// ```
/// where `alloc` is a buffer of one vector alloca'ed at the top of the function.
///
/// For vector.transfer_write:
/// There are two conditional blocks. First, a block that decides which memref
/// and indices to use for an unmasked, in-bounds write. Then, a conditional
/// block that copies the partial buffer back into the final result on the slow
/// path.
///
/// Example (a 2-D vector.transfer_write):
/// ```
/// vector.transfer_write %arg, %0[...], %pad : memref<A...>, vector<...>
/// ```
/// is transformed into:
/// ```
/// %1:3 = scf.if (%inBounds) {
/// memref.cast %A: memref<A...> to compatibleMemRefType
/// scf.yield %view : compatibleMemRefType, index, index
/// } else {
/// memref.cast %alloc: memref<B...> to compatibleMemRefType
/// scf.yield %4 : compatibleMemRefType, index, index
/// }
/// %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true ...
/// true]}
/// scf.if (%notInBounds) {
/// // slowpath: not in-bounds vector.transfer or linalg.copy.
/// }
/// ```
/// where `alloc` is a buffer of one vector alloca'ed at the top of the function.
///
/// Preconditions:
/// 1. `xferOp.permutation_map()` must be a minor identity map
/// 2. the rank of the `xferOp.source()` and the rank of the `xferOp.vector()`
@@ -2554,27 +2686,29 @@ LogicalResult mlir::vector::splitFullAndPartialTransfer(
SmallVector<bool, 4> bools(xferOp.getTransferRank(), true);
auto inBoundsAttr = b.getBoolArrayAttr(bools);
if (options.vectorTransferSplit == VectorTransferSplit::ForceInBounds) {
xferOp->setAttr(vector::TransferReadOp::getInBoundsAttrName(),
inBoundsAttr);
xferOp->setAttr(xferOp.getInBoundsAttrName(), inBoundsAttr);
return success();
}
assert(succeeded(splitFullAndPartialTransferPrecondition(xferOp)) &&
"Expected splitFullAndPartialTransferPrecondition to hold");
auto xferReadOp = dyn_cast<vector::TransferReadOp>(xferOp.getOperation());
// Assert preconditions. Additionally, keep the variables in an inner scope to
// ensure they aren't used in the wrong scopes further down.
{
assert(succeeded(splitFullAndPartialTransferPrecondition(xferOp)) &&
"Expected splitFullAndPartialTransferPrecondition to hold");
// TODO: add support for write case.
if (!xferReadOp)
return failure();
auto xferReadOp = dyn_cast<vector::TransferReadOp>(xferOp.getOperation());
auto xferWriteOp = dyn_cast<vector::TransferWriteOp>(xferOp.getOperation());
if (xferReadOp.mask())
return failure();
if (!(xferReadOp || xferWriteOp))
return failure();
if (xferWriteOp && xferWriteOp.mask())
return failure();
if (xferReadOp && xferReadOp.mask())
return failure();
}
OpBuilder::InsertionGuard guard(b);
if (Operation *sourceOp = xferOp.source().getDefiningOp())
b.setInsertionPointAfter(sourceOp);
else
b.setInsertionPoint(xferOp);
b.setInsertionPoint(xferOp);
ScopedContext scope(b, xferOp.getLoc());
Value inBoundsCond = createScopedInBoundsCond(
cast<VectorTransferOpInterface>(xferOp.getOperation()));
@@ -2596,26 +2730,57 @@ LogicalResult mlir::vector::splitFullAndPartialTransfer(
MemRefType compatibleMemRefType =
getCastCompatibleMemRefType(xferOp.getShapedType().cast<MemRefType>(),
alloc.getType().cast<MemRefType>());
// Read case: full fill + partial copy -> in-bounds vector.xfer_read.
SmallVector<Type, 4> returnTypes(1 + xferOp.getTransferRank(),
b.getIndexType());
returnTypes[0] = compatibleMemRefType;
scf::IfOp fullPartialIfOp =
options.vectorTransferSplit == VectorTransferSplit::VectorTransfer
? createScopedFullPartialVectorTransferRead(
xferReadOp, returnTypes, inBoundsCond, compatibleMemRefType,
alloc)
: createScopedFullPartialLinalgCopy(xferReadOp, returnTypes,
inBoundsCond,
compatibleMemRefType, alloc);
if (ifOp)
*ifOp = fullPartialIfOp;
// Set existing read op to in-bounds; it always reads from a full buffer.
for (unsigned i = 0, e = returnTypes.size(); i != e; ++i)
xferReadOp.setOperand(i, fullPartialIfOp.getResult(i));
xferOp->setAttr(vector::TransferReadOp::getInBoundsAttrName(), inBoundsAttr);
if (auto xferReadOp =
dyn_cast<vector::TransferReadOp>(xferOp.getOperation())) {
// Read case: full fill + partial copy -> in-bounds vector.xfer_read.
scf::IfOp fullPartialIfOp =
options.vectorTransferSplit == VectorTransferSplit::VectorTransfer
? createScopedFullPartialVectorTransferRead(
xferReadOp, returnTypes, inBoundsCond, compatibleMemRefType,
alloc)
: createScopedFullPartialLinalgCopy(xferReadOp, returnTypes,
inBoundsCond,
compatibleMemRefType, alloc);
if (ifOp)
*ifOp = fullPartialIfOp;
// Set existing read op to in-bounds; it always reads from a full buffer.
for (unsigned i = 0, e = returnTypes.size(); i != e; ++i)
xferReadOp.setOperand(i, fullPartialIfOp.getResult(i));
xferOp->setAttr(xferOp.getInBoundsAttrName(), inBoundsAttr);
return success();
}
auto xferWriteOp = cast<vector::TransferWriteOp>(xferOp.getOperation());
// Decide which location to write the entire vector to.
auto memrefAndIndices = getLocationToWriteFullVec(
xferWriteOp, returnTypes, inBoundsCond, compatibleMemRefType, alloc);
// Do an in-bounds write to either the output or the extra allocated buffer.
// The operation is cloned to prevent deleting information needed for the
// later IR creation.
BlockAndValueMapping mapping;
mapping.map(xferWriteOp.source(), memrefAndIndices.front());
mapping.map(xferWriteOp.indices(), memrefAndIndices.drop_front());
auto *clone = b.clone(*xferWriteOp, mapping);
clone->setAttr(xferWriteOp.getInBoundsAttrName(), inBoundsAttr);
// Create a potential copy from the allocated buffer to the final output in
// the slow path case.
if (options.vectorTransferSplit == VectorTransferSplit::VectorTransfer)
createScopedFullPartialVectorTransferWrite(xferWriteOp, inBoundsCond,
alloc);
else
createScopedFullPartialLinalgCopy(xferWriteOp, inBoundsCond, alloc);
xferOp->erase();
return success();
}
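
Both write-side helpers above build the same slow-path guard, `xor(%inBoundsCond, true)`. The sketch below is illustrative only and not part of the commit: it factors that shared step into a hypothetical helper, reusing exactly the builder calls that appear in the code above; the header paths are assumptions.

```
// Hypothetical helper (not part of this commit): the slow-path condition that
// createScopedFullPartialVectorTransferWrite and the write-side
// createScopedFullPartialLinalgCopy both build inline.
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// notInBounds = xor(inBoundsCond, true): true iff the transfer touches
// out-of-bounds elements, i.e. the slow path must run.
static Value createNotInBoundsCond(OpBuilder &b, Location loc,
                                   Value inBoundsCond) {
  Value trueVal = b.create<ConstantIntOp>(loc, /*value=*/true, /*width=*/1);
  return b.create<XOrOp>(loc, inBoundsCond, trueVal);
}
```

The commit itself keeps the two inline copies; the sketch is only meant to make the shared structure of the two slow-path variants explicit.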

View File

@@ -1,5 +1,5 @@
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split | FileCheck %s
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy | FileCheck %s --check-prefix=LINALG
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split -split-input-file | FileCheck %s
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy -split-input-file | FileCheck %s --check-prefix=LINALG
// CHECK-DAG: #[[$map_p4:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[$map_p8:.*]] = affine_map<()[s0] -> (s0 + 8)>
@@ -186,3 +186,206 @@ func @split_vector_transfer_read_strided_2d(
// CHECK: return %[[res]] : vector<4x8xf32>
return %1 : vector<4x8xf32>
}
// -----
func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf32>, %i: index, %j: index) {
vector.transfer_write %V, %A[%i, %j] :
vector<4x8xf32>, memref<?x8xf32>
return
}
// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK: func @split_vector_transfer_write_2d(
// CHECK-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// CHECK-SAME: %[[DEST:.*]]: memref<?x8xf32>,
// CHECK-SAME: %[[I:.*]]: index,
// CHECK-SAME: %[[J:.*]]: index) {
// CHECK-DAG: %[[C8:.*]] = constant 8 : index
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
// CHECK-DAG: %[[CT:.*]] = constant true
// CHECK: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// CHECK: %[[VAL_8:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
// CHECK: %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// CHECK: %[[DIM0_IN:.*]] = cmpi sle, %[[VAL_8]], %[[DIM0]] : index
// CHECK: %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
// CHECK: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// CHECK: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// CHECK: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]] ->
// CHECK-SAME: (memref<?x8xf32>, index, index) {
// CHECK: scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
// CHECK: } else {
// CHECK: %[[VAL_15:.*]] = memref.cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<?x8xf32>
// CHECK: scf.yield %[[VAL_15]], %[[C0]], %[[C0]]
// CHECK-SAME: : memref<?x8xf32>, index, index
// CHECK: }
// CHECK: vector.transfer_write %[[VEC]],
// CHECK-SAME: %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// CHECK-SAME: {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
// CHECK: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// CHECK: scf.if %[[OUT_BOUNDS]] {
// CHECK: %[[CASTED:.*]] = vector.type_cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<vector<4x8xf32>>
// CHECK: %[[RESULT_COPY:.*]] = memref.load %[[CASTED]][]
// CHECK-SAME: : memref<vector<4x8xf32>>
// CHECK: vector.transfer_write %[[RESULT_COPY]],
// CHECK-SAME: %[[DEST]][%[[I]], %[[J]]]
// CHECK-SAME: : vector<4x8xf32>, memref<?x8xf32>
// CHECK: }
// CHECK: return
// CHECK: }
// LINALG-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG: func @split_vector_transfer_write_2d(
// LINALG-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// LINALG-SAME: %[[DEST:.*]]: memref<?x8xf32>,
// LINALG-SAME: %[[I:.*]]: index,
// LINALG-SAME: %[[J:.*]]: index) {
// LINALG-DAG: %[[CT:.*]] = constant true
// LINALG-DAG: %[[C0:.*]] = constant 0 : index
// LINALG-DAG: %[[C4:.*]] = constant 4 : index
// LINALG-DAG: %[[C8:.*]] = constant 8 : index
// LINALG: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// LINALG: %[[IDX0:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
// LINALG: %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// LINALG: %[[DIM0_IN:.*]] = cmpi sle, %[[IDX0]], %[[DIM0]] : index
// LINALG: %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
// LINALG: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// LINALG: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// LINALG: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// LINALG-SAME: -> (memref<?x8xf32>, index, index) {
// LINALG: scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
// LINALG: } else {
// LINALG: %[[VAL_16:.*]] = memref.cast %[[TEMP]] : memref<4x8xf32> to memref<?x8xf32>
// LINALG: scf.yield %[[VAL_16]], %[[C0]], %[[C0]] : memref<?x8xf32>, index, index
// LINALG: }
// LINALG: vector.transfer_write %[[VEC]],
// LINALG-SAME: %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// LINALG-SAME: {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
// LINALG: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// LINALG: scf.if %[[OUT_BOUNDS]] {
// LINALG: %[[VAL_19:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// LINALG-DAG: %[[VAL_20:.*]] = affine.min #[[MAP2]](%[[VAL_19]], %[[I]], %[[C4]])
// LINALG-DAG: %[[VAL_21:.*]] = affine.min #[[MAP3]](%[[C8]], %[[J]], %[[C8]])
// LINALG: %[[VAL_22:.*]] = memref.subview %[[TEMP]]
// LINALG-SAME: [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
// LINALG-SAME: [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP4]]>
// LINALG: linalg.copy(%[[VAL_22]], %[[DEST]])
// LINALG-SAME: : memref<?x?xf32, #[[MAP4]]>, memref<?x8xf32>
// LINALG: }
// LINALG: return
// LINALG: }
// -----
func @split_vector_transfer_write_strided_2d(
%V: vector<4x8xf32>, %A: memref<7x8xf32, offset:?, strides:[?, 1]>,
%i: index, %j: index) {
vector.transfer_write %V, %A[%i, %j] :
vector<4x8xf32>, memref<7x8xf32, offset:?, strides:[?, 1]>
return
}
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK: func @split_vector_transfer_write_strided_2d(
// CHECK-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// CHECK-SAME: %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
// CHECK-SAME: %[[I:.*]]: index,
// CHECK-SAME: %[[J:.*]]: index) {
// CHECK-DAG: %[[C7:.*]] = constant 7 : index
// CHECK-DAG: %[[C8:.*]] = constant 8 : index
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
// CHECK-DAG: %[[CT:.*]] = constant true
// CHECK: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// CHECK: %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
// CHECK: %[[DIM0_IN:.*]] = cmpi sle, %[[DIM0]], %[[C7]] : index
// CHECK: %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
// CHECK: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// CHECK: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// CHECK: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// CHECK-SAME: -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
// CHECK: %[[VAL_15:.*]] = memref.cast %[[DEST]]
// CHECK-SAME: : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
// CHECK: scf.yield %[[VAL_15]], %[[I]], %[[J]]
// CHECK-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// CHECK: } else {
// CHECK: %[[VAL_16:.*]] = memref.cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
// CHECK: scf.yield %[[VAL_16]], %[[C0]], %[[C0]]
// CHECK-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// CHECK: }
// CHECK: vector.transfer_write %[[VEC]],
// CHECK-SAME: %[[IN_BOUND_DEST:.*]]#0
// CHECK-SAME: [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// CHECK-SAME: {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
// CHECK: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// CHECK: scf.if %[[OUT_BOUNDS]] {
// CHECK: %[[VAL_19:.*]] = vector.type_cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<vector<4x8xf32>>
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_19]][]
// CHECK-SAME: : memref<vector<4x8xf32>>
// CHECK: vector.transfer_write %[[VAL_20]], %[[DEST]][%[[I]], %[[J]]]
// CHECK-SAME: : vector<4x8xf32>, memref<7x8xf32, #[[MAP0]]>
// CHECK: }
// CHECK: return
// CHECK: }
// LINALG-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
// LINALG-DAG: #[[MAP5:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG: func @split_vector_transfer_write_strided_2d(
// LINALG-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// LINALG-SAME: %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
// LINALG-SAME: %[[I:.*]]: index,
// LINALG-SAME: %[[J:.*]]: index) {
// LINALG-DAG: %[[C0:.*]] = constant 0 : index
// LINALG-DAG: %[[CT:.*]] = constant true
// LINALG-DAG: %[[C7:.*]] = constant 7 : index
// LINALG-DAG: %[[C4:.*]] = constant 4 : index
// LINALG-DAG: %[[C8:.*]] = constant 8 : index
// LINALG: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// LINALG: %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
// LINALG: %[[DIM0_IN:.*]] = cmpi sle, %[[DIM0]], %[[C7]] : index
// LINALG: %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
// LINALG: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// LINALG: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// LINALG: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// LINALG-SAME: -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
// LINALG: %[[VAL_16:.*]] = memref.cast %[[DEST]]
// LINALG-SAME: : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
// LINALG: scf.yield %[[VAL_16]], %[[I]], %[[J]]
// LINALG-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// LINALG: } else {
// LINALG: %[[VAL_17:.*]] = memref.cast %[[TEMP]]
// LINALG-SAME: : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
// LINALG: scf.yield %[[VAL_17]], %[[C0]], %[[C0]]
// LINALG-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// LINALG: }
// LINALG: vector.transfer_write %[[VEC]],
// LINALG-SAME: %[[IN_BOUND_DEST:.*]]#0
// LINALG-SAME: [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// LINALG-SAME: {in_bounds = [true, true]}
// LINALG-SAME: : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
// LINALG: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// LINALG: scf.if %[[OUT_BOUNDS]] {
// LINALG-DAG: %[[VAL_20:.*]] = affine.min #[[MAP3]](%[[C7]], %[[I]], %[[C4]])
// LINALG-DAG: %[[VAL_21:.*]] = affine.min #[[MAP4]](%[[C8]], %[[J]], %[[C8]])
// LINALG: %[[VAL_22:.*]] = memref.subview %[[TEMP]]
// LINALG-SAME: [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
// LINALG-SAME: [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP5]]>
// LINALG: linalg.copy(%[[VAL_22]], %[[DEST]])
// LINALG-SAME: : memref<?x?xf32, #[[MAP5]]>, memref<7x8xf32, #[[MAP0]]>
// LINALG: }
// LINALG: return
// LINALG: }