Support VectorTransfer splitting on writes also.

VectorTransfer splitting previously handled only read transfer ops. This adds
the same logic for write ops. The resulting code uses two conditionals for
write ops, whereas read ops needed only one, but the created ops are built from
the same patterns, so pattern matching and expectations stay consistent apart
from the if/else ops.

Differential Revision: https://reviews.llvm.org/D102157
Tres Popp, 2021-05-07 16:19:22 +02:00
commit 88a48999d2 (parent 65e40f0b26)
2 changed files with 402 additions and 34 deletions
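
For context, here is a minimal sketch of how this split utility can be driven from C++. It is not part of this commit; the header paths, namespaces, and option type are assumptions based on the MLIR tree around this change, while `options.vectorTransferSplit`, the `VectorTransferSplit` values, and the entry point name come from the diff below.

```
// Hypothetical driver (not part of this commit): split one transfer op.
// Header paths and the options type are assumptions.
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"

using namespace mlir;

static LogicalResult splitOneTransfer(OpBuilder &b,
                                      VectorTransferOpInterface xferOp) {
  // VectorTransfer selects the vector.transfer slow path; LinalgCopy would
  // select the linalg.copy slow path exercised by the LINALG test prefix.
  vector::VectorTransformsOptions options;
  options.vectorTransferSplit = vector::VectorTransferSplit::VectorTransfer;

  // For reads, `ifOp` receives the created scf.if. With this commit, writes
  // are split as well: two scf.if ops are created and the original op erased.
  scf::IfOp ifOp;
  return vector::splitFullAndPartialTransfer(b, xferOp, options, &ifOp);
}
```

A pattern or pass would call something like this on each unmasked vector.transfer_read/write whose preconditions hold.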

View File

@@ -2379,6 +2379,7 @@ static Value createScopedSubViewIntersection(VectorTransferOpInterface xferOp,
xferOp.indices().take_front(xferOp.getLeadingShapedRank());
SmallVector<OpFoldResult, 4> sizes;
sizes.append(leadingIndices.begin(), leadingIndices.end());
auto isaWrite = isa<vector::TransferWriteOp>(xferOp);
xferOp.zipResultAndIndexing([&](int64_t resultIdx, int64_t indicesIdx) {
using MapList = ArrayRef<ArrayRef<AffineExpr>>;
Value dimMemRef = memref_dim(xferOp.source(), indicesIdx);
@@ -2397,7 +2398,7 @@ static Value createScopedSubViewIntersection(VectorTransferOpInterface xferOp,
SmallVector<OpFoldResult, 4> indices = llvm::to_vector<4>(llvm::map_range(
xferOp.indices(), [](Value idx) -> OpFoldResult { return idx; }));
return memref_sub_view(
xferOp.source(), indices, sizes,
isaWrite ? alloc : xferOp.source(), indices, sizes,
SmallVector<OpFoldResult>(memrefRank, OpBuilder(xferOp).getIndexAttr(1)));
}
@@ -2509,14 +2510,119 @@ static scf::IfOp createScopedFullPartialVectorTransferRead(
return fullPartialIfOp;
}
/// Given an `xferOp` for which:
/// 1. `inBoundsCond` and a `compatibleMemRefType` have been computed.
/// 2. a memref of a single vector `alloc` has been allocated.
/// Produce IR resembling:
/// ```
/// %1:3 = scf.if (%inBounds) {
/// memref.cast %A: memref<A...> to compatibleMemRefType
/// scf.yield %view, ... : compatibleMemRefType, index, index
/// } else {
/// %3 = vector.type_cast %extra_alloc :
/// memref<...> to memref<vector<...>>
/// %4 = memref.cast %alloc: memref<B...> to compatibleMemRefType
/// scf.yield %4, ... : compatibleMemRefType, index, index
/// }
/// ```
static ValueRange getLocationToWriteFullVec(vector::TransferWriteOp xferOp,
TypeRange returnTypes,
Value inBoundsCond,
MemRefType compatibleMemRefType,
Value alloc) {
using namespace edsc;
using namespace edsc::intrinsics;
Value zero = std_constant_index(0);
Value memref = xferOp.source();
return conditionBuilder(
returnTypes, inBoundsCond,
[&]() -> scf::ValueVector {
Value res = memref;
if (compatibleMemRefType != xferOp.getShapedType())
res = memref_cast(memref, compatibleMemRefType);
scf::ValueVector viewAndIndices{res};
viewAndIndices.insert(viewAndIndices.end(), xferOp.indices().begin(),
xferOp.indices().end());
return viewAndIndices;
},
[&]() -> scf::ValueVector {
Value casted = memref_cast(alloc, compatibleMemRefType);
scf::ValueVector viewAndIndices{casted};
viewAndIndices.insert(viewAndIndices.end(), xferOp.getTransferRank(),
zero);
return viewAndIndices;
});
}
/// Given an `xferOp` for which:
/// 1. `inBoundsCond` has been computed.
/// 2. a memref of a single vector `alloc` has been allocated.
/// 3. it originally wrote to %view
/// Produce IR resembling:
/// ```
/// %notInBounds = xor %inBounds, %true
/// scf.if (%notInBounds) {
/// %3 = subview %alloc [...][...][...]
/// linalg.copy(%3, %view)
/// }
/// ```
static void createScopedFullPartialLinalgCopy(vector::TransferWriteOp xferOp,
Value inBoundsCond, Value alloc) {
using namespace edsc;
using namespace edsc::intrinsics;
auto &b = ScopedContext::getBuilderRef();
auto notInBounds = b.create<XOrOp>(
xferOp->getLoc(), inBoundsCond,
b.create<::mlir::ConstantIntOp>(xferOp.getLoc(), true, 1));
conditionBuilder(notInBounds, [&]() {
Value memRefSubView = createScopedSubViewIntersection(
cast<VectorTransferOpInterface>(xferOp.getOperation()), alloc);
linalg_copy(memRefSubView, xferOp.source());
});
}
/// Given an `xferOp` for which:
/// 1. `inBoundsCond` has been computed.
/// 2. a memref of a single vector `alloc` has been allocated.
/// 3. it originally wrote to %view
/// Produce IR resembling:
/// ```
/// %notInBounds = xor %inBounds, %true
/// scf.if (%notInBounds) {
/// %2 = load %alloc : memref<vector<...>>
/// vector.transfer_write %2, %view[...] : memref<A...>, vector<...>
/// }
/// ```
static void
createScopedFullPartialVectorTransferWrite(vector::TransferWriteOp xferOp,
Value inBoundsCond, Value alloc) {
using namespace edsc;
using namespace edsc::intrinsics;
auto &b = ScopedContext::getBuilderRef();
auto notInBounds = b.create<XOrOp>(
xferOp->getLoc(), inBoundsCond,
b.create<::mlir::ConstantIntOp>(xferOp.getLoc(), true, 1));
conditionBuilder(notInBounds, [&]() {
BlockAndValueMapping mapping;
Value load = memref_load(vector_type_cast(
MemRefType::get({}, xferOp.vector().getType()), alloc));
mapping.map(xferOp.vector(), load);
b.clone(*xferOp.getOperation(), mapping);
});
}
/// Split a vector.transfer operation into an in-bounds (i.e., no out-of-bounds
/// masking) fastpath and a slowpath.
///
/// For vector.transfer_read:
/// If `ifOp` is not null and the result is `success`, the `ifOp` points to the
/// newly created conditional upon function return.
/// To accommodate the fact that the original vector.transfer indexing may be
/// arbitrary and the slow path indexes @[0...0] in the temporary buffer, the
/// scf.if op returns a view and values of type index.
/// At this time, only vector.transfer_read case is implemented.
///
/// Example (a 2-D vector.transfer_read):
/// ```
@@ -2537,6 +2643,32 @@ static scf::IfOp createScopedFullPartialVectorTransferRead(
/// ```
/// where `alloc` is a buffer of one vector alloca'ed at the top of the function.
///
/// For vector.transfer_write:
/// There are two conditional blocks. First, a block that decides which memref
/// and indices to use for an unmasked, in-bounds write. Then, a conditional
/// block that copies the partial buffer back into the final result on the slow
/// path.
///
/// Example (a 2-D vector.transfer_write):
/// ```
/// vector.transfer_write %arg, %0[...], %pad : memref<A...>, vector<...>
/// ```
/// is transformed into:
/// ```
/// %1:3 = scf.if (%inBounds) {
/// memref.cast %A: memref<A...> to compatibleMemRefType
/// scf.yield %view : compatibleMemRefType, index, index
/// } else {
/// memref.cast %alloc: memref<B...> to compatibleMemRefType
/// scf.yield %4 : compatibleMemRefType, index, index
/// }
/// %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true ...
/// true]}
/// scf.if (%notInBounds) {
/// // slowpath: not in-bounds vector.transfer or linalg.copy.
/// }
/// ```
/// where `alloc` is a buffer of one vector alloca'ed at the top of the function.
///
/// Preconditions:
/// 1. `xferOp.permutation_map()` must be a minor identity map
/// 2. the rank of the `xferOp.source()` and the rank of the `xferOp.vector()`
@@ -2554,27 +2686,29 @@ LogicalResult mlir::vector::splitFullAndPartialTransfer(
SmallVector<bool, 4> bools(xferOp.getTransferRank(), true);
auto inBoundsAttr = b.getBoolArrayAttr(bools);
if (options.vectorTransferSplit == VectorTransferSplit::ForceInBounds) {
xferOp->setAttr(vector::TransferReadOp::getInBoundsAttrName(),
inBoundsAttr);
xferOp->setAttr(xferOp.getInBoundsAttrName(), inBoundsAttr);
return success();
}
assert(succeeded(splitFullAndPartialTransferPrecondition(xferOp)) &&
"Expected splitFullAndPartialTransferPrecondition to hold");
auto xferReadOp = dyn_cast<vector::TransferReadOp>(xferOp.getOperation());
// Assert preconditions. Additionally, keep the variables in an inner scope to
// ensure they aren't used in the wrong scopes further down.
{
assert(succeeded(splitFullAndPartialTransferPrecondition(xferOp)) &&
"Expected splitFullAndPartialTransferPrecondition to hold");
// TODO: add support for write case.
if (!xferReadOp)
return failure();
auto xferReadOp = dyn_cast<vector::TransferReadOp>(xferOp.getOperation());
auto xferWriteOp = dyn_cast<vector::TransferWriteOp>(xferOp.getOperation());
if (xferReadOp.mask())
return failure();
if (!(xferReadOp || xferWriteOp))
return failure();
if (xferWriteOp && xferWriteOp.mask())
return failure();
if (xferReadOp && xferReadOp.mask())
return failure();
}
OpBuilder::InsertionGuard guard(b);
if (Operation *sourceOp = xferOp.source().getDefiningOp())
b.setInsertionPointAfter(sourceOp);
else
b.setInsertionPoint(xferOp);
b.setInsertionPoint(xferOp);
ScopedContext scope(b, xferOp.getLoc());
Value inBoundsCond = createScopedInBoundsCond(
cast<VectorTransferOpInterface>(xferOp.getOperation()));
@@ -2596,26 +2730,57 @@ LogicalResult mlir::vector::splitFullAndPartialTransfer(
MemRefType compatibleMemRefType =
getCastCompatibleMemRefType(xferOp.getShapedType().cast<MemRefType>(),
alloc.getType().cast<MemRefType>());
// Read case: full fill + partial copy -> in-bounds vector.xfer_read.
SmallVector<Type, 4> returnTypes(1 + xferOp.getTransferRank(),
b.getIndexType());
returnTypes[0] = compatibleMemRefType;
scf::IfOp fullPartialIfOp =
options.vectorTransferSplit == VectorTransferSplit::VectorTransfer
? createScopedFullPartialVectorTransferRead(
xferReadOp, returnTypes, inBoundsCond, compatibleMemRefType,
alloc)
: createScopedFullPartialLinalgCopy(xferReadOp, returnTypes,
inBoundsCond,
compatibleMemRefType, alloc);
if (ifOp)
*ifOp = fullPartialIfOp;
// Set existing read op to in-bounds; it always reads from a full buffer.
for (unsigned i = 0, e = returnTypes.size(); i != e; ++i)
xferReadOp.setOperand(i, fullPartialIfOp.getResult(i));
xferOp->setAttr(vector::TransferReadOp::getInBoundsAttrName(), inBoundsAttr);
if (auto xferReadOp =
dyn_cast<vector::TransferReadOp>(xferOp.getOperation())) {
// Read case: full fill + partial copy -> in-bounds vector.xfer_read.
scf::IfOp fullPartialIfOp =
options.vectorTransferSplit == VectorTransferSplit::VectorTransfer
? createScopedFullPartialVectorTransferRead(
xferReadOp, returnTypes, inBoundsCond, compatibleMemRefType,
alloc)
: createScopedFullPartialLinalgCopy(xferReadOp, returnTypes,
inBoundsCond,
compatibleMemRefType, alloc);
if (ifOp)
*ifOp = fullPartialIfOp;
// Set existing read op to in-bounds; it always reads from a full buffer.
for (unsigned i = 0, e = returnTypes.size(); i != e; ++i)
xferReadOp.setOperand(i, fullPartialIfOp.getResult(i));
xferOp->setAttr(xferOp.getInBoundsAttrName(), inBoundsAttr);
return success();
}
auto xferWriteOp = cast<vector::TransferWriteOp>(xferOp.getOperation());
// Decide which location to write the entire vector to.
auto memrefAndIndices = getLocationToWriteFullVec(
xferWriteOp, returnTypes, inBoundsCond, compatibleMemRefType, alloc);
// Do an in-bounds write to either the output or the extra allocated buffer.
// The operation is cloned to prevent deleting information needed for the
// later IR creation.
BlockAndValueMapping mapping;
mapping.map(xferWriteOp.source(), memrefAndIndices.front());
mapping.map(xferWriteOp.indices(), memrefAndIndices.drop_front());
auto *clone = b.clone(*xferWriteOp, mapping);
clone->setAttr(xferWriteOp.getInBoundsAttrName(), inBoundsAttr);
// Create a potential copy from the allocated buffer to the final output in
// the slow path case.
if (options.vectorTransferSplit == VectorTransferSplit::VectorTransfer)
createScopedFullPartialVectorTransferWrite(xferWriteOp, inBoundsCond,
alloc);
else
createScopedFullPartialLinalgCopy(xferWriteOp, inBoundsCond, alloc);
xferOp->erase();
return success();
}
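
Both write-side helpers above build the same slow-path guard, `xor(%inBoundsCond, true)`. The sketch below is illustrative only and not part of the commit: it factors that shared step into a hypothetical helper, reusing exactly the builder calls that appear in the code above; the header paths are assumptions.

```
// Hypothetical helper (not part of this commit): the slow-path condition that
// createScopedFullPartialVectorTransferWrite and the write-side
// createScopedFullPartialLinalgCopy both build inline.
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// notInBounds = xor(inBoundsCond, true): true iff the transfer touches
// out-of-bounds elements, i.e. the slow path must run.
static Value createNotInBoundsCond(OpBuilder &b, Location loc,
                                   Value inBoundsCond) {
  Value trueVal = b.create<ConstantIntOp>(loc, /*value=*/true, /*width=*/1);
  return b.create<XOrOp>(loc, inBoundsCond, trueVal);
}
```

The commit itself keeps the two inline copies; the sketch is only meant to make the shared structure of the two slow-path variants explicit.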

View File

@@ -1,5 +1,5 @@
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split | FileCheck %s
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy | FileCheck %s --check-prefix=LINALG
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split -split-input-file | FileCheck %s
// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy -split-input-file | FileCheck %s --check-prefix=LINALG
// CHECK-DAG: #[[$map_p4:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[$map_p8:.*]] = affine_map<()[s0] -> (s0 + 8)>
@@ -186,3 +186,206 @@ func @split_vector_transfer_read_strided_2d(
// CHECK: return %[[res]] : vector<4x8xf32>
return %1 : vector<4x8xf32>
}
// -----
func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf32>, %i: index, %j: index) {
vector.transfer_write %V, %A[%i, %j] :
vector<4x8xf32>, memref<?x8xf32>
return
}
// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK: func @split_vector_transfer_write_2d(
// CHECK-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// CHECK-SAME: %[[DEST:.*]]: memref<?x8xf32>,
// CHECK-SAME: %[[I:.*]]: index,
// CHECK-SAME: %[[J:.*]]: index) {
// CHECK-DAG: %[[C8:.*]] = constant 8 : index
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
// CHECK-DAG: %[[CT:.*]] = constant true
// CHECK: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// CHECK: %[[VAL_8:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
// CHECK: %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// CHECK: %[[DIM0_IN:.*]] = cmpi sle, %[[VAL_8]], %[[DIM0]] : index
// CHECK: %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
// CHECK: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// CHECK: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// CHECK: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]] ->
// CHECK-SAME: (memref<?x8xf32>, index, index) {
// CHECK: scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
// CHECK: } else {
// CHECK: %[[VAL_15:.*]] = memref.cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<?x8xf32>
// CHECK: scf.yield %[[VAL_15]], %[[C0]], %[[C0]]
// CHECK-SAME: : memref<?x8xf32>, index, index
// CHECK: }
// CHECK: vector.transfer_write %[[VEC]],
// CHECK-SAME: %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// CHECK-SAME: {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
// CHECK: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// CHECK: scf.if %[[OUT_BOUNDS]] {
// CHECK: %[[CASTED:.*]] = vector.type_cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<vector<4x8xf32>>
// CHECK: %[[RESULT_COPY:.*]] = memref.load %[[CASTED]][]
// CHECK-SAME: : memref<vector<4x8xf32>>
// CHECK: vector.transfer_write %[[RESULT_COPY]],
// CHECK-SAME: %[[DEST]][%[[I]], %[[J]]]
// CHECK-SAME: : vector<4x8xf32>, memref<?x8xf32>
// CHECK: }
// CHECK: return
// CHECK: }
// LINALG-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG: func @split_vector_transfer_write_2d(
// LINALG-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// LINALG-SAME: %[[DEST:.*]]: memref<?x8xf32>,
// LINALG-SAME: %[[I:.*]]: index,
// LINALG-SAME: %[[J:.*]]: index) {
// LINALG-DAG: %[[CT:.*]] = constant true
// LINALG-DAG: %[[C0:.*]] = constant 0 : index
// LINALG-DAG: %[[C4:.*]] = constant 4 : index
// LINALG-DAG: %[[C8:.*]] = constant 8 : index
// LINALG: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// LINALG: %[[IDX0:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
// LINALG: %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// LINALG: %[[DIM0_IN:.*]] = cmpi sle, %[[IDX0]], %[[DIM0]] : index
// LINALG: %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
// LINALG: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// LINALG: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// LINALG: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// LINALG-SAME: -> (memref<?x8xf32>, index, index) {
// LINALG: scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
// LINALG: } else {
// LINALG: %[[VAL_16:.*]] = memref.cast %[[TEMP]] : memref<4x8xf32> to memref<?x8xf32>
// LINALG: scf.yield %[[VAL_16]], %[[C0]], %[[C0]] : memref<?x8xf32>, index, index
// LINALG: }
// LINALG: vector.transfer_write %[[VEC]],
// LINALG-SAME: %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// LINALG-SAME: {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
// LINALG: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// LINALG: scf.if %[[OUT_BOUNDS]] {
// LINALG: %[[VAL_19:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
// LINALG-DAG: %[[VAL_20:.*]] = affine.min #[[MAP2]](%[[VAL_19]], %[[I]], %[[C4]])
// LINALG-DAG: %[[VAL_21:.*]] = affine.min #[[MAP3]](%[[C8]], %[[J]], %[[C8]])
// LINALG: %[[VAL_22:.*]] = memref.subview %[[TEMP]]
// LINALG-SAME: [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
// LINALG-SAME: [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP4]]>
// LINALG: linalg.copy(%[[VAL_22]], %[[DEST]])
// LINALG-SAME: : memref<?x?xf32, #[[MAP4]]>, memref<?x8xf32>
// LINALG: }
// LINALG: return
// LINALG: }
// -----
func @split_vector_transfer_write_strided_2d(
%V: vector<4x8xf32>, %A: memref<7x8xf32, offset:?, strides:[?, 1]>,
%i: index, %j: index) {
vector.transfer_write %V, %A[%i, %j] :
vector<4x8xf32>, memref<7x8xf32, offset:?, strides:[?, 1]>
return
}
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK: func @split_vector_transfer_write_strided_2d(
// CHECK-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// CHECK-SAME: %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
// CHECK-SAME: %[[I:.*]]: index,
// CHECK-SAME: %[[J:.*]]: index) {
// CHECK-DAG: %[[C7:.*]] = constant 7 : index
// CHECK-DAG: %[[C8:.*]] = constant 8 : index
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
// CHECK-DAG: %[[CT:.*]] = constant true
// CHECK: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// CHECK: %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
// CHECK: %[[DIM0_IN:.*]] = cmpi sle, %[[DIM0]], %[[C7]] : index
// CHECK: %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
// CHECK: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// CHECK: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// CHECK: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// CHECK-SAME: -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
// CHECK: %[[VAL_15:.*]] = memref.cast %[[DEST]]
// CHECK-SAME: : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
// CHECK: scf.yield %[[VAL_15]], %[[I]], %[[J]]
// CHECK-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// CHECK: } else {
// CHECK: %[[VAL_16:.*]] = memref.cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
// CHECK: scf.yield %[[VAL_16]], %[[C0]], %[[C0]]
// CHECK-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// CHECK: }
// CHECK: vector.transfer_write %[[VEC]],
// CHECK-SAME: %[[IN_BOUND_DEST:.*]]#0
// CHECK-SAME: [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// CHECK-SAME: {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
// CHECK: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// CHECK: scf.if %[[OUT_BOUNDS]] {
// CHECK: %[[VAL_19:.*]] = vector.type_cast %[[TEMP]]
// CHECK-SAME: : memref<4x8xf32> to memref<vector<4x8xf32>>
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_19]][]
// CHECK-SAME: : memref<vector<4x8xf32>>
// CHECK: vector.transfer_write %[[VAL_20]], %[[DEST]][%[[I]], %[[J]]]
// CHECK-SAME: : vector<4x8xf32>, memref<7x8xf32, #[[MAP0]]>
// CHECK: }
// CHECK: return
// CHECK: }
// LINALG-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
// LINALG-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
// LINALG-DAG: #[[MAP5:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
// LINALG: func @split_vector_transfer_write_strided_2d(
// LINALG-SAME: %[[VEC:.*]]: vector<4x8xf32>,
// LINALG-SAME: %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
// LINALG-SAME: %[[I:.*]]: index,
// LINALG-SAME: %[[J:.*]]: index) {
// LINALG-DAG: %[[C0:.*]] = constant 0 : index
// LINALG-DAG: %[[CT:.*]] = constant true
// LINALG-DAG: %[[C7:.*]] = constant 7 : index
// LINALG-DAG: %[[C4:.*]] = constant 4 : index
// LINALG-DAG: %[[C8:.*]] = constant 8 : index
// LINALG: %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
// LINALG: %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
// LINALG: %[[DIM0_IN:.*]] = cmpi sle, %[[DIM0]], %[[C7]] : index
// LINALG: %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
// LINALG: %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
// LINALG: %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
// LINALG: %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
// LINALG-SAME: -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
// LINALG: %[[VAL_16:.*]] = memref.cast %[[DEST]]
// LINALG-SAME: : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
// LINALG: scf.yield %[[VAL_16]], %[[I]], %[[J]]
// LINALG-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// LINALG: } else {
// LINALG: %[[VAL_17:.*]] = memref.cast %[[TEMP]]
// LINALG-SAME: : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
// LINALG: scf.yield %[[VAL_17]], %[[C0]], %[[C0]]
// LINALG-SAME: : memref<?x8xf32, #[[MAP0]]>, index, index
// LINALG: }
// LINALG: vector.transfer_write %[[VEC]],
// LINALG-SAME: %[[IN_BOUND_DEST:.*]]#0
// LINALG-SAME: [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
// LINALG-SAME: {in_bounds = [true, true]}
// LINALG-SAME: : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
// LINALG: %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
// LINALG: scf.if %[[OUT_BOUNDS]] {
// LINALG-DAG: %[[VAL_20:.*]] = affine.min #[[MAP3]](%[[C7]], %[[I]], %[[C4]])
// LINALG-DAG: %[[VAL_21:.*]] = affine.min #[[MAP4]](%[[C8]], %[[J]], %[[C8]])
// LINALG: %[[VAL_22:.*]] = memref.subview %[[TEMP]]
// LINALG-SAME: [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
// LINALG-SAME: [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP5]]>
// LINALG: linalg.copy(%[[VAL_22]], %[[DEST]])
// LINALG-SAME: : memref<?x?xf32, #[[MAP5]]>, memref<7x8xf32, #[[MAP0]]>
// LINALG: }
// LINALG: return
// LINALG: }