Revert "[mlir][sparse] fix sparse tensor rewriting patterns that do not propagate sparse tensor SSA properly."

This reverts commit 70508b614e. That commit depends on another change that was itself reverted because it broke the Windows MLIR buildbot; reverting it as well to bring the remaining MLIR bots back to green.
2026-01-19 17:45:07 +08:00 · 2022-11-07 09:00:08 -08:00
parent 058f727a98
commit ec224e3b68
8 changed files with 120 additions and 181 deletions
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -603,12 +603,9 @@ void ForeachOp::build(
  std::fill_n(std::back_inserter(blockArgTypes), rank, builder.getIndexType());
  // Followed by one value.
  blockArgTypes.push_back(rtp.getElementType());
-  // Followed by reduction variable.
-  blockArgTypes.append(initArgs.getTypes().begin(), initArgs.getTypes().end());

  SmallVector<Location, 4> blockArgLocs;
-  std::fill_n(std::back_inserter(blockArgLocs), blockArgTypes.size(),
-              tensor.getLoc());
+  std::fill_n(std::back_inserter(blockArgLocs), rank + 1, tensor.getLoc());

  OpBuilder::InsertionGuard guard(builder);
  auto &region = *result.regions.front();
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -880,9 +880,6 @@ Value mlir::sparse_tensor::genValueForDense(OpBuilder &builder, Location loc,
  return val;
 }

-// FIXME:
-// 1. Dense tensors loop should be generated by loop emitter.
-// 2. Support reduction variables to propagate SSA chains properly.
 void mlir::sparse_tensor::genDenseTensorOrSparseConstantIterLoop(
    OpBuilder &builder, Location loc, Value src, unsigned rank,
    function_ref<void(OpBuilder &, Location, Value, ValueRange)> bodyBuilder) {
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -356,8 +356,8 @@ public:
    RankedTensorType cooTp = getUnorderedCOOFromType(dstTp);
    auto cooBuffer =
        rewriter.create<AllocTensorOp>(loc, cooTp, dstDynSizes).getResult();
-    ForeachOp foreachOp = rewriter.create<ForeachOp>(
-        loc, srcTensor, cooBuffer,
+    rewriter.create<ForeachOp>(
+        loc, srcTensor, llvm::None,
        [&](OpBuilder &builder, Location loc, ValueRange args, Value v,
            ValueRange reduc) {
          SmallVector<Value, 4> srcIndices;
@@ -368,11 +368,11 @@ public:
          }
          translateIndicesArray(builder, loc, op.getReassociationIndices(),
                                srcIndices, srcSizes, dstSizes, dstIndices);
-          auto t = builder.create<InsertOp>(loc, v, reduc.front(), dstIndices);
-          builder.create<sparse_tensor::YieldOp>(loc, t);
+          builder.create<InsertOp>(loc, v, cooBuffer, dstIndices);
+          builder.create<sparse_tensor::YieldOp>(loc);
        });
-    auto t = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
-    rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp, t);
+
+    rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp, cooBuffer);
    return success();
  }
 };
@@ -442,14 +442,13 @@ struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
        rewriter.create<AllocTensorOp>(loc, cooTp, ValueRange()).getResult();

    Value offset = constantIndex(rewriter, loc, 0);
-    ForeachOp foreachOp;
    for (Value input : op.getInputs()) {
      // Builds the indexing map.

      // Build a for op for each input tensor to append new values into the
      // output tensor.
-      foreachOp = rewriter.create<ForeachOp>(
-          loc, input, cooBuffer,
+      rewriter.create<ForeachOp>(
+          loc, input, llvm::None,
          [&](OpBuilder &builder, Location loc, ValueRange args, Value v,
              ValueRange reduc) {
            SmallVector<Value, 4> indices;
@@ -462,8 +461,8 @@ struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
                idx = builder.create<arith::AddIOp>(loc, idx, offset);
              indices.push_back(idx);
            }
-            auto t = builder.create<InsertOp>(loc, v, reduc.front(), indices);
-            builder.create<sparse_tensor::YieldOp>(loc, t);
+            builder.create<InsertOp>(loc, v, cooBuffer, indices);
+            builder.create<sparse_tensor::YieldOp>(loc);
          });
      // Accumulates the offset. Note that only static-shaped inputs are allowed
      // by concatenate op verifier, which saves us from computing the offset
@@ -472,10 +471,7 @@ struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
      assert(!ShapedType::isDynamic(d));
      offset = rewriter.create<arith::AddIOp>(loc, offset,
                                              constantIndex(rewriter, loc, d));
-      cooBuffer = foreachOp.getResult(0);
    }
-
-    cooBuffer = rewriter.create<LoadOp>(loc, cooBuffer, true);
    rewriter.replaceOpWithNewOp<ConvertOp>(op, rtp, cooBuffer);
    return success();
  }
@@ -606,8 +602,8 @@ private:
      srcTp = getUnorderedCOOFromType(srcTp);
      tmpCoo =
          rewriter.create<AllocTensorOp>(loc, srcTp, dynSrcSizes).getResult();
-      auto foreachOp = rewriter.create<ForeachOp>(
-          loc, src, tmpCoo,
+      rewriter.create<ForeachOp>(
+          loc, src, llvm::None,
          [&](OpBuilder &builder, Location loc, ValueRange args, Value v,
              ValueRange reduc) {
            SmallVector<Value, 4> indices;
@@ -615,10 +611,10 @@ private:
              uint64_t dim = toStoredDim(encSrc, i);
              indices.push_back(args[dim]);
            }
-            auto t = builder.create<InsertOp>(loc, v, reduc.front(), indices);
-            builder.create<sparse_tensor::YieldOp>(loc, t);
+            builder.create<InsertOp>(loc, v, tmpCoo, indices);
+            builder.create<sparse_tensor::YieldOp>(loc);
          });
-      src = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
+      src = tmpCoo;
    }

    // Sort the COO tensor so that its elements are ordered via increasing
@@ -657,31 +653,29 @@ private:
    getDynamicSizes(dstTp, srcSizes, dynDstSizes);
    Value dst =
        rewriter.create<AllocTensorOp>(loc, dstTp, dynDstSizes).getResult();
-    auto foreachOp = rewriter.create<ForeachOp>(
-        loc, src, dst,
-        [&](OpBuilder &builder, Location loc, ValueRange args, Value v,
-            ValueRange reduc) {
-          SmallVector<Value, 4> indices;
-          for (int64_t i = 0, e = srcTp.getRank(); i < e; i++) {
-            uint64_t dim = toStoredDim(encDst, i);
-            indices.push_back(args[dim]);
-          }
-          auto t = builder.create<InsertOp>(loc, v, reduc.front(), indices);
-          builder.create<sparse_tensor::YieldOp>(loc, t);
-        });
+    rewriter.create<ForeachOp>(loc, src, llvm::None,
+                               [&](OpBuilder &builder, Location loc,
+                                   ValueRange args, Value v, ValueRange reduc) {
+                                 SmallVector<Value, 4> indices;
+                                 for (int64_t i = 0, e = srcTp.getRank(); i < e;
+                                      i++) {
+                                   uint64_t dim = toStoredDim(encDst, i);
+                                   indices.push_back(args[dim]);
+                                 }
+                                 builder.create<InsertOp>(loc, v, dst, indices);
+                                 builder.create<sparse_tensor::YieldOp>(loc);
+                               });

-    // Release the temporary COO if it is created. Note that tmpCoo is
-    // invalidated due to foreach and updated to src.
+    // Release the temporary COO if it is created.
    if (tmpCoo)
-      rewriter.create<DeallocTensorOp>(loc, src);
+      rewriter.create<DeallocTensorOp>(loc, tmpCoo);

    // Directly replace op with dst results in bufferization error message
    // "sparse tensor allocation should not escape function".
    // As such, we insert a trivial tensor convert which will be removed by
    // codegen.
    rewriter.setInsertionPointAfter(op);
-    auto t = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
-    rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp, t);
+    rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp, dst);
    return success();
  }
 };
@@ -700,8 +694,6 @@ public:
    int64_t rank = rtp.getRank();
    auto enc = getSparseTensorEncoding(rtp);

-    SmallVector<Value> reduc = op.getInitArgs();
-
    // 1. Generates loop for the sparse input.
    SparseTensorLoopEmitter loopEmitter(ValueRange{input});
    loopEmitter.initializeLoopEmit(rewriter, loc);
@@ -709,9 +701,7 @@ public:
      // TODO: provide utility function for loop sequences that only contains
      // one for loop?
      loopEmitter.enterNewLoopSeq(rewriter, loc, 0, static_cast<size_t>(i));
-      // Note that reduc will be taken care of by loop emitter and get updated
-      // in place.
-      loopEmitter.enterLoopOverTensorAtDim(rewriter, loc, 0, i, reduc);
+      loopEmitter.enterLoopOverTensorAtDim(rewriter, loc, 0, i);
    }

    SmallVector<Value, 4> coords;
@@ -726,7 +716,15 @@ public:
                    : rewriter.create<memref::LoadOp>(loc, vals, coords);

    // 2. Inline the block in the foreach operator.
+    Block::iterator inlinePos = rewriter.getInsertionPoint();
    Block *srcBlock = op.getBody();
+    // Remove sparse_tensor.yield.
+    rewriter.eraseOp(srcBlock->getTerminator());
+
+    for (int64_t i = 0; i < rank; i++) {
+      loopEmitter.exitCurrentLoop(rewriter, loc);
+      loopEmitter.exitCurrentLoopSeq();
+    }

    SmallVector<Value, 4> args;
    // Remap coordinates.
@@ -736,33 +734,11 @@ public:
    }
    // Remap value.
    args.push_back(val);
-    // Remap reduction variables.
-    args.append(reduc);
-
-    // Remove sparse_tensor.yield.
-    SmallVector<Value> reducValue = srcBlock->getTerminator()->getOperands();
-    rewriter.eraseOp(srcBlock->getTerminator());

    // Inline body.
-    if (!reducValue.empty()) {
-      rewriter.mergeBlocks(srcBlock, rewriter.getBlock(), args);
-    } else {
-      // This is annoying, since scf.for inserts a implicit yield op when
-      // there is no reduction variable upon creation, in this case we need to
-      // merge the block *before* the yield op.
-      rewriter.mergeBlockBefore(srcBlock, &*rewriter.getInsertionPoint(), args);
-    }
-
-    for (int64_t i = 0; i < rank; i++) {
-      // Link the reduction chain. Note that loop emitter update the reducValue
-      // in place.
-      loopEmitter.exitCurrentLoop(rewriter, loc, reducValue);
-      loopEmitter.exitCurrentLoopSeq();
-    }
-
-    // Replace the foreach operator with the value returned by the outtermost
-    // for loop.
-    rewriter.replaceOp(op, reducValue);
+    rewriter.mergeBlockBefore(srcBlock, &*inlinePos, args);
+    // delete the foreach operator.
+    rewriter.eraseOp(op);
    return success();
  }
 };
@@ -825,8 +801,7 @@ struct NewRewriter : public OpRewritePattern<NewOp> {
                    .getResult(0);
    Type eltTp = dstTp.getElementType();
    Value value = genAllocaScalar(rewriter, loc, eltTp);
-    scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, c0, nnz, c1,
-                                                   ArrayRef<Value>(cooBuffer));
+    scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, c0, nnz, c1);
    rewriter.setInsertionPointToStart(forOp.getBody());

    SmallString<18> getNextFuncName{"getSparseTensorReaderNext",
@@ -841,17 +816,13 @@ struct NewRewriter : public OpRewritePattern<NewOp> {
          loc, indices, constantIndex(rewriter, loc, i)));
    }
    Value v = rewriter.create<memref::LoadOp>(loc, value);
-    auto t = rewriter.create<InsertOp>(loc, v, forOp.getRegionIterArg(0),
-                                       indicesArray);
-    rewriter.create<scf::YieldOp>(loc, ArrayRef<Value>(t));
+    rewriter.create<InsertOp>(loc, v, cooBuffer, indicesArray);
    rewriter.setInsertionPointAfter(forOp);
-    // Link SSA chain.
-    cooBuffer = forOp.getResult(0);

    // Release the sparse tensor reader.
    createFuncCall(rewriter, loc, "delSparseTensorReader", {}, {reader},
                   EmitCInterface::Off);
-    cooBuffer = rewriter.create<LoadOp>(loc, cooBuffer, true);
+
    Value newOp = rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp, cooBuffer);

    // Release the unordered COO tensor buffer.
--- a/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir
+++ b/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir
@@ -116,7 +116,6 @@ func.func @sparse_convert_complex(%arg0: tensor<100xcomplex<f64>>) -> tensor<100
 //      CHECK-RWT:        %[[V:.*]] = tensor.extract %[[A]]{{\[}}%[[FI]], %[[FJ]]] : tensor<2x4xf64>
 //      CHECK-RWT:        %[[NZ:.*]] = arith.cmpf une, %[[V]], %[[F0]] : f64
 //      CHECK-RWT:        scf.if %[[NZ]] {
-//                          // FIXME: the SSA chain is broken here!
 //      CHECK-RWT:          %{{.*}} = sparse_tensor.insert %[[V]] into %[[COO]]{{\[}}%[[FI]], %[[FJ]]]
 //      CHECK-RWT:        }
 //      CHECK-RWT:      }
@@ -127,13 +126,11 @@ func.func @sparse_convert_complex(%arg0: tensor<100xcomplex<f64>>) -> tensor<100
 //      CHECK-RWT:    %[[V2:.*]] = sparse_tensor.values %[[COO]]
 //      CHECK-RWT:    sparse_tensor.sort %[[NNZ]], %[[I0]], %[[I1]] jointly %[[V2]]
 //      CHECK-RWT:    %[[DST:.*]] = bufferization.alloc_tensor()
-//      CHECK-RWT:    %[[NEW_T:.*]] = sparse_tensor.foreach in %[[COO]] init(%[[DST]])
-//      CHECK-RWT:    ^bb0(%[[FI0:.*]]: index, %[[FI1:.*]]: index, %[[FV:.*]]: f64, %[[R0:.*]]: tensor
-//      CHECK-RWT:      %[[RET:.*]] = sparse_tensor.insert %[[FV]] into %[[R0]]{{\[}}%[[FI0]], %[[FI1]]]
-//      CHECK-RWT:      sparse_tensor.yield %[[RET]]
+//      CHECK-RWT:    sparse_tensor.foreach in %[[COO]]
+//      CHECK-RWT:    ^bb0(%[[FI0:.*]]: index, %[[FI1:.*]]: index, %[[FV:.*]]: f64):
+//      CHECK-RWT:      sparse_tensor.insert %[[FV]] into %[[DST]]{{\[}}%[[FI0]], %[[FI1]]]
 //      CHECK-RWT:    }
-//      CHECK-RWT:    %[[NT:.*]] = sparse_tensor.load %[[NEW_T]] hasInserts
-//      CHECK-RWT:    %[[R:.*]] = sparse_tensor.convert %[[NT]]
+//      CHECK-RWT:    %[[R:.*]] = sparse_tensor.convert %[[DST]]
 //      CHECK-RWT:    bufferization.dealloc_tensor %[[COO]]
 //      CHECK-RWT:    return %[[R]] : tensor<2x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>>
 func.func @sparse_convert_2d(%arg0: tensor<2x4xf64>) -> tensor<2x4xf64, #CSR> {
@@ -182,7 +179,6 @@ func.func @sparse_convert_2d(%arg0: tensor<2x4xf64>) -> tensor<2x4xf64, #CSR> {
 //       CHECK-RWT:    %[[I1r:.*]] = tensor.extract %[[SI]]{{\[}}%[[FI]], %[[C1]]] : tensor<2x2xi64>
 //       CHECK-RWT:    %[[I1:.*]] = arith.index_cast %[[I1r]] : i64 to index
 //       CHECK-RWT:    %[[V:.*]] = tensor.extract %[[SV]]{{\[}}%[[FI]]] : tensor<2xf32>
-//                     // FIXME: the SSA chain is broken here!
 //       CHECK-RWT:    sparse_tensor.insert %[[V]] into %[[COO]]{{\[}}%[[I0]], %[[I1]]]
 //       CHECK-RWT:  }
 //       CHECK-RWT:  %[[TI0:.*]] = sparse_tensor.indices %[[COO]] {dimension = 0 : index}
@@ -191,13 +187,11 @@ func.func @sparse_convert_2d(%arg0: tensor<2x4xf64>) -> tensor<2x4xf64, #CSR> {
 //       CHECK-RWT:  %[[TV:.*]] = sparse_tensor.values %[[COO]]
 //       CHECK-RWT:  sparse_tensor.sort %[[NNZ]], %[[TI0]], %[[TI1]] jointly %[[TV]]
 //       CHECK-RWT:  %[[DST:.*]] = bufferization.alloc_tensor()
-//       CHECK-RWT:  %[[RET:.*]] = sparse_tensor.foreach in %[[COO]] init(%[[DST]])
-//       CHECK-RWT:  ^bb0(%[[F2I0:.*]]: index, %[[F2I1:.*]]: index, %[[F2V:.*]]: f32, %[[R0:.*]]: tensor
-//       CHECK-RWT:    %[[NEW_T:.*]] = sparse_tensor.insert %[[F2V]] into %[[R0]]{{\[}}%[[F2I0]], %[[F2I1]]]
-//       CHECK-RWT:    sparse_tensor.yield %[[NEW_T]]
+//       CHECK-RWT:  sparse_tensor.foreach in %[[COO]]
+//       CHECK-RWT:  ^bb0(%[[F2I0:.*]]: index, %[[F2I1:.*]]: index, %[[F2V:.*]]: f32):
+//       CHECK-RWT:    sparse_tensor.insert %[[F2V]] into %[[DST]]{{\[}}%[[F2I0]], %[[F2I1]]]
 //       CHECK-RWT:  }
-//       CHECK-RWT:  %[[T:.*]] = sparse_tensor.load %[[RET]] hasInserts
-//       CHECK-RWT:  %[[R:.*]] = sparse_tensor.convert %[[T]]
+//       CHECK-RWT:  %[[R:.*]] = sparse_tensor.convert %[[DST]]
 //       CHECK-RWT:  bufferization.dealloc_tensor %[[COO]]
 //       CHECK-RWT:  return %[[R]] : tensor<8x7xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>>
 func.func @sparse_constant() -> tensor<8x7xf32, #CSR>{
--- a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
+++ b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
@@ -94,13 +94,11 @@ func.func @sparse_convert_1d_ss(%arg0: tensor<?xf32, #SparseVector64>) -> tensor
 //      CHECK-RWT:  %[[V:.*]] = sparse_tensor.values %[[A]]
 //      CHECK-RWT:  sparse_tensor.sort %[[NNZ]], %[[I0]] jointly %[[V]]
 //      CHECK-RWT:  %[[DST:.*]] = bufferization.alloc_tensor(%[[D]])
-//      CHECK-RWT:  %[[RET:.*]] = sparse_tensor.foreach in %[[A]] init(%[[DST]])
-//      CHECK-RWT:  ^bb0(%[[FI2:.*]]: index, %[[FV2:.*]]: f32, %[[T:.*]]: tensor<?xf32,
-//      CHECK-RWT:    %[[I:.*]] = sparse_tensor.insert %[[FV2]] into %[[T]]{{\[}}%[[FI2]]]
-//      CHECK-RWT:    sparse_tensor.yield %[[I]]
+//      CHECK-RWT:  sparse_tensor.foreach in %[[A]]
+//      CHECK-RWT:  ^bb0(%[[FI2:.*]]: index, %[[FV2:.*]]: f32):
+//      CHECK-RWT:    sparse_tensor.insert %[[FV2]] into %[[DST]]{{\[}}%[[FI2]]]
 //      CHECK-RWT:  }
-//      CHECK-RWT:  %[[T:.*]] = sparse_tensor.load %[[RET]] hasInserts
-//      CHECK-RWT:  %[[R:.*]] = sparse_tensor.convert %[[T]]
+//      CHECK-RWT:  %[[R:.*]] = sparse_tensor.convert %[[DST]]
 //      CHECK-RWT:  return %[[R]] : tensor<?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 32, indexBitWidth = 32 }>>
 func.func @sparse_convert(%arg0: tensor<?xf32, #SparseVector64>) -> tensor<?xf32, #SparseVector32> {
  %0 = sparse_tensor.convert %arg0 : tensor<?xf32, #SparseVector64> to tensor<?xf32, #SparseVector32>
--- a/mlir/test/Dialect/SparseTensor/rewriting_for_codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/rewriting_for_codegen.mlir
@@ -18,19 +18,18 @@
 // CHECK:         %[[T:.*]] = bufferization.alloc_tensor(%[[D0]], %[[D1]])
 // CHECK:         %[[N:.*]] = call @getSparseTensorReaderNNZ(%[[R]])
 // CHECK:         %[[VB:.*]] = memref.alloca()
-// CHECK:         %[[T2:.*]] = scf.for %{{.*}} = %[[C0]] to %[[N]] step %[[C1]] iter_args(%[[A2:.*]] = %[[T]])
+// CHECK:         scf.for %{{.*}} = %[[C0]] to %[[N]] step %[[C1]] {
 // CHECK:           func.call @getSparseTensorReaderNextF32(%[[R]], %[[DS]], %[[VB]])
 // CHECK:           %[[E0:.*]] = memref.load %[[DS]]{{\[}}%[[C0]]]
 // CHECK:           %[[E1:.*]] = memref.load %[[DS]]{{\[}}%[[C1]]]
 // CHECK:           %[[V:.*]] = memref.load %[[VB]][]
-// CHECK:           %[[T1:.*]] = sparse_tensor.insert %[[V]] into %[[A2]]{{\[}}%[[E0]], %[[E1]]]
-// CHECK:           scf.yield %[[T1]]
+// CHECK:           sparse_tensor.insert %[[V]] into %[[T]]{{\[}}%[[E0]], %[[E1]]]
 // CHECK:         }
 // CHECK:         call @delSparseTensorReader(%[[R]])
-// CHECK:         %[[T3:.*]] = sparse_tensor.load %[[T2]] hasInserts
-// CHECK:         %[[R:.*]] = sparse_tensor.convert %[[T3]]
-// CHECK:         bufferization.dealloc_tensor %[[T3]]
+// CHECK:         %[[R:.*]] = sparse_tensor.convert %[[T]]
+// CHECK:         bufferization.dealloc_tensor %[[T]]
 // CHECK:         return %[[R]]
+// CHECK:         }
 func.func @sparse_new(%arg0: !llvm.ptr<i8>) -> tensor<?x?xf32, #CSR> {
  %0 = sparse_tensor.new %arg0 : !llvm.ptr<i8> to tensor<?x?xf32, #CSR>
  return %0 : tensor<?x?xf32, #CSR>
--- a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir
@@ -19,18 +19,16 @@
 //       CHECK:  %[[TMP_5:.*]] = sparse_tensor.values %[[TMP_arg0]] : tensor<2x4xf64, #sparse_tensor
 //       CHECK:  %[[TMP_6:.*]] = memref.load %[[TMP_1]][%[[TMP_c0]]] : memref<?xindex>
 //       CHECK:  %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_1:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] iter_args(%[[A0:.*]] = %[[TMP_0]]) 
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] {
 //       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
 //   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
 //   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
 //       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_4:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A1:.*]] = %[[A0]]) 
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
 //       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
 //       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_5]][%[[TMP_arg4]]] : memref<?xf64>
-//       CHECK:      %[[NEW_1:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A1]][%[[TMP_23]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_1]]
+//       CHECK:      sparse_tensor.insert %[[TMP_28]] into %[[TMP_0]][%[[TMP_23]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
 //       CHECK:    }
-//       CHECK:    scf.yield %[[RET_4]]
 //       CHECK:  }
 //       CHECK:  %[[TMP_8:.*]] = sparse_tensor.pointers %[[TMP_arg1]] {dimension = 0 : index} : tensor<3x4xf64, #sparse_tensor
 //       CHECK:  %[[TMP_9:.*]] = sparse_tensor.indices %[[TMP_arg1]] {dimension = 0 : index} : tensor<3x4xf64, #sparse_tensor
@@ -39,19 +37,17 @@
 //       CHECK:  %[[TMP_12:.*]] = sparse_tensor.values %[[TMP_arg1]] : tensor<3x4xf64, #sparse_tensor
 //       CHECK:  %[[TMP_13:.*]] = memref.load %[[TMP_8]][%[[TMP_c0]]] : memref<?xindex>
 //       CHECK:  %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_2:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] iter_args(%[[A2:.*]] = %[[RET_1]]) 
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] {
 //       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
 //   CHECK-DAG:    %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
 //   CHECK-DAG:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
 //       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_5:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A3:.*]] = %[[A2]]) 
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
 //       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
 //       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_12]][%[[TMP_arg4]]] : memref<?xf64>
 //       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c2]] : index
-//       CHECK:      %[[NEW_2:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A3]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_2]]
+//       CHECK:      sparse_tensor.insert %[[TMP_28]] into %[[TMP_0]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
 //       CHECK:    }
-//       CHECK:    scf.yield %[[RET_5]]
 //       CHECK:  }
 //       CHECK:  %[[TMP_15:.*]] = sparse_tensor.pointers %[[TMP_arg2]] {dimension = 0 : index} : tensor<4x4xf64, #sparse_tensor
 //       CHECK:  %[[TMP_16:.*]] = sparse_tensor.indices %[[TMP_arg2]] {dimension = 0 : index} : tensor<4x4xf64, #sparse_tensor
@@ -60,22 +56,19 @@
 //       CHECK:  %[[TMP_19:.*]] = sparse_tensor.values %[[TMP_arg2]] : tensor<4x4xf64, #sparse_tensor
 //       CHECK:  %[[TMP_20:.*]] = memref.load %[[TMP_15]][%[[TMP_c0]]] : memref<?xindex>
 //       CHECK:  %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
-//       CHECK:  %[[RET_3:.*]] = scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] iter_args(%[[A4:.*]] = %[[RET_2]]) 
+//       CHECK:  scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] {
 //       CHECK:    %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
 //       CHECK:    %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
 //       CHECK:    %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
 //       CHECK:    %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
-//       CHECK:    %[[RET_6:.*]] = scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] iter_args(%[[A5:.*]] = %[[A4]]) 
+//       CHECK:    scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
 //       CHECK:      %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
 //       CHECK:      %[[TMP_28:.*]] = memref.load %[[TMP_19]][%[[TMP_arg4]]] : memref<?xf64>
 //       CHECK:      %[[TMP_29:.*]] = arith.addi %[[TMP_23]], %[[TMP_c5]] : index
-//       CHECK:      %[[NEW_3:.*]] = sparse_tensor.insert %[[TMP_28]] into %[[A5]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
-//       CHECK:      scf.yield %[[NEW_3]]
+//       CHECK:      sparse_tensor.insert %[[TMP_28]] into %[[TMP_0]][%[[TMP_29]], %[[TMP_27]]] : tensor<9x4xf64, #sparse_tensor
 //       CHECK:    }
-//       CHECK:    scf.yield %[[RET_6]]
 //       CHECK:  }
-//       CHECK:  %[[TMP_23:.*]] = sparse_tensor.load %[[RET_3]] hasInserts
-//       CHECK:  %[[TMP_22:.*]] = sparse_tensor.convert %[[TMP_23]] : tensor<9x4xf64, #sparse_tensor
+//       CHECK:  %[[TMP_22:.*]] = sparse_tensor.convert %[[TMP_0]] : tensor<9x4xf64, #sparse_tensor
 //       CHECK:  return %[[TMP_22]] : tensor<9x4xf64, #sparse_tensor
 func.func @concat_sparse_sparse(%arg0: tensor<2x4xf64, #DCSR>,
                                %arg1: tensor<3x4xf64, #DCSR>,
--- a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
@@ -52,16 +52,14 @@
 // CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
 // CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
 // CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R:.*]] = %[[B]])
+// CHECK-RWT:         scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
 // CHECK-RWT:           %[[SI:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
 // CHECK-RWT:           %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[I]]] : memref<?xf64>
 // CHECK-RWT:           %[[DI0:.*]] = arith.divui %[[SI]], %[[C10]] : index
 // CHECK-RWT:           %[[DI1:.*]] = arith.remui %[[SI]], %[[C10]] : index
-// CHECK-RWT:           %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R]]{{\[}}%[[DI0]], %[[DI1]]]
-// CHECK-RWT:           scf.yield %[[NT:.*]]
+// CHECK-RWT:           sparse_tensor.insert %[[SV]] into %[[B]]{{\[}}%[[DI0]], %[[DI1]]]
 // CHECK-RWT:         }
-// CHECK-RWT:         %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT:         %[[T:.*]] = sparse_tensor.convert %[[NT1]]
+// CHECK-RWT:         %[[T:.*]] = sparse_tensor.convert %[[B]]
 // CHECK-RWT:         return %[[T]] : tensor<10x10xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>
 //
 func.func @sparse_expand(%arg0: tensor<100xf64, #SparseVector>) -> tensor<10x10xf64, #SparseMatrix> {
@@ -113,28 +111,25 @@ func.func @sparse_expand(%arg0: tensor<100xf64, #SparseVector>) -> tensor<10x10x
 // CHECK-RWT:         %[[B:.*]] = bufferization.alloc_tensor()
 // CHECK-RWT:         %[[P0:.*]] = sparse_tensor.pointers %[[S]] {dimension = 0 : index}
 // CHECK-RWT:         %[[I0:.*]] = sparse_tensor.indices %[[S]] {dimension = 0 : index}
-// CHECK-RWT:         %[[P1:.*]] = sparse_tensor.pointers %[[S]] {dimension = 1 : index}
-// CHECK-RWT:         %[[I1:.*]] = sparse_tensor.indices %[[S]] {dimension = 1 : index}
-// CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
-// CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[A0:.*]] = %[[B]])
-// CHECK-RWT:           %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
-// CHECK-RWT:           %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
-// CHECK-RWT:           %[[RET_1:.*]] = scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] iter_args(%[[A1:.*]] = %[[A0]])
-// CHECK-RWT:             %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
-// CHECK-RWT:             %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
-// CHECK-RWT:             %[[T:.*]] = arith.muli %[[SI0]], %[[C10]] : index
-// CHECK-RWT:             %[[DI:.*]] = arith.addi %[[T]], %[[SI1]] : index
-// CHECK-RWT:             %[[R1:.*]] = sparse_tensor.insert %[[SV]] into %[[A1]]{{\[}}%[[DI]]]
-// CHECK-RWT              scf.yield %[[R1]]
-// CHECK-RWT            }
-// CHECK-RWT            scf.yield %[[RET_1]]
-// CHECK-RWT:         }
-// CHECK-RWT:        %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT:        %[[T:.*]] = sparse_tensor.convert %[[NT1]]
+// CHECK-RWT:          %[[P1:.*]] = sparse_tensor.pointers %[[S]] {dimension = 1 : index}
+// CHECK-RWT:          %[[I1:.*]] = sparse_tensor.indices %[[S]] {dimension = 1 : index}
+// CHECK-RWT:          %[[V:.*]] = sparse_tensor.values %[[S]]
+// CHECK-RWT:          %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
+// CHECK-RWT:          %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK-RWT:          scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
+// CHECK-RWT:            %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG:        %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG:        %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
+// CHECK-RWT:            %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
+// CHECK-RWT:            scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] {
+// CHECK-RWT:              %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
+// CHECK-RWT:              %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
+// CHECK-RWT:              %[[T:.*]] = arith.muli %[[SI0]], %[[C10]] : index
+// CHECK-RWT:              %[[DI:.*]] = arith.addi %[[T]], %[[SI1]] : index
+// CHECK-RWT:              sparse_tensor.insert %[[SV]] into %[[B]]{{\[}}%[[DI]]]
+// CHECK-RWT:            }
+// CHECK-RWT:          }
+// CHECK-RWT:        %[[T:.*]] = sparse_tensor.convert %[[B]]
 // CHECK-RWT:        return %[[T]] : tensor<100xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>
 //
 func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<100xf64, #SparseVector> {
@@ -196,7 +191,7 @@ func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<10
 // CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
 // CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
 // CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R:.*]] = %[[B]])
+// CHECK-RWT:         scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
 // CHECK-RWT:           %[[SI:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
 // CHECK-RWT:           %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[I]]] : memref<?xf64>
 // CHECK-RWT:           %[[T1:.*]] = arith.muli %[[DD0]], %[[C10]] : index
@@ -205,11 +200,9 @@ func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<10
 // CHECK-RWT:           %[[T3:.*]] = arith.remui %[[SI]], %[[T2]] : index
 // CHECK-RWT:           %[[T4:.*]] = arith.divui %[[T2]], %[[C10]] : index
 // CHECK-RWT:           %[[DI1:.*]] = arith.divui %[[T3]], %[[T4]] : index
-// CHECK-RWT:           %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R]]{{\[}}%[[DI0]], %[[DI1]]]
-// CHECK-RWT:           scf.yield %[[NT]]
+// CHECK-RWT:           sparse_tensor.insert %[[SV]] into %[[B]]{{\[}}%[[DI0]], %[[DI1]]]
 // CHECK-RWT:         }
-// CHECK-RWT:         %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT:         %[[T:.*]] = sparse_tensor.convert %[[NT1]]
+// CHECK-RWT:         %[[T:.*]] = sparse_tensor.convert %[[B]]
 // CHECK-RWT:         return %[[T]] : tensor<?x10xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>>
 //
 func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>) -> tensor<?x10xf64, #SparseMatrix> {
@@ -267,31 +260,28 @@ func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>) -> tensor<
 // CHECK-RWT:         %[[B:.*]] = bufferization.alloc_tensor(%[[DD0]])
 // CHECK-RWT:         %[[P0:.*]] = sparse_tensor.pointers %[[S]] {dimension = 0 : index}
 // CHECK-RWT:         %[[I0:.*]] = sparse_tensor.indices %[[S]] {dimension = 0 : index}
-// CHECK-RWT:         %[[P1:.*]] = sparse_tensor.pointers %[[S]] {dimension = 1 : index}
-// CHECK-RWT:         %[[I1:.*]] = sparse_tensor.indices %[[S]] {dimension = 1 : index}
-// CHECK-RWT:         %[[V:.*]] = sparse_tensor.values %[[S]]
-// CHECK-RWT:         %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK-RWT:         %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK-RWT:         %[[RET:.*]] = scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] iter_args(%[[R0:.*]] = %[[B]])
-// CHECK-RWT:           %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT-DAG:       %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
-// CHECK-RWT:           %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
-// CHECK-RWT:           %[[RET_1:.*]] = scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] iter_args(%[[R1:.*]] = %[[R0]])
-// CHECK-RWT:             %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
-// CHECK-RWT:             %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
-// CHECK-RWT:             %[[T1:.*]] = arith.divui %[[DD0]], %[[C10]] : index
-// CHECK-RWT:             %[[T2:.*]] = arith.muli %[[SI0]], %[[T1]] : index
-// CHECK-RWT:             %[[T3:.*]] = arith.divui %[[T1]], %[[SD1]] : index
-// CHECK-RWT:             %[[T4:.*]] = arith.muli %[[SI1]], %[[T3]] : index
-// CHECK-RWT:             %[[DI:.*]] = arith.addi %[[T2]], %[[T4]] : index
-// CHECK-RWT:             %[[NT:.*]] = sparse_tensor.insert %[[SV]] into %[[R1]]{{\[}}%[[DI]]]
-// CHECK-RWT              scf.yield %[[NT]]
-// CHECK-RWT            }
-// CHECK-RWT            scf.yield %[[RET_1]]
-// CHECK-RWT:        }
-// CHECK-RWT:        %[[NT1:.*]] = sparse_tensor.load %[[RET]] hasInserts
-// CHECK-RWT:        %[[T:.*]] = sparse_tensor.convert %[[NT1]]
+// CHECK-RWT:          %[[P1:.*]] = sparse_tensor.pointers %[[S]] {dimension = 1 : index}
+// CHECK-RWT:          %[[I1:.*]] = sparse_tensor.indices %[[S]] {dimension = 1 : index}
+// CHECK-RWT:          %[[V:.*]] = sparse_tensor.values %[[S]]
+// CHECK-RWT:          %[[S0:.*]] = memref.load %[[P0]]{{\[}}%[[C0]]] : memref<?xindex>
+// CHECK-RWT:          %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK-RWT:          scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
+// CHECK-RWT:            %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG:        %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG:        %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
+// CHECK-RWT:            %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
+// CHECK-RWT:            scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] {
+// CHECK-RWT:              %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
+// CHECK-RWT:              %[[SV:.*]] = memref.load %[[V]]{{\[}}%[[J]]] : memref<?xf64>
+// CHECK-RWT:              %[[T1:.*]] = arith.divui %[[DD0]], %[[C10]] : index
+// CHECK-RWT:              %[[T2:.*]] = arith.muli %[[SI0]], %[[T1]] : index
+// CHECK-RWT:              %[[T3:.*]] = arith.divui %[[T1]], %[[SD1]] : index
+// CHECK-RWT:              %[[T4:.*]] = arith.muli %[[SI1]], %[[T3]] : index
+// CHECK-RWT:              %[[DI:.*]] = arith.addi %[[T2]], %[[T4]] : index
+// CHECK-RWT:              sparse_tensor.insert %[[SV]] into %[[B]]{{\[}}%[[DI]]]
+// CHECK-RWT:            }
+// CHECK-RWT:          }
+// CHECK-RWT:        %[[T:.*]] = sparse_tensor.convert %[[B]]
 // CHECK-RWT:        return %[[T]] : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>
 //
 func.func @dynamic_sparse_collapse(%arg0: tensor<10x?xf64, #SparseMatrix>) -> tensor<?xf64, #SparseVector> {