[mlir][sparse] support parallel for/reduction in sparsification.

This patch fix the re-revert D135927 (which caused a windows build failure) to re-enable parallel for/reduction. It also fix a warning caused by D137442. Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D137565
2026-02-05 13:21:04 +08:00 · 2022-11-07 17:10:01 +00:00
parent c4b74658c7
commit 75ac294b35
7 changed files with 284 additions and 126 deletions
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -219,9 +219,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
    OpBuilder &builder, Location loc, size_t tid, size_t dim,
    MutableArrayRef<Value> reduc, bool isParallel, ArrayRef<size_t> extraTids,
    ArrayRef<size_t> extraDims) {
+
  assert(dimTypes[tid].size() > dim);
  // We can not re-enter the same level.
  assert(!coord[tid][dim]);
+  // TODO: support multiple return on parallel for?
+  assert(!isParallel || reduc.size() <= 1);

  Value step = constantIndex(builder, loc, 1);
  auto dimType = dimTypes[tid][dim];
@@ -232,11 +235,38 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
  Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                           : loopSeqStack.back(); // univeral tid
  Value hi = highs[tid][dim];
+  Operation *loop = nullptr;
+  Value iv;
+  if (isParallel) {
+    scf::ParallelOp parOp =
+        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(parOp.getBody());
+    assert(parOp.getNumReductions() == reduc.size());
+    iv = parOp.getInductionVars()[0];
+
+    // In-place update on the reduction variable vector.
+    // Note that the init vals is not the actual reduction variables but instead
+    // used as a `special handle` to (temporarily) represent them. The
+    // expression on init vals will be moved into scf.reduce and replaced with
+    // the block arguments when exiting the loop (see exitForLoop). This is
+    // needed as we can not build the actual reduction block and get the actual
+    // reduction varaible before users fill parallel loop body.
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = parOp.getInitVals()[i];
+    loop = parOp;
+  } else {
+    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
+    builder.setInsertionPointToStart(forOp.getBody());
+    iv = forOp.getInductionVar();
+
+    // In-place update on the reduction variable vector.
+    assert(forOp.getNumRegionIterArgs() == reduc.size());
+    for (int i = 0, e = reduc.size(); i < e; i++)
+      reduc[i] = forOp.getRegionIterArg(i);
+    loop = forOp;
+  }
+  assert(loop && iv);

-  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
-  builder.setInsertionPointToStart(forOp.getBody());
-  Value iv = forOp.getInductionVar();
-  assert(iv);
  if (isSparseInput) {
    pidxs[tid][dim] = iv;
    // Generating a load on the indices array yields the coordinate.
@@ -253,16 +283,12 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(

  // NOTE: we can also prepares for next dim here in advance
  // Push the loop into stack
-  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+  loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), loop,
                         coord[tid][dim]);
  // Emit extra locals.
  emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);

-  // In-place update on the reduction variable vector.
-  assert(forOp.getNumRegionIterArgs() == reduc.size());
-  for (int i = 0, e = reduc.size(); i < e; i++)
-    reduc[i] = forOp.getRegionIterArg(i);
-  return forOp;
+  return loop;
 }

 Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
@@ -434,17 +460,73 @@ void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims(
  }
 }

-SmallVector<Value, 2>
-SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
-                                     ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
+                                          MutableArrayRef<Value> reduc) {
  LoopLevelInfo &loopInfo = loopStack.back();
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
-  auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
-  if (!reduc.empty()) {
-    assert(reduc.size() == forOp.getNumResults());
-    builder.setInsertionPointToEnd(forOp.getBody());
-    builder.create<scf::YieldOp>(loc, reduc);
+  auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop);
+  if (forOp) {
+    if (!reduc.empty()) {
+      assert(reduc.size() == forOp.getNumResults());
+      rewriter.setInsertionPointToEnd(forOp.getBody());
+      rewriter.create<scf::YieldOp>(loc, reduc);
+    }
+    // Exit the loop.
+    rewriter.setInsertionPointAfter(forOp);
+    // In-place update reduction variables.
+    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
+      reduc[i] = forOp.getResult(i);
+  } else {
+    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
+    if (!reduc.empty()) {
+      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
+      Operation *redExp = reduc.front().getDefiningOp();
+      // Reduction expression should have no use.
+      assert(redExp->getUses().empty());
+      // This must be a binary operation.
+      // NOTE: This is users' responsibilty to ensure the operation are
+      // commutative.
+      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
+
+      Value redVal = parOp.getInitVals().front();
+      Value curVal;
+      if (redExp->getOperand(0) == redVal)
+        curVal = redExp->getOperand(1);
+      else if (redExp->getOperand(1) == redVal)
+        curVal = redExp->getOperand(0);
+      // One of the operands must be the init value (which is also the
+      // previous reduction value).
+      assert(curVal);
+      // The reduction expression should be the only user of the reduction val
+      // inside the parallel for.
+      unsigned numUsers = 0;
+      for (Operation *op : redVal.getUsers()) {
+        if (op->getParentOp() == parOp)
+          numUsers++;
+      }
+      assert(numUsers == 1);
+      (void)numUsers; // to silence unused variable warning in release build
+
+      rewriter.setInsertionPointAfter(redExp);
+      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
+      // Attach to the reduction op.
+      Block *redBlock = &redOp.getRegion().getBlocks().front();
+      rewriter.setInsertionPointToEnd(redBlock);
+      Operation *newRed = rewriter.clone(*redExp);
+      // Replaces arguments of the reduction expression by using the block
+      // arguments from scf.reduce.
+      rewriter.updateRootInPlace(
+          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
+      // Erases the out-dated reduction expression.
+      rewriter.eraseOp(redExp);
+      rewriter.setInsertionPointToEnd(redBlock);
+      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
+    }
+    rewriter.setInsertionPointAfter(parOp);
+    // In-place update reduction variables.
+    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
+      reduc[i] = parOp.getResult(i);
  }

  // Finished iterating a tensor, clean up
@@ -458,14 +540,10 @@ SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
    if (!isDenseDLT(dimTypes[tid][dim]))
      highs[tid][dim] = Value();
  }
-  // exit the loop
-  builder.setInsertionPointAfter(forOp);
-  return forOp.getResults();
 }

-SmallVector<Value, 2>
-SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
-                                             ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitCoIterationLoop(
+    OpBuilder &builder, Location loc, MutableArrayRef<Value> reduc) {
  auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
  auto &dims = loopStack.back().dims;
  auto &tids = loopStack.back().tids;
@@ -499,10 +577,10 @@ SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
  }

  // Reduction value from users.
-  SmallVector<Value, 2> ret;
-  for (auto red : reduc) {
-    operands.push_back(red);
-    ret.push_back(whileOp->getResult(o++));
+  for (unsigned i = 0, e = reduc.size(); i < e; i++) {
+    operands.push_back(reduc[i]);
+    // In place update reduction variable.
+    reduc[i] = whileOp->getResult(o++);
  }

  // An (optional) universal index.
@@ -517,26 +595,24 @@ SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
  assert(o == operands.size());
  builder.create<scf::YieldOp>(loc, operands);
  builder.setInsertionPointAfter(whileOp);
-  return ret;
 }

-SmallVector<Value, 2>
-SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
-                                         ArrayRef<Value> reduc) {
+void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter,
+                                              Location loc,
+                                              MutableArrayRef<Value> reduc) {
  // Clean up the values, it would help use to discover potential bug at a
  // earlier stage (instead of silently using a wrong value).
  LoopLevelInfo &loopInfo = loopStack.back();
  assert(loopInfo.tids.size() == loopInfo.dims.size());
  SmallVector<Value, 2> red;
  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
-    red = exitCoiterationLoop(builder, loc, reduc);
+    exitCoIterationLoop(rewriter, loc, reduc);
  } else {
-    red = exitForLoop(builder, loc, reduc);
+    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
-  return red;
 }

 //===----------------------------------------------------------------------===//
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -380,8 +380,8 @@ public:
      ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc = {},
      ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});

-  SmallVector<Value, 2> exitCurrentLoop(OpBuilder &builder, Location loc,
-                                        ArrayRef<Value> reduc = {});
+  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
+                       MutableArrayRef<Value> reduc = {});

  /// Returns the array of coordinate for all the loop generated till now.
  void getCoordinateArray(SmallVectorImpl<Value> &coords) const {
@@ -452,17 +452,35 @@ private:
                                            ArrayRef<size_t> dims);

  /// Exits a for loop, returns the reduction results, e.g.,
+  /// For sequential for loops:
  /// %ret = for () {
  ///   ...
+  ///   %val = addi %args, %c
  ///   yield %val
  /// }
-  /// Return %ret to user, while %val is provided by users (`reduc`)
-  SmallVector<Value, 2> exitForLoop(OpBuilder &builder, Location loc,
-                                    ArrayRef<Value> reduc);
+  /// For parallel loops, the following generated code by users:
+  /// %ret = parallel () init(%args) {
+  ///   ...
+  ///   %val = op %args, %c
+  /// }
+  /// will be transformed into
+  /// %ret = parallel () init(%args) {
+  ///   ...
+  ///   scf.reduce(%c) bb0(%0, %1){
+  ///     %val = op %0, %1
+  ///     scf.reduce.return %val
+  ///   }
+  /// }
+  /// NOTE: only one instruction will be moved into reduce block, transformation
+  /// will fail if multiple instructions are used to compute the reduction
+  /// value.
+  /// Return %ret to user, while %val is provided by users (`reduc`).
+  void exitForLoop(RewriterBase &rewriter, Location loc,
+                   MutableArrayRef<Value> reduc);

  /// Exits a while loop, returns the reduction results.
-  SmallVector<Value, 2> exitCoiterationLoop(OpBuilder &builder, Location loc,
-                                            ArrayRef<Value> reduc);
+  void exitCoIterationLoop(OpBuilder &builder, Location loc,
+                           MutableArrayRef<Value> reduc);

  // Whether the loop emitter needs to treat the last tensor as the output
  // tensor.
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -410,6 +410,34 @@ static Value getCustomRedId(Operation *op) {
 // Sparse compiler synthesis methods (statements and expressions).
 //===----------------------------------------------------------------------===//

+/// Generates loop boundary statements (entering/exiting loops). The function
+/// passes and updates the reduction value.
+static Optional<Operation *> genLoopBoundary(
+    CodeGen &codegen, Merger &merger,
+    function_ref<Optional<Operation *>(MutableArrayRef<Value> reduc)>
+        callback) {
+  SmallVector<Value, 4> reduc;
+  if (codegen.redVal)
+    reduc.push_back(codegen.redVal);
+  if (codegen.expValues)
+    reduc.push_back(codegen.expCount);
+  if (codegen.insChain)
+    reduc.push_back(codegen.insChain);
+
+  auto r = callback(reduc);
+
+  // Callback should do in-place update on reduction value vector.
+  unsigned i = 0;
+  if (codegen.redVal)
+    updateReduc(merger, codegen, reduc[i++]);
+  if (codegen.expValues)
+    codegen.expCount = reduc[i++];
+  if (codegen.insChain)
+    codegen.insChain = reduc[i];
+
+  return r;
+}
+
 /// Local bufferization of all dense and sparse data structures.
 static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                       linalg::GenericOp op) {
@@ -869,23 +897,25 @@ static void genExpansion(Merger &merger, CodeGen &codegen, OpBuilder &builder,
 /// Returns parallelization strategy. Any implicit loop in the Linalg
 /// operation that is marked "parallel" is a candidate. Whether it is actually
 /// converted to a parallel operation depends on the requested strategy.
-static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
-                          bool isSparse) {
+static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isSparse) {
  // Reject parallelization of sparse output.
  if (codegen.sparseOut)
    return false;
+  // Parallel loops on tensor expansion can cause data races.
+  if (codegen.expCount)
+    return false;
  // Inspect strategy.
  switch (codegen.options.parallelizationStrategy) {
  case SparseParallelizationStrategy::kNone:
    return false;
  case SparseParallelizationStrategy::kDenseOuterLoop:
-    return isOuter && !isSparse && !isReduction;
+    return isOuter && !isSparse;
  case SparseParallelizationStrategy::kAnyStorageOuterLoop:
-    return isOuter && !isReduction;
+    return isOuter;
  case SparseParallelizationStrategy::kDenseAnyLoop:
-    return !isSparse && !isReduction;
+    return !isSparse;
  case SparseParallelizationStrategy::kAnyStorageAnyLoop:
-    return !isReduction;
+    return true;
  }
  llvm_unreachable("unexpected parallelization strategy");
 }
@@ -898,33 +928,16 @@ static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                         ArrayRef<size_t> extraDims) {
  Location loc = op.getLoc();
  auto iteratorTypes = op.getIteratorTypesArray();
-  bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
  bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) ||
                  isSingletonDLT(merger.getDimLevelType(tid, idx));
-  bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
-  assert(!isParallel);
-
-  // Emit a sequential for loop.
-  SmallVector<Value, 4> operands;
-  if (codegen.redVal)
-    operands.push_back(codegen.redVal);
-  if (codegen.expValues)
-    operands.push_back(codegen.expCount);
-  if (codegen.insChain)
-    operands.push_back(codegen.insChain);
-
-  Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
-      builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);
-
-  unsigned o = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, operands[o++]);
-  if (codegen.expValues)
-    codegen.expCount = operands[o++];
-  if (codegen.insChain)
-    codegen.insChain = operands[o++];
-  assert(o == operands.size());
+  bool isParallel = isParallelFor(codegen, isOuter, isSparse);

+  Operation *loop =
+      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        return codegen.loopEmitter.enterLoopOverTensorAtDim(
+            builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
+      }).value();
+  assert(loop);
  return loop;
 }

@@ -934,29 +947,15 @@ static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                           ArrayRef<size_t> condTids, ArrayRef<size_t> condDims,
                           ArrayRef<size_t> extraTids,
                           ArrayRef<size_t> extraDims) {
-  SmallVector<Value, 4> operands;
-
-  // Construct the while-loop with a parameter for each index.
-  if (codegen.redVal)
-    operands.push_back(codegen.redVal);
-  if (codegen.expValues)
-    operands.push_back(codegen.expCount);
-  if (codegen.insChain)
-    operands.push_back(codegen.insChain);
-
-  Operation *loop = codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
-      builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids,
-      extraDims);
-
-  unsigned o = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, operands[o++]);
-  if (codegen.expValues)
-    codegen.expCount = operands[o++];
-  if (codegen.insChain)
-    codegen.insChain = operands[o++];
-  assert(o == operands.size());

+  Operation *loop =
+      genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+        // Construct the while-loop with a parameter for each index.
+        return codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
+            builder, op.getLoc(), condTids, condDims, needsUniv, reduc,
+            extraTids, extraDims);
+      }).value();
+  assert(loop);
  return loop;
 }

@@ -1186,37 +1185,21 @@ static Operation *startLoop(Merger &merger, CodeGen &codegen,
 }

 /// Ends a single loop in current sequence. Returns new values for needsUniv.
-static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
+static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
                    linalg::GenericOp op, Operation *loop, unsigned idx,
                    unsigned li, bool needsUniv) {
  // End a while-loop.
  if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
-    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
                    merger.lat(li).bits, whileOp);
  } else {
    needsUniv = false;
  }

-  SmallVector<Value, 2> reduc;
-  if (codegen.redVal)
-    reduc.push_back(codegen.redVal);
-  if (codegen.expValues)
-    reduc.push_back(codegen.expCount);
-  if (codegen.insChain)
-    reduc.push_back(codegen.insChain);
-
-  auto loopRet =
-      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
-  assert(reduc.size() == loopRet.size());
-
-  unsigned o = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, loopRet[o++]);
-  if (codegen.expValues)
-    codegen.expCount = loopRet[o++];
-  if (codegen.insChain)
-    codegen.insChain = loopRet[o++];
-  assert(o == loopRet.size());
+  genLoopBoundary(codegen, merger, [&](MutableArrayRef<Value> reduc) {
+    codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
+    return llvm::None;
+  });

  return needsUniv;
 }
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
@@ -1,14 +1,13 @@
 // RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
 // RUN:   FileCheck %s --check-prefix=CHECK-PAR0
-// FIXME: we do not support vectorization/parallel loops in loop emitter right now
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR1
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR2
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR3
-// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// R_U_N:   FileCheck %s --check-prefix=CHECK-PAR4
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR1
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR2
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR3
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN:   FileCheck %s --check-prefix=CHECK-PAR4

 #DenseMatrix = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "dense" ]
@@ -151,7 +150,8 @@ func.func @scale_ss(%scale: f32,
 //
 // CHECK-PAR4-LABEL: func @matvec
 // CHECK-PAR4:         scf.parallel
-// CHECK-PAR4:           scf.for
+// CHECK-PAR4:           scf.parallel
+// CHECK-PAR4:             scf.reduce
 // CHECK-PAR4:         return
 //
 func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
@@ -0,0 +1,63 @@
+// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// RUN:   FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ]
+}>
+
+#trait_matvec = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // A
+    affine_map<(i,j) -> (j)>,    // b
+    affine_map<(i,j) -> (i)>     // x (out)
+  ],
+  iterator_types = ["parallel", "reduction"],
+  doc = "x(i) += A(i,j) * b(j)"
+}
+// CHECK-LABEL:  func.func @matvec(
+//  CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>>,
+//  CHECK-SAME:    %[[TMP_arg1:.*]]: tensor<32xf32>,
+//  CHECK-SAME:    %[[TMP_arg2:.*]]: tensor<16xf32>) -> tensor<16xf32> {
+//   CHECK-DAG:  %[[TMP_c16:.*]] = arith.constant 16 : index
+//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//       CHECK:  %[[TMP_0:.*]] = sparse_tensor.pointers %[[TMP_arg0]] {dimension = 1 : index}
+//       CHECK:  %[[TMP_1:.*]] = sparse_tensor.indices %[[TMP_arg0]] {dimension = 1 : index}
+//       CHECK:  %[[TMP_2:.*]] = sparse_tensor.values %[[TMP_arg0]]
+//       CHECK:  %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : memref<32xf32>
+//       CHECK:  %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : memref<16xf32>
+//       CHECK:  scf.parallel (%[[TMP_arg3:.*]]) = (%[[TMP_c0]]) to (%[[TMP_c16]]) step (%[[TMP_c1]]) {
+//       CHECK:    %[[TMP_6:.*]] = memref.load %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
+//       CHECK:    %[[TMP_7:.*]] = memref.load %[[TMP_0]][%[[TMP_arg3]]] : memref<?xindex>
+//       CHECK:    %[[TMP_8:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
+//       CHECK:    %[[TMP_9:.*]] = memref.load %[[TMP_0]][%[[TMP_8]]] : memref<?xindex>
+//       CHECK:    %[[TMP_10:.*]] = scf.parallel (%[[TMP_arg4:.*]]) = (%[[TMP_7]]) to (%[[TMP_9]]) step (%[[TMP_c1]]) init (%[[TMP_6]]) -> f32 {
+//       CHECK:      %[[TMP_11:.*]] = memref.load %[[TMP_1]][%[[TMP_arg4]]] : memref<?xindex>
+//       CHECK:      %[[TMP_12:.*]] = memref.load %[[TMP_2]][%[[TMP_arg4]]] : memref<?xf32>
+//       CHECK:      %[[TMP_13:.*]] = memref.load %[[TMP_3]][%[[TMP_11]]] : memref<32xf32>
+//       CHECK:      %[[TMP_14:.*]] = arith.mulf %[[TMP_12]], %[[TMP_13]] : f32
+//       CHECK:      scf.reduce(%[[TMP_14]])  : f32 {
+//       CHECK:      ^bb0(%[[TMP_arg5:.*]]: f32, %[[TMP_arg6:.*]]: f32):
+//       CHECK:        %[[TMP_15:.*]] = arith.addf %[[TMP_arg5]], %[[TMP_arg6]] : f32
+//       CHECK:        scf.reduce.return %[[TMP_15]] : f32
+//       CHECK:      }
+//       CHECK:      scf.yield
+//       CHECK:    }
+//       CHECK:    memref.store %[[TMP_10]], %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
+//       CHECK:    scf.yield
+//       CHECK:  }
+//       CHECK:  %[[TMP_5:.*]] = bufferization.to_tensor %[[TMP_4]] : memref<16xf32>
+//       CHECK:  return %[[TMP_5]] : tensor<16xf32>
+func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
+                  %argb: tensor<32xf32>,
+	          %argx: tensor<16xf32>) -> tensor<16xf32> {
+  %0 = linalg.generic #trait_matvec
+      ins(%arga, %argb : tensor<16x32xf32, #CSR>, tensor<32xf32>)
+     outs(%argx: tensor<16xf32>) {
+    ^bb(%A: f32, %b: f32, %x: f32):
+      %0 = arith.mulf %A, %b : f32
+      %1 = arith.addf %0, %x : f32
+      linalg.yield %1 : f32
+  } -> tensor<16xf32>
+  return %0 : tensor<16xf32>
+}
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -2,6 +2,14 @@
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
+//
+// Do the same run, but now with parallelization.
+//
+// RUN: mlir-opt %s --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+

 #CSR = #sparse_tensor.encoding<{
  dimLevelType = [ "dense", "compressed" ],
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -4,6 +4,16 @@
 // RUN:  -e entry -entry-point-result=void  \
 // RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
+//
+// Do the same run, but now with parallelization.
+//
+// RUN: mlir-opt %s \
+// RUN:   --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
+// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s

 !Filename = !llvm.ptr<i8>