[mlir][sparse] Support strided convolution on compressed level.
Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D158912
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"

 using namespace mlir;
 using namespace mlir::sparse_tensor;
@@ -35,6 +36,8 @@ using namespace mlir::sparse_tensor;
 #define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
 #define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
 #define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
+#define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
+#define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
 #define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))

 //===----------------------------------------------------------------------===//
@@ -117,8 +120,8 @@ static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                             Level lvl) {
   // sliceCrd = (tensorCrd - offset) / stride
   crd = SUBI(crd, offset);
-  Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
-  crd = builder.create<arith::DivUIOp>(loc, crd, stride);
+  Value rem = REMUI(crd, stride);
+  crd = DIVUI(crd, stride);
   return std::make_pair(crd, rem);
 }

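Note: the comment inside fromSliceCrd captures the whole mapping: sliceCrd = (tensorCrd - offset) / stride, where the remainder reports whether the tensor coordinate lies on the stride at all. Below is a minimal scalar sketch of that arithmetic in plain C++ (standing in for the arith ops the emitter builds on mlir::Value; the offset/stride values in main are made up for illustration):

#include <cstdint>
#include <cstdio>
#include <utility>

// Scalar model of fromSliceCrd: translate a coordinate in the underlying
// tensor into a coordinate in the slice, plus the remainder that tells us
// whether the tensor coordinate actually lies on the slice's stride.
std::pair<uint64_t, uint64_t> fromSliceCrd(uint64_t crd, uint64_t offset,
                                           uint64_t stride) {
  // sliceCrd = (tensorCrd - offset) / stride
  crd -= offset;               // SUBI
  uint64_t rem = crd % stride; // REMUI
  crd /= stride;               // DIVUI
  return {crd, rem};
}

int main() {
  // A slice starting at offset 3 with stride 2: tensor coords 3, 5, 7 map to
  // slice coords 0, 1, 2; coords 4 and 6 have rem != 0 and must be skipped.
  for (uint64_t tensorCrd = 3; tensorCrd <= 7; ++tensorCrd) {
    auto [sliceCrd, rem] = fromSliceCrd(tensorCrd, /*offset=*/3, /*stride=*/2);
    std::printf("tensorCrd=%llu -> sliceCrd=%llu (on stride: %s)\n",
                (unsigned long long)tensorCrd, (unsigned long long)sliceCrd,
                rem == 0 ? "yes" : "no");
  }
  return 0;
}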
@@ -725,6 +728,7 @@ Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
   }
   case LoopCondKind::SparseAffineCond: {
     assert(ivs.size() == 1);
+
     Value crdHi; // loop upper bound
     {
       OpBuilder::InsertionGuard guard(builder);
@@ -732,9 +736,9 @@ Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
       // crdHi is a loop invariant, hoist the computation outside the loop.
       if (llvm::isa_and_nonnull<scf::WhileOp>(loop))
        builder.setInsertionPoint(loop);
-      auto [size, stride] = sliceMeta[tid][lvl].back();
+      auto [remSz, stride] = sliceMeta[tid][lvl].back();
       assert(stride == 1 && "Not yet implemented");
-      crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, size);
+      crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, remSz);
     }
     assert(crdHi);
     return genSparseReducedAffineCond(builder, loc,
@@ -792,18 +796,33 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
     return std::nullopt;
   }
   case LoopCondKind::SparseAffineUnRedCond: {
+    unsigned depth = sliceStack[tid].back().depth;
+    unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
     assert(ivs.size() == 3);
-    // Coord is the relative offset related to its parents.
-    // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
-    assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implement");
+
     // Updates the current slice info
     SliceInfo &sliceInfo = sliceStack[tid].back();
     sliceInfo.isNonEmpty = ivs[0];
     sliceInfo.minCrd = ivs[1];
     sliceInfo.offset = ivs[2];
-    coords[tid][lvl] = sliceInfo.offset;
+
+    // Crd (the value we used to coiterate) is the relative offset related to
+    // its parents; we can use the absolute offset here because when depth = 1,
+    // absOffset[lvl][depth - 1] always equals zero.
+    // TODO: Update crd = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
+    assert(depth == 1 && "TODO: not yet implement");
+    Value crd = sliceInfo.offset;
+
+    Value onStride = constantI1(builder, loc, true);
+    if (curStride != 1) {
+      Value strideVal = C_IDX(curStride);
+      Value rem = REMUI(crd, strideVal);
+      crd = DIVUI(crd, strideVal);
+      onStride = CMPI(eq, rem, C_IDX(0));
+    }
+    coords[tid][lvl] = crd;
     // No extra check is needed before accessing the tensor level.
-    return std::nullopt;
+    return onStride;
   }
   default:
     llvm_unreachable("Unhandled LoopCondKind");
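In scalar terms, the rewritten SparseAffineUnRedCond body unpacks the three while-loop induction values back into the slice info, then derives both the coordinate handed to the loop body and the on-stride predicate. A hedged model of that step, with an assumed Slice struct standing in for SliceInfo and plain integers standing in for Values:

#include <cstdint>
#include <tuple>

// Assumed stand-in for the SliceInfo fields updated from ivs[0..2].
struct Slice {
  bool isNonEmpty;
  uint64_t minCrd;
  uint64_t offset;
};

// Scalar model of the SparseAffineUnRedCond case: store the induction values,
// then divide the absolute offset by the stride; the remainder decides
// whether this iteration lands on the stride at all.
std::tuple<uint64_t, bool> updateAndFilter(Slice &s, bool iv0, uint64_t iv1,
                                           uint64_t iv2, unsigned stride) {
  s.isNonEmpty = iv0;
  s.minCrd = iv1;
  s.offset = iv2;

  uint64_t crd = s.offset; // absolute == relative offset while depth == 1
  bool onStride = true;
  if (stride != 1) {
    onStride = (crd % stride == 0); // REMUI + CMPI(eq)
    crd /= stride;                  // DIVUI
  }
  return std::make_tuple(crd, onStride);
}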
@@ -814,11 +833,44 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
 ValueRange LoopEmitter::genCheckedValue(OpBuilder &builder, Location loc,
                                         Value pred, ValueRange curArgs,
                                         TensorLvlCond cond) {
-  // Currently only sparse slice condition need extra check.
-  assert(isSliceCond(cond.second) && isSparseCond(cond.second));
-  assert(curArgs.size() == 1);
-  Value nextPos = ADDI(curArgs.front(), C_IDX(1));
-  return SELECT(pred, curArgs.front(), nextPos)->getResults();
+  assert(isSparseCond(cond.second));
+  auto [tid, lvl] = unpackTensorLevel(cond.first);
+  if (isAffineIdxUnRedCond(cond.second)) {
+    unsigned depth = sliceStack[tid].back().depth;
+    unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
+    if (curStride == 1)
+      return curArgs;
+    // Build
+    // if (onStride) {
+    //    yield curSlice
+    // } else {
+    //    yield nxSlice.
+    // }
+    assert(curArgs.size() == 3);
+    auto ifOp = builder.create<scf::IfOp>(loc, curArgs.getTypes(), pred, true);
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      // If all slices are legit, yield the current value.
+      builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      YIELD(curArgs);
+
+      // If not all slices are legit, yield the updated value.
+      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+      auto [nonEmpty, minCrd, offset] =
+          genSliceNextInduction(builder, loc, tid, lvl);
+      SmallVector<Value> nxSlice{nonEmpty, minCrd, offset};
+      YIELD(nxSlice);
+    }
+    // If all slices are legit, start the user generated code.
+    return ifOp.getResults();
+  } else {
+    // Currently only the sparse slice condition needs an extra check.
+    assert(isSliceCond(cond.second) && isSparseCond(cond.second));
+    assert(curArgs.size() == 1);
+    Value nextPos = ADDI(curArgs.front(), C_IDX(1));
+    return SELECT(pred, curArgs.front(), nextPos)->getResults();
+  }
+  llvm_unreachable("unhandled case");
 }

 std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
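genCheckedValue now has two shapes: the slice case keeps its single-position select, while the unreduced affine case builds an scf.if that either keeps the current (isNonEmpty, minCrd, offset) triple or advances to the next slice. A scalar sketch of that branch, assuming a made-up SliceTriple struct and a placeholder nextInduction in place of genSliceNextInduction:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the triple the emitted scf.if yields.
struct SliceTriple {
  bool isNonEmpty;
  uint64_t minCrd;
  uint64_t offset;
};

// Placeholder for genSliceNextInduction; the real advance logic is elided.
SliceTriple nextInduction(const SliceTriple &cur) {
  return {cur.isNonEmpty, cur.minCrd + 1, cur.offset + 1};
}

// Scalar model of the generated code: keep the current slice when the
// coordinate is on the stride, otherwise yield the advanced slice.
SliceTriple checkedValue(bool onStride, const SliceTriple &cur) {
  if (onStride)
    return cur;              // then-region: YIELD(curArgs)
  return nextInduction(cur); // else-region: YIELD(nxSlice)
}

int main() {
  SliceTriple cur{true, 4, 8};
  std::printf("kept offset=%llu, advanced offset=%llu\n",
              (unsigned long long)checkedValue(true, cur).offset,
              (unsigned long long)checkedValue(false, cur).offset);
  return 0;
}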
@@ -1877,9 +1929,6 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
                                           TensorId tid, Level lvl) {
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
   unsigned depth = levelReducedDep[tid][lvl];
-  // TODO: handle case when the current slice stride is not one.
-  assert(sliceMeta[tid][lvl][depth].second == 1 && "Not yet implemented");
-
   // The remaining slice size after reduction.
   Value remSz = sliceMeta[tid][lvl][depth + 1].first;
   // Dense slice begin is trivial
@@ -2251,8 +2300,6 @@ LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,

   // FIXME: compute relative offset.
   assert(info.depth - 1 == 0);
-  Value nextRelOffset = nextAbsOffset;
-  nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);
   return std::make_tuple(nextNonEmpty, nextMinCrd, nextAbsOffset);
 }

@@ -393,7 +393,7 @@ private:
   }
   static bool isTrivalIdxCond(LoopCondKind k) { return !isAffineIdxCond(k); }

-  /// Whether the affine index expression is not fully reduced.
+  /// Whether the affine index expression is fully reduced.
   static bool isAffineIdxUnRedCond(LoopCondKind k) {
     return isAffineIdxCond(k) && static_cast<uint8_t>(k) & kAffineIdxCondUnRed;
   }
@@ -405,7 +405,7 @@ private:
   // E.g., to iterate over sparse tensor slice, we need to check whether the
   // current coordinate is on the slice (e.g., due to stride) or not.
   static bool isCondWithExtraCheck(LoopCondKind k) {
-    return isSparseCond(k) && isSliceCond(k);
+    return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
   }

   static LoopCondKind makeLoopCondKind(bool isSparse, bool isSlice,
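Both header hunks lean on LoopCondKind packing orthogonal properties into bits, so predicates such as isCondWithExtraCheck reduce to mask tests. A sketch of that pattern with invented flag values (the real kSparseCond/kSliceCond/kAffineIdxCond/kAffineIdxCondUnRed constants are defined elsewhere in LoopEmitter.h and are not part of this diff):

#include <cstdint>

enum class LoopCondKind : uint8_t {};
// Invented bit assignments, for illustration only.
constexpr uint8_t kSparseCond = 1 << 0;
constexpr uint8_t kSliceCond = 1 << 1;
constexpr uint8_t kAffineIdxCond = 1 << 2;
constexpr uint8_t kAffineIdxCondUnRed = 1 << 3;

constexpr bool isSparseCond(LoopCondKind k) {
  return static_cast<uint8_t>(k) & kSparseCond;
}
constexpr bool isSliceCond(LoopCondKind k) {
  return static_cast<uint8_t>(k) & kSliceCond;
}
constexpr bool isAffineIdxCond(LoopCondKind k) {
  return static_cast<uint8_t>(k) & kAffineIdxCond;
}
constexpr bool isAffineIdxUnRedCond(LoopCondKind k) {
  return isAffineIdxCond(k) && (static_cast<uint8_t>(k) & kAffineIdxCondUnRed);
}

// After this commit, sparse unreduced affine conditions also require the
// extra on-stride check, not just slice conditions.
constexpr bool isCondWithExtraCheck(LoopCondKind k) {
  return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
}

static_assert(isCondWithExtraCheck(static_cast<LoopCondKind>(
                  kSparseCond | kAffineIdxCond | kAffineIdxCondUnRed)),
              "sparse unreduced affine conditions take the extra check");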
@@ -0,0 +1,102 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+// do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
+// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+#CCCC = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ]
+}>
+
+// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
+func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
+  %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<2> : tensor<2xi64>}
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<2> : tensor<2xi64>}
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
+    outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @entry() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c6 = arith.constant 6 : index
+  %c9 = arith.constant 9 : index
+  %f10 = arith.constant 10.00000e+00 : f32
+  %val = arith.constant 2.00000e+00 : f32
+  %zero = arith.constant 0.00000e+00 : f32
+
+  %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c9, %c9, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c3, %c3, %c0] : tensor<?x?x?x?xf32>
+  %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+
+  %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
+    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
+
+  %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+  // CHECK: ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+  vector.print %dense_v : vector<3x3x3x1xf32>
+
+  // CHECK: ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+  %v1 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+  vector.print %v1 : vector<3x3x3x1xf32>
+
+  // Free the resources
+  bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
+  bufferization.dealloc_tensor %filter2D_nhwc : tensor<?x?x?x?xf32>
+  bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>
+
+  bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
+  return
+}
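As a sanity check on the FileCheck expectations: the input is zero except for a single 10.0 at [0, 3, 3, 0], the 3x3x3x1 filter is filled with 2.0, and the convolution strides by 2, so the only output window whose footprint covers input position (3, 3) starts at (2, 2), i.e. output position [0, 1, 1, 0], giving 10 * 2 = 20 in both the dense and the sparse run. A scalar re-computation under the sizes used by the test:

#include <cstdio>

int main() {
  // Input N=3, H=9, W=9, C=3, all zero except in[0][3][3][0] = 10.0.
  static float in[3][9][9][3] = {};
  in[0][3][3][0] = 10.0f;
  const float w = 2.0f; // every filter tap is 2.0

  // The test allocates (and reads back) a 3x3x3x1 output.
  float out[3][3][3][1] = {};
  for (int n = 0; n < 3; ++n)
    for (int oh = 0; oh < 3; ++oh)
      for (int ow = 0; ow < 3; ++ow)
        for (int kh = 0; kh < 3; ++kh)
          for (int kw = 0; kw < 3; ++kw)
            for (int c = 0; c < 3; ++c) // stride 2 on both spatial dims
              out[n][oh][ow][0] += in[n][2 * oh + kh][2 * ow + kw][c] * w;

  // Prints the single expected nonzero: out[0][1][1][0] = 20.
  std::printf("out[0][1][1][0] = %g\n", out[0][1][1][0]);
  return 0;
}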