[mlir][sparse] fuse collapse_shape on sparse tensor with GenericOp.

Instead of always materializing a new sparse tensor after a reshape, this patch fuses the reshape (currently only tensor.collapse_shape on COO tensors) with the consuming GenericOp and co-iterates over the reshaped tensor without allocating a new sparse tensor.
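Conceptually, the fused kernel linearizes the coordinates of the original COO levels in each reassociation group into a single coordinate of the collapsed dimension while co-iterating. A minimal C++ sketch of that linearization (illustrative only; names such as collapsedCoord are hypothetical — the loop emitter below builds the equivalent arith.muli/arith.addi chain in IR):

#include <cstdint>
#include <vector>

// Fold the coordinates of the levels in one reassociation group
// (e.g. {1, 2} for the reassociation [[0], [1, 2]]) into a single
// coordinate of the collapsed dimension.
int64_t collapsedCoord(const std::vector<int64_t> &lvlCoords, // per-level coordinates
                       const std::vector<int64_t> &lvlSizes,  // per-level sizes
                       const std::vector<unsigned> &group) {  // one reassociation group
  int64_t c = 0;
  for (std::size_t i = 0; i < group.size(); ++i) {
    c += lvlCoords[group[i]];
    if (i + 1 != group.size())
      c *= lvlSizes[group[i + 1]]; // make room for the next, faster-varying level
  }
  return c;
}

For the 3-D COO tensor in the tests below, a nonzero at levels (d0, d1, d2) of the 6x2x3 operand is visited at position (d0, d1 * 3 + d2) of the collapsed 6x6 operand, without ever materializing the 6x6 COO tensor.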

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D145016
Author: Peiming Liu
Date: 2023-02-27 22:40:34 +00:00
parent c29fc69e35
commit fc126022e8
6 changed files with 250 additions and 29 deletions

View File

@@ -103,7 +103,7 @@ SparseTensorEncodingAttr getSparseTensorEncoding(Type type);
/// Returns true iff the given type is a COO type where the last level
/// is unique.
-bool isUniqueCOOType(TensorType tp);
+bool isUniqueCOOType(Type tp);
/// Returns the starting level for a trailing COO region that spans
/// at least two levels. If no such COO region is found, then returns

View File

@@ -453,7 +453,7 @@ static bool isCOOType(SparseTensorEncodingAttr enc, Level startLvl,
return !isUnique || enc.isUniqueLvl(lvlRank - 1);
}
-bool mlir::sparse_tensor::isUniqueCOOType(TensorType tp) {
+bool mlir::sparse_tensor::isUniqueCOOType(Type tp) {
return isCOOType(getSparseTensorEncoding(tp), 0, /*isUnique=*/true);
}

View File

@@ -132,31 +132,43 @@ LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
initialize(tensors, loopTag, hasOutput, isSparseOut, topSort);
}
-void LoopEmitter::initialize(ValueRange tensors, StringAttr loopTag,
-bool hasOutput, bool isSparseOut,
-ArrayRef<unsigned> topSort) {
+void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
+bool isSparseOut, ArrayRef<unsigned> topSort) {
// First initializes fields.
this->loopTag = loopTag;
this->hasOutput = hasOutput;
this->isSparseOut = isSparseOut;
-this->tensors.assign(tensors.begin(), tensors.end());
+this->tensors.assign(ts.begin(), ts.end());
this->isSparseSlices.assign(tensors.size(), false);
this->dimTypes.assign(tensors.size(), std::vector<DimLevelType>());
this->pidxs.assign(tensors.size(), std::vector<Value>());
this->coord.assign(tensors.size(), std::vector<Value>());
this->highs.assign(tensors.size(), std::vector<Value>());
+this->lvlSizes.assign(tensors.size(), std::vector<Value>());
this->ptrBuffer.assign(tensors.size(), std::vector<Value>());
this->idxBuffer.assign(tensors.size(), std::vector<Value>());
this->valBuffer.assign(tensors.size(), nullptr);
this->loopStack.reserve(topSort.size());
this->sparsiferLoopLvlMap.assign(topSort.size(), 0);
+this->collapseReassoc.assign(tensors.size(), nullptr);
for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
auto t = tensors[tid];
// A scalar or 0-dimensional tensor.
if (isZeroRankedTensorOrScalar(t.getType()))
continue;
auto rtp = getRankedTensorType(t);
+if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
+isUniqueCOOType(rtp) && reshape) {
+// TODO: Support more kinds of sparse tensors.
+// FIXME: We should instead lower reshape operations on sparse tensors to
+// view changes.
+collapseReassoc[tid] = reshape.getReassociation();
+rtp = reshape.getSrcType();
+// Overwrite the tensor with the source tensor of the reshape operation.
+tensors[tid] = t = reshape.getSrc();
+}
auto rank = static_cast<size_t>(rtp.getRank());
auto enc = getSparseTensorEncoding(rtp);
// We always treat sparse output tensor as dense so that we always iterate
@@ -172,6 +184,7 @@ void LoopEmitter::initialize(ValueRange tensors, StringAttr loopTag,
pidxs[tid].assign(rank, Value());
coord[tid].assign(rank, Value());
highs[tid].assign(rank, Value());
+lvlSizes[tid].assign(rank, Value());
ptrBuffer[tid].assign(rank, Value());
idxBuffer[tid].assign(rank, Value());
}
@@ -224,7 +237,8 @@ void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
// Find upper bound in current dimension.
// FIXME: `toOrigDim` is deprecated
const Dimension d = toOrigDim(enc, l);
-highs[t][l] = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, d);
+lvlSizes[t][l] = highs[t][l] =
+mlir::linalg::createOrFoldDimOp(builder, loc, tensor, d);
}
// Perform the required bufferization. Dense inputs materialize
@@ -325,6 +339,8 @@ Operation *LoopEmitter::enterLoopOverTensorAtDim(
}
auto enc = getSparseTensorEncoding(tensors[tid].getType());
+auto reass = getCollapseReassociation(tid, dim);
+dim = reass.front();
// TODO: support dynamic slices.
Value step = constantIndex(builder, loc, 1);
Value lo = isSparseInput ? pidxs[tid][dim] // current offset
@@ -334,6 +350,7 @@ Operation *LoopEmitter::enterLoopOverTensorAtDim(
Operation *loop = nullptr;
Value iv;
if (isParallel) {
+assert(collapseReassoc[tid] == nullptr);
scf::ParallelOp parOp =
builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(parOp.getBody());
@@ -365,10 +382,21 @@ Operation *LoopEmitter::enterLoopOverTensorAtDim(
Value c;
if (isSparseInput) {
-pidxs[tid][dim] = iv;
-// Generating a load on the indices array yields the coordinate.
-Value ptr = idxBuffer[tid][dim];
-c = genIndexLoad(builder, loc, ptr, iv);
+assert(reass.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
+c = constantIndex(builder, loc, 0);
+for (unsigned i = 0; i < reass.size(); i++) {
+auto lvl = reass[i];
+// For COO, the pidxs are always the same across consecutive levels.
+pidxs[tid][lvl] = iv;
+// Generating a load on the indices array yields the coordinate.
+Value ptr = idxBuffer[tid][lvl];
+Value off = genIndexLoad(builder, loc, ptr, iv);
+c = builder.create<arith::AddIOp>(loc, c, off);
+if (i != reass.size() - 1) {
+c = builder.create<arith::MulIOp>(loc, c,
+this->lvlSizes[tid][reass[i + 1]]);
+}
+}
} else {
// Dense tensor, the coordinate is the induction variable.
c = iv;
@@ -643,27 +671,30 @@ void LoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc,
if (isDenseDLT(dimType))
return;
-// Either the first dimension, or the previous dimension has been set.
-assert(dim == 0 || pidxs[tid][dim - 1]);
-Value c0 = constantIndex(builder, loc, 0);
-Value c1 = constantIndex(builder, loc, 1);
-if (isCompressedDLT(dimType)) {
-Value ptr = ptrBuffer[tid][dim];
-Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
-pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo);
-Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
-highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi);
-return;
-}
-if (isSingletonDLT(dimType)) {
-Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
-Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
-pidxs[tid][dim] = pLo;
-highs[tid][dim] = pHi;
-return;
-}
+auto reassoc = getCollapseReassociation(tid, dim);
+for (auto lvl : reassoc) {
+// Either the first dimension, or the previous dimension has been set.
+assert(lvl == 0 || pidxs[tid][lvl - 1]);
+Value c0 = constantIndex(builder, loc, 0);
+Value c1 = constantIndex(builder, loc, 1);
+if (isCompressedDLT(dimType)) {
+Value ptr = ptrBuffer[tid][lvl];
+Value pLo = lvl == 0 ? c0 : pidxs[tid][lvl - 1];
+pidxs[tid][lvl] = genIndexLoad(builder, loc, ptr, pLo);
+Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
+highs[tid][lvl] = genIndexLoad(builder, loc, ptr, pHi);
+return;
+}
+if (isSingletonDLT(dimType)) {
+Value pLo = lvl == 0 ? c0 : pidxs[tid][lvl - 1];
+Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
+pidxs[tid][lvl] = pLo;
+highs[tid][lvl] = pHi;
+return;
+}
+}
llvm_unreachable("Unrecognizable dimesion type!");

View File

@@ -256,6 +256,7 @@ private:
std::vector<std::vector<Value>> pidxs;
std::vector<std::vector<Value>> coord;
std::vector<std::vector<Value>> highs;
+std::vector<std::vector<Value>> lvlSizes;
std::vector<std::vector<Value>> ptrBuffer; // to_pointers
std::vector<std::vector<Value>> idxBuffer; // to_indices
std::vector<Value> valBuffer; // to_value
@@ -276,6 +277,28 @@ private:
/// general.
std::vector<unsigned> sparsiferLoopLvlMap;
+//
+// View-based reshape related fields and methods.
+//
+/// Collapse reassociations related to a specific tensor.
+// TODO: support expand.
+std::vector<ArrayAttr> collapseReassoc;
+/// Get the collapse reassociation for tensors[tid] at level l. For unreshaped
+/// operands, the reassociation is simply an identity transformation.
+SmallVector<int64_t, 2> getCollapseReassociation(unsigned tid, unsigned l) {
+// Returns a SmallVector<int64_t, 2>, just like `ReassociationIndices`.
+if (auto reass = collapseReassoc[tid]) {
+auto attr = reass[l];
+return llvm::to_vector<2>(
+llvm::map_range(attr.cast<ArrayAttr>(), [&](Attribute indexAttr) {
+return indexAttr.cast<IntegerAttr>().getInt();
+}));
+}
+return {l};
+}
+/// TODO: not yet used, it should track the current level for each tensor
+/// to help eliminate `dim` parameters from the APIs above.
+/// std::vector<size_t> curLv;
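As an illustration (not part of the patch itself): for the reassociation [[0], [1, 2]] used in the tests below, getCollapseReassociation(tid, 0) returns {0} and getCollapseReassociation(tid, 1) returns {1, 2}, so the loop emitter walks both original COO levels when it reaches the collapsed dimension; for an operand with no fused collapse_shape, collapseReassoc[tid] is null and the call simply returns {l}.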

View File

@@ -0,0 +1,59 @@
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparsification --cse --canonicalize | FileCheck %s
#COO_2D = #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ], pointerBitWidth = 32, indexBitWidth = 32 }>
#COO_3D = #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton-nu", "singleton" ], pointerBitWidth = 32, indexBitWidth = 32 }>
// CHECK-LABEL: func.func @sparse_reshape_fused(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<5x6xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<6x2x3xf32,
// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 5 : index
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[VAL_6:.*]] = tensor.empty() : tensor<5x6xf32>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 0 : index}
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index}
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index}
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 2 : index}
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]]
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_6]] : memref<5x6xf32>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_5]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xi32>
// CHECK: %[[VAL_15:.*]] = arith.extui %[[VAL_14]] : i32 to i64
// CHECK: %[[VAL_16:.*]] = arith.index_cast %[[VAL_15]] : i64 to index
// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xi32>
// CHECK: %[[VAL_18:.*]] = arith.extui %[[VAL_17]] : i32 to i64
// CHECK: %[[VAL_19:.*]] = arith.index_cast %[[VAL_18]] : i64 to index
// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_16]] to %[[VAL_19]] step %[[VAL_5]] {
// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_20]]] : memref<?xi32, strided<[?], offset: ?>>
// CHECK: %[[VAL_22:.*]] = arith.extui %[[VAL_21]] : i32 to i64
// CHECK: %[[VAL_23:.*]] = arith.index_cast %[[VAL_22]] : i64 to index
// CHECK: %[[VAL_24:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_13]], %[[VAL_23]]] : tensor<5x6xf32>
// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_20]]] : memref<?xi32, strided<[?], offset: ?>>
// CHECK: %[[VAL_26:.*]] = arith.extui %[[VAL_25]] : i32 to i64
// CHECK: %[[VAL_27:.*]] = arith.index_cast %[[VAL_26]] : i64 to index
// CHECK: %[[VAL_28:.*]] = arith.muli %[[VAL_27]], %[[VAL_3]] : index
// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_20]]] : memref<?xi32, strided<[?], offset: ?>>
// CHECK: %[[VAL_30:.*]] = arith.extui %[[VAL_29]] : i32 to i64
// CHECK: %[[VAL_31:.*]] = arith.index_cast %[[VAL_30]] : i64 to index
// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_28]], %[[VAL_31]] : index
// CHECK: %[[VAL_33:.*]] = tensor.extract %[[VAL_6]]{{\[}}%[[VAL_13]], %[[VAL_32]]] : tensor<5x6xf32>
// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_20]]] : memref<?xf32>
// CHECK: %[[VAL_35:.*]] = arith.mulf %[[VAL_24]], %[[VAL_34]] : f32
// CHECK: %[[VAL_36:.*]] = arith.addf %[[VAL_33]], %[[VAL_35]] : f32
// CHECK: memref.store %[[VAL_36]], %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_32]]] : memref<5x6xf32>
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: } {"Emitted from" = "linalg.generic"}
// CHECK: %[[VAL_37:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<5x6xf32>
// CHECK: %[[VAL_38:.*]] = tensor.expand_shape %[[VAL_37]] {{\[\[}}0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
// CHECK: %[[VAL_39:.*]] = tensor.cast %[[VAL_38]] : tensor<5x2x3xf32> to tensor<?x?x?xf32>
// CHECK: return %[[VAL_39]] : tensor<?x?x?xf32>
// CHECK: }
func.func @sparse_reshape_fused(%arg0: tensor<5x6xf32>, %arg1: tensor<6x2x3xf32, #COO_3D>) -> tensor<?x?x?xf32> {
%collapsed = tensor.collapse_shape %arg1 [[0], [1, 2]] : tensor<6x2x3xf32, #COO_3D> into tensor<6x6xf32, #COO_2D>
%0 = tensor.empty() : tensor<5x6xf32>
%2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32, #COO_2D>) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
%expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
%ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
return %ret1 : tensor<?x?x?xf32>
}

View File

@@ -0,0 +1,108 @@
// DEFINE: %{option} = enable-runtime-library=false
// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// DEFINE: mlir-cpu-runner \
// DEFINE: -e entry -entry-point-result=void \
// DEFINE: -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils | \
// DEFINE: FileCheck %s
//
// RUN: %{compile} | %{run}
//
// Do the same run, but now with the runtime library.
// REDEFINE: %{option} = "enable-runtime-library=true"
// RUN: %{compile} | %{run}
//
// Do the same run, but now with direct IR generation and vectorization.
// REDEFINE: %{option} = "enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
// RUN: %{compile} | %{run}
#COO_2D = #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ], pointerBitWidth = 32, indexBitWidth = 32 }>
#COO_3D = #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton-nu", "singleton" ], pointerBitWidth = 32, indexBitWidth = 32 }>
module {
func.func private @printMemref3dF32(%ptr : tensor<?x?x?xf32>) attributes { llvm.emit_c_interface }
func.func private @printMemref2dF32(%ptr : tensor<?x?xf32>) attributes { llvm.emit_c_interface }
func.func @test_sparse(%arg0: tensor<5x6xf32>, %arg1: tensor<6x2x3xf32, #COO_3D>) -> tensor<?x?x?xf32> {
%collapsed = tensor.collapse_shape %arg1 [[0], [1, 2]] : tensor<6x2x3xf32, #COO_3D> into tensor<6x6xf32, #COO_2D>
%0 = tensor.empty() : tensor<5x6xf32>
%cst = arith.constant 0.000000e+00 : f32
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
%2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
%expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
%ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
return %ret1 : tensor<?x?x?xf32>
}
func.func @test_dense(%arg0: tensor<5x6xf32>, %arg1: tensor<6x2x3xf32>) -> tensor<?x?x?xf32> {
%collapsed = tensor.collapse_shape %arg1 [[0], [1, 2]] : tensor<6x2x3xf32> into tensor<6x6xf32>
%0 = tensor.empty() : tensor<5x6xf32>
%cst = arith.constant 0.000000e+00 : f32
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
%2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
%expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
%ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
return %ret1 : tensor<?x?x?xf32>
}
func.func @entry() {
// Set up the two input tensors.
%d1 = arith.constant sparse<
[ [0, 0], [1, 1], [2, 2], [2, 3], [4, 5] ],
[1.0, 2.0, 3.0, 4.0, 5.0]
> : tensor<5x6xf32>
%d2 = arith.constant sparse<
[ [0, 0, 0], [1, 1, 1], [2, 1, 1] ],
[ 6.0, 7.0, 8.0]
> : tensor<6x2x3xf32>
// CHECK: Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [5, 2, 3] strides = [6, 3, 1] data =
// CHECK-NEXT:[
// CHECK-SAME: [
// CHECK-SAME: [6, 0, 0],
// CHECK-NEXT: [0, 0, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 14, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 24, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 0, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 0, 0]]]
%do1 = call @test_dense(%d1, %d2) : (tensor<5x6xf32>, tensor<6x2x3xf32>) -> tensor<?x?x?xf32>
call @printMemref3dF32(%do1) : (tensor<?x?x?xf32>) -> ()
// Same results.
// CHECK-NEXT: Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [5, 2, 3] strides = [6, 3, 1] data =
// CHECK-NEXT:[
// CHECK-SAME: [
// CHECK-SAME: [6, 0, 0],
// CHECK-NEXT: [0, 0, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 14, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 24, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 0, 0]],
// CHECK-NEXT: [
// CHECK-SAME: [0, 0, 0],
// CHECK-NEXT: [0, 0, 0]]]
%s2 = sparse_tensor.convert %d2 : tensor<6x2x3xf32> to tensor<6x2x3xf32, #COO_3D>
%so1 = call @test_sparse(%d1, %s2): (tensor<5x6xf32>, tensor<6x2x3xf32, #COO_3D>) -> tensor<?x?x?xf32>
call @printMemref3dF32(%so1) : (tensor<?x?x?xf32>) -> ()
bufferization.dealloc_tensor %s2 : tensor<6x2x3xf32, #COO_3D>
bufferization.dealloc_tensor %do1 : tensor<?x?x?xf32>
bufferization.dealloc_tensor %so1 : tensor<?x?x?xf32>
return
}
}