[mlir][sparse] Support strided convolution on compressed level.
Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D158912
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"

 using namespace mlir;
 using namespace mlir::sparse_tensor;
@@ -35,6 +36,8 @@ using namespace mlir::sparse_tensor;
 #define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
 #define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
 #define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
+#define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
+#define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
 #define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))

 //===----------------------------------------------------------------------===//
@@ -117,8 +120,8 @@ static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
                                             Level lvl) {
   // sliceCrd = (tensorCrd - offset) / stride
   crd = SUBI(crd, offset);
-  Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
-  crd = builder.create<arith::DivUIOp>(loc, crd, stride);
+  Value rem = REMUI(crd, stride);
+  crd = DIVUI(crd, stride);
   return std::make_pair(crd, rem);
 }

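Note: the comment inside fromSliceCrd captures the whole mapping: sliceCrd = (tensorCrd - offset) / stride, where the remainder reports whether the tensor coordinate lies on the stride at all. Below is a minimal scalar sketch of that arithmetic in plain C++ (standing in for the arith ops the emitter builds on mlir::Value; the offset/stride values in main are made up for illustration):

#include <cstdint>
#include <cstdio>
#include <utility>

// Scalar model of fromSliceCrd: translate a coordinate in the underlying
// tensor into a coordinate in the slice, plus the remainder that tells us
// whether the tensor coordinate actually lies on the slice's stride.
std::pair<uint64_t, uint64_t> fromSliceCrd(uint64_t crd, uint64_t offset,
                                           uint64_t stride) {
  // sliceCrd = (tensorCrd - offset) / stride
  crd -= offset;               // SUBI
  uint64_t rem = crd % stride; // REMUI
  crd /= stride;               // DIVUI
  return {crd, rem};
}

int main() {
  // A slice starting at offset 3 with stride 2: tensor coords 3, 5, 7 map to
  // slice coords 0, 1, 2; coords 4 and 6 have rem != 0 and must be skipped.
  for (uint64_t tensorCrd = 3; tensorCrd <= 7; ++tensorCrd) {
    auto [sliceCrd, rem] = fromSliceCrd(tensorCrd, /*offset=*/3, /*stride=*/2);
    std::printf("tensorCrd=%llu -> sliceCrd=%llu (on stride: %s)\n",
                (unsigned long long)tensorCrd, (unsigned long long)sliceCrd,
                rem == 0 ? "yes" : "no");
  }
  return 0;
}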
@@ -725,6 +728,7 @@ Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
   }
   case LoopCondKind::SparseAffineCond: {
     assert(ivs.size() == 1);
+
     Value crdHi; // loop upper bound
     {
       OpBuilder::InsertionGuard guard(builder);
@@ -732,9 +736,9 @@ Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc,
       // crdHi is a loop invariant, hoist the computation outside the loop.
       if (llvm::isa_and_nonnull<scf::WhileOp>(loop))
        builder.setInsertionPoint(loop);
-      auto [size, stride] = sliceMeta[tid][lvl].back();
+      auto [remSz, stride] = sliceMeta[tid][lvl].back();
       assert(stride == 1 && "Not yet implemented");
-      crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, size);
+      crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, remSz);
     }
     assert(crdHi);
     return genSparseReducedAffineCond(builder, loc,
@@ -792,18 +796,33 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
     return std::nullopt;
   }
   case LoopCondKind::SparseAffineUnRedCond: {
+    unsigned depth = sliceStack[tid].back().depth;
+    unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
     assert(ivs.size() == 3);
-    // Coord is the relative offset related to its parents.
-    // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
-    assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implement");
+
     // Updates the current slice info
     SliceInfo &sliceInfo = sliceStack[tid].back();
     sliceInfo.isNonEmpty = ivs[0];
     sliceInfo.minCrd = ivs[1];
     sliceInfo.offset = ivs[2];
-    coords[tid][lvl] = sliceInfo.offset;
+
+    // Crd (the value we used to coiterate) is the relative offset related to
+    // its parents; we can use the absolute offset here because when depth = 1,
+    // absOffset[lvl][depth - 1] always equals zero.
+    // TODO: Update crd = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
+    assert(depth == 1 && "TODO: not yet implement");
+    Value crd = sliceInfo.offset;
+
+    Value onStride = constantI1(builder, loc, true);
+    if (curStride != 1) {
+      Value strideVal = C_IDX(curStride);
+      Value rem = REMUI(crd, strideVal);
+      crd = DIVUI(crd, strideVal);
+      onStride = CMPI(eq, rem, C_IDX(0));
+    }
+    coords[tid][lvl] = crd;
     // No extra check is needed before accessing the tensor level.
-    return std::nullopt;
+    return onStride;
   }
   default:
     llvm_unreachable("Unhandled LoopCondKind");
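In scalar terms, the rewritten SparseAffineUnRedCond body unpacks the three while-loop induction values back into the slice info, then derives both the coordinate handed to the loop body and the on-stride predicate. A hedged model of that step, with an assumed Slice struct standing in for SliceInfo and plain integers standing in for Values:

#include <cstdint>
#include <tuple>

// Assumed stand-in for the SliceInfo fields updated from ivs[0..2].
struct Slice {
  bool isNonEmpty;
  uint64_t minCrd;
  uint64_t offset;
};

// Scalar model of the SparseAffineUnRedCond case: store the induction values,
// then divide the absolute offset by the stride; the remainder decides
// whether this iteration lands on the stride at all.
std::tuple<uint64_t, bool> updateAndFilter(Slice &s, bool iv0, uint64_t iv1,
                                           uint64_t iv2, unsigned stride) {
  s.isNonEmpty = iv0;
  s.minCrd = iv1;
  s.offset = iv2;

  uint64_t crd = s.offset; // absolute == relative offset while depth == 1
  bool onStride = true;
  if (stride != 1) {
    onStride = (crd % stride == 0); // REMUI + CMPI(eq)
    crd /= stride;                  // DIVUI
  }
  return std::make_tuple(crd, onStride);
}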
@@ -814,11 +833,44 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
 ValueRange LoopEmitter::genCheckedValue(OpBuilder &builder, Location loc,
                                         Value pred, ValueRange curArgs,
                                         TensorLvlCond cond) {
-  // Currently only sparse slice condition need extra check.
-  assert(isSliceCond(cond.second) && isSparseCond(cond.second));
-  assert(curArgs.size() == 1);
-  Value nextPos = ADDI(curArgs.front(), C_IDX(1));
-  return SELECT(pred, curArgs.front(), nextPos)->getResults();
+  assert(isSparseCond(cond.second));
+  auto [tid, lvl] = unpackTensorLevel(cond.first);
+  if (isAffineIdxUnRedCond(cond.second)) {
+    unsigned depth = sliceStack[tid].back().depth;
+    unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
+    if (curStride == 1)
+      return curArgs;
+    // Build
+    // if (onStride) {
+    //    yield curSlice
+    // } else {
+    //    yield nxSlice.
+    // }
+    assert(curArgs.size() == 3);
+    auto ifOp = builder.create<scf::IfOp>(loc, curArgs.getTypes(), pred, true);
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      // If all slices are legit, yield the current value.
+      builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      YIELD(curArgs);
+
+      // If not all slices are legit, yield the updated value.
+      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+      auto [nonEmpty, minCrd, offset] =
+          genSliceNextInduction(builder, loc, tid, lvl);
+      SmallVector<Value> nxSlice{nonEmpty, minCrd, offset};
+      YIELD(nxSlice);
+    }
+    // If all slices are legit, start the user generated code.
+    return ifOp.getResults();
+  } else {
+    // Currently only the sparse slice condition needs an extra check.
+    assert(isSliceCond(cond.second) && isSparseCond(cond.second));
+    assert(curArgs.size() == 1);
+    Value nextPos = ADDI(curArgs.front(), C_IDX(1));
+    return SELECT(pred, curArgs.front(), nextPos)->getResults();
+  }
+  llvm_unreachable("unhandled case");
 }

 std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
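genCheckedValue now has two shapes: the slice case keeps its single-position select, while the unreduced affine case builds an scf.if that either keeps the current (isNonEmpty, minCrd, offset) triple or advances to the next slice. A scalar sketch of that branch, assuming a made-up SliceTriple struct and a placeholder nextInduction in place of genSliceNextInduction:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the triple the emitted scf.if yields.
struct SliceTriple {
  bool isNonEmpty;
  uint64_t minCrd;
  uint64_t offset;
};

// Placeholder for genSliceNextInduction; the real advance logic is elided.
SliceTriple nextInduction(const SliceTriple &cur) {
  return {cur.isNonEmpty, cur.minCrd + 1, cur.offset + 1};
}

// Scalar model of the generated code: keep the current slice when the
// coordinate is on the stride, otherwise yield the advanced slice.
SliceTriple checkedValue(bool onStride, const SliceTriple &cur) {
  if (onStride)
    return cur;              // then-region: YIELD(curArgs)
  return nextInduction(cur); // else-region: YIELD(nxSlice)
}

int main() {
  SliceTriple cur{true, 4, 8};
  std::printf("kept offset=%llu, advanced offset=%llu\n",
              (unsigned long long)checkedValue(true, cur).offset,
              (unsigned long long)checkedValue(false, cur).offset);
  return 0;
}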
@@ -1877,9 +1929,6 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
                                           TensorId tid, Level lvl) {
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
   unsigned depth = levelReducedDep[tid][lvl];
-  // TODO: handle case when the current slice stride is not one.
-  assert(sliceMeta[tid][lvl][depth].second == 1 && "Not yet implemented");
-
   // The remaining slice size after reduction.
   Value remSz = sliceMeta[tid][lvl][depth + 1].first;
   // Dense slice begin is trivial
@@ -2251,8 +2300,6 @@ LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,

   // FIXME: compute relative offset.
   assert(info.depth - 1 == 0);
-  Value nextRelOffset = nextAbsOffset;
-  nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);
   return std::make_tuple(nextNonEmpty, nextMinCrd, nextAbsOffset);
 }

@@ -393,7 +393,7 @@ private:
   }
   static bool isTrivalIdxCond(LoopCondKind k) { return !isAffineIdxCond(k); }

-  /// Whether the affine index expression is not fully reduced.
+  /// Whether the affine index expression is fully reduced.
   static bool isAffineIdxUnRedCond(LoopCondKind k) {
     return isAffineIdxCond(k) && static_cast<uint8_t>(k) & kAffineIdxCondUnRed;
   }
@@ -405,7 +405,7 @@ private:
   // E.g., to iterate over sparse tensor slice, we need to check whether the
   // current coordinate is on the slice (e.g., due to stride) or not.
   static bool isCondWithExtraCheck(LoopCondKind k) {
-    return isSparseCond(k) && isSliceCond(k);
+    return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
   }

   static LoopCondKind makeLoopCondKind(bool isSparse, bool isSlice,
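Both header hunks lean on LoopCondKind packing orthogonal properties into bits, so predicates such as isCondWithExtraCheck reduce to mask tests. A sketch of that pattern with invented flag values (the real kSparseCond/kSliceCond/kAffineIdxCond/kAffineIdxCondUnRed constants are defined elsewhere in LoopEmitter.h and are not part of this diff):

#include <cstdint>

enum class LoopCondKind : uint8_t {};
// Invented bit assignments, for illustration only.
constexpr uint8_t kSparseCond = 1 << 0;
constexpr uint8_t kSliceCond = 1 << 1;
constexpr uint8_t kAffineIdxCond = 1 << 2;
constexpr uint8_t kAffineIdxCondUnRed = 1 << 3;

constexpr bool isSparseCond(LoopCondKind k) {
  return static_cast<uint8_t>(k) & kSparseCond;
}
constexpr bool isSliceCond(LoopCondKind k) {
  return static_cast<uint8_t>(k) & kSliceCond;
}
constexpr bool isAffineIdxCond(LoopCondKind k) {
  return static_cast<uint8_t>(k) & kAffineIdxCond;
}
constexpr bool isAffineIdxUnRedCond(LoopCondKind k) {
  return isAffineIdxCond(k) && (static_cast<uint8_t>(k) & kAffineIdxCondUnRed);
}

// After this commit, sparse unreduced affine conditions also require the
// extra on-stride check, not just slice conditions.
constexpr bool isCondWithExtraCheck(LoopCondKind k) {
  return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
}

static_assert(isCondWithExtraCheck(static_cast<LoopCondKind>(
                  kSparseCond | kAffineIdxCond | kAffineIdxCondUnRed)),
              "sparse unreduced affine conditions take the extra check");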
@@ -0,0 +1,102 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+// do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
+// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+#CCCC = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ]
+}>
+
+// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
+func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
+  %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<2> : tensor<2xi64>}
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<2> : tensor<2xi64>}
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
+    outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @entry() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c6 = arith.constant 6 : index
+  %c9 = arith.constant 9 : index
+  %f10 = arith.constant 10.00000e+00 : f32
+  %val = arith.constant 2.00000e+00 : f32
+  %zero = arith.constant 0.00000e+00 : f32
+
+  %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c9, %c9, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c3, %c3, %c0] : tensor<?x?x?x?xf32>
+  %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+
+  %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
+    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
+
+  %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+  // CHECK: ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+  vector.print %dense_v : vector<3x3x3x1xf32>
+
+  // CHECK: ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME: ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+  %v1 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+  vector.print %v1 : vector<3x3x3x1xf32>
+
+  // Free the resources
+  bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
+  bufferization.dealloc_tensor %filter2D_nhwc : tensor<?x?x?x?xf32>
+  bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>
+
+  bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
+  return
+}
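As a sanity check on the FileCheck expectations: the input is zero except for a single 10.0 at [0, 3, 3, 0], the 3x3x3x1 filter is filled with 2.0, and the convolution strides by 2, so the only output window whose footprint covers input position (3, 3) starts at (2, 2), i.e. output position [0, 1, 1, 0], giving 10 * 2 = 20 in both the dense and the sparse run. A scalar re-computation under the sizes used by the test:

#include <cstdio>

int main() {
  // Input N=3, H=9, W=9, C=3, all zero except in[0][3][3][0] = 10.0.
  static float in[3][9][9][3] = {};
  in[0][3][3][0] = 10.0f;
  const float w = 2.0f; // every filter tap is 2.0

  // The test allocates (and reads back) a 3x3x3x1 output.
  float out[3][3][3][1] = {};
  for (int n = 0; n < 3; ++n)
    for (int oh = 0; oh < 3; ++oh)
      for (int ow = 0; ow < 3; ++ow)
        for (int kh = 0; kh < 3; ++kh)
          for (int kw = 0; kw < 3; ++kw)
            for (int c = 0; c < 3; ++c) // stride 2 on both spatial dims
              out[n][oh][ow][0] += in[n][2 * oh + kh][2 * ow + kw][c] * w;

  // Prints the single expected nonzero: out[0][1][1][0] = 20.
  std::printf("out[0][1][1][0] = %g\n", out[0][1][1][0]);
  return 0;
}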