[mlir] Add linalg.batch_mmt4d named op
This op is the batched version of linalg.mmt4d. It performs a matrix-matrix-transpose multiplication of batched-4D (5D) inputs, computing:
```
C[b, m1, n1, m0, n0] = sum_{k1, k0}(A[b, m1, k1, m0, k0] * B[b, n1, k1, n0, k0])
```
The current use is to provide `linalg.batch_matmul` with a lowering path analogous to the existing `linalg.matmul -> linalg.mmt4d` path.
Differential Revision: https://reviews.llvm.org/D156912
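For intuition, here is a minimal NumPy sketch of the index formula above. It is not part of the commit; the helper name `batch_mmt4d_ref` and the concrete tile sizes are made up for illustration.

```python
import numpy as np

def batch_mmt4d_ref(lhs, rhs, accum):
    # lhs:   [b, m1, k1, m0, k0]
    # rhs:   [b, n1, k1, n0, k0]
    # accum: [b, m1, n1, m0, n0]
    # Reduce over k1 and k0 only; the batch dimension b stays parallel.
    return accum + np.einsum("bmkxz,bnkyz->bmnxy", lhs, rhs)

# Made-up sizes: b=2, m1=3, n1=5, k1=4, and inner tiles m0=8, n0=4, k0=1.
b, m1, n1, k1, m0, n0, k0 = 2, 3, 5, 4, 8, 4, 1
lhs = np.random.rand(b, m1, k1, m0, k0).astype(np.float32)
rhs = np.random.rand(b, n1, k1, n0, k0).astype(np.float32)
acc = np.zeros((b, m1, n1, m0, n0), dtype=np.float32)
out = batch_mmt4d_ref(lhs, rhs, acc)
assert out.shape == (b, m1, n1, m0, n0)
```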
In the generated YAML op definitions, a new `batch_mmt4d` entry is added between the existing `mmt4d` and `batch_matmul` configs:

```diff
@@ -1137,6 +1137,81 @@ structured_op: !LinalgStructuredOpConfig
                 - !ScalarExpression
                   scalar_arg: rhs
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: batch_mmt4d
+  cpp_class_name: BatchMmt4DOp
+  doc: "Performs a batched matrix-matrix-transpose multiplication of two\nbatched-4D\
+    \ (5D) inputs.\n\nBesides the outermost batch dimension has the same semantic\
+    \ as\nlinalg.batch_matmul, the differences from linalg.batch_matmul in the\nnon-batch\
+    \ dimensions are the same as linalg.mmt4d vs. linalg.matmul. See the\ndescription\
+    \ of linalg.mmt4d."
+  implements:
+  - LinalgContractionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: lhs
+    kind: input_tensor
+    type_var: LhsType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s1, s2, s3, s4)>
+  - !LinalgOperandDefConfig
+    name: rhs
+    kind: input_tensor
+    type_var: RhsType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s5, s2, s6, s4)>
+  - !LinalgOperandDefConfig
+    name: accum
+    kind: output_tensor
+    type_var: AccumType
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0, s1, s5, s3, s6)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d1, d3, d4, d6)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d2, d3, d5, d6)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6] -> (d0,
+      d1, d2, d4, d5)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: accum
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: accum
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: lhs
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: AccumType
+                operands:
+                - !ScalarExpression
+                  scalar_arg: rhs
 --- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: batch_matmul
   cpp_class_name: BatchMatmulOp
```
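The `assignments` tree above spells out `accum = add(accum, mul(cast_signed(lhs), cast_signed(rhs)))`, i.e. both operands are promoted to `AccumType` before the multiply-accumulate. A small NumPy sketch of that promotion, assuming hypothetical i8 inputs accumulating into i32 (the dtypes and tile sizes are illustrative, not mandated by the op):

```python
import numpy as np

# Illustrative only: mimic cast_signed(AccumType, lhs) * cast_signed(AccumType, rhs)
# accumulated into an i32 output, as the YAML assignment tree describes.
lhs = np.random.randint(-128, 127, size=(2, 3, 4, 8, 1), dtype=np.int8)  # [b, m1, k1, m0, k0]
rhs = np.random.randint(-128, 127, size=(2, 5, 4, 4, 1), dtype=np.int8)  # [b, n1, k1, n0, k0]
acc = np.zeros((2, 3, 5, 8, 4), dtype=np.int32)                          # [b, m1, n1, m0, n0]

# Sign-extend to the accumulator type first, then multiply and reduce over k1/k0.
acc += np.einsum("bmkxz,bnkyz->bmnxy",
                 lhs.astype(np.int32), rhs.astype(np.int32))
```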
The OpDSL Python definitions gain the matching `batch_mmt4d` op, placed between `mmt4d` and `batch_matmul`:

```diff
@@ -350,6 +350,27 @@ def mmt4d(
     ) * TypeFn.cast_signed(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])
 
 
+@linalg_structured_op
+def batch_mmt4d(
+    lhs=TensorDef(TV.LhsType, Batch, S.M, S.K, S.M0, S.K0),
+    rhs=TensorDef(TV.RhsType, Batch, S.N, S.K, S.N0, S.K0),
+    accum=TensorDef(TV.AccumType, Batch, S.M, S.N, S.M0, S.N0, output=True),
+):
+    """Performs a batched matrix-matrix-transpose multiplication of two
+    batched-4D (5D) inputs.
+
+    Besides the outermost batch dimension has the same semantic as
+    linalg.batch_matmul, the differences from linalg.batch_matmul in the
+    non-batch dimensions are the same as linalg.mmt4d vs. linalg.matmul. See the
+    description of linalg.mmt4d.
+    """
+    domain(D.b, D.m, D.n, D.k, D.m0, D.n0, D.k0)
+    implements(ContractionOpInterface)
+    accum[D.b, D.m, D.n, D.m0, D.n0] += TypeFn.cast_signed(
+        TV.AccumType, lhs[D.b, D.m, D.k, D.m0, D.k0]
+    ) * TypeFn.cast_signed(TV.AccumType, rhs[D.b, D.n, D.k, D.n0, D.k0])
+
+
 @linalg_structured_op
 def batch_matmul(
     A=TensorDef(T1, Batch, S.M, S.K),
```
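To see how this gives `linalg.batch_matmul` a lowering path analogous to `linalg.matmul -> linalg.mmt4d`, here is a rough NumPy sketch of the packing the op implies. It assumes M, N and K divide evenly by the inner tile sizes; the tile sizes are made up and this is not the actual lowering pass.

```python
import numpy as np

# Plain batched matmul: C[b, m, n] = sum_k A[b, m, k] * B[b, k, n].
bs, M, N, K = 2, 16, 12, 8
m0, n0, k0 = 8, 4, 1
A = np.random.rand(bs, M, K).astype(np.float32)
B = np.random.rand(bs, K, N).astype(np.float32)
C_ref = A @ B

# Pack A into [b, m1, k1, m0, k0]; pack (and transpose) B into [b, n1, k1, n0, k0].
A5 = A.reshape(bs, M // m0, m0, K // k0, k0).transpose(0, 1, 3, 2, 4)
B5 = B.transpose(0, 2, 1).reshape(bs, N // n0, n0, K // k0, k0).transpose(0, 1, 3, 2, 4)

# batch_mmt4d-style contraction on the packed tiles, then unpack back to [b, m, n].
C5 = np.einsum("bmkxz,bnkyz->bmnxy", A5, B5)
C = C5.transpose(0, 1, 3, 2, 4).reshape(bs, M, N)

assert np.allclose(C, C_ref)
```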
Finally, a FileCheck round-trip test exercises the new op:

```diff
@@ -1187,6 +1187,17 @@ func.func @batchmatmul_transpose_b(%arg0: memref<2x3x5xf32>, %arg1: memref<2x7x5
 
 // -----
 
+// CHECK-LABEL: func @batch_mmt4d
+func.func @batch_mmt4d(%arg0: tensor<128x10x32x8x1xf32>, %arg1: tensor<128x80x32x4x1xf32>, %arg2: tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> {
+  // CHECK: %{{.+}} = linalg.batch_mmt4d
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>)
+  // CHECK-SAME: outs(%{{.+}} : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+  %0 = linalg.batch_mmt4d ins(%arg0, %arg1 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%arg2 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+  return %0: tensor<128x10x80x8x4xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @add_dynamic
 func.func @add_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
   // CHECK: linalg.add
```
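Decoding the shapes used in the test against the operand orderings above (an illustrative check, not part of the commit):

```python
lhs_shape = (128, 10, 32, 8, 1)   # [b, m1, k1, m0, k0]
rhs_shape = (128, 80, 32, 4, 1)   # [b, n1, k1, n0, k0]
acc_shape = (128, 10, 80, 8, 4)   # [b, m1, n1, m0, n0]

b, m1, k1, m0, k0 = lhs_shape
_, n1, _, n0, _ = rhs_shape
# The output keeps the batch and the M/N tile dimensions and drops the reduced K tiles.
assert acc_shape == (b, m1, n1, m0, n0)
```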