[MLIR][Vectorize] Refactor Vectorize use-def propagation.
This CL refactors a few things in Vectorize.cpp:
1. a clear distinction is made between:
   a. the LoadOps, which are the roots of vectorization and must be vectorized
      eagerly, propagating their values; and
   b. the StoreOps, which are the terminals of vectorization and must be
      vectorized late (i.e. they do not produce values that need to be
      propagated).
2. the StoreOps must be vectorized late because, in general, they can store a
   value that is not reachable from the subset of loads defined in the current
   pattern. One trivial such case is storing a constant defined at the top level
   of the MLFunction, which then needs to be turned into a splat (a sketch of
   this ordering follows the list);
3. a description of the algorithm is given;
4. the implementation matches the algorithm;
5. the last example is made parametric; in practice it will fully rely on the
   implementation of vector_transfer_read/write, which will handle boundary
   conditions and padding (a boundary-handling sketch follows the vectorized
   output below). This will happen by lowering to a lower-level abstraction,
   either:
   a. directly in MLIR (whether DMA or just loops or any async tasks in the
      future) (whiteboxing);
   b. in LLO/LLVM-IR/whatever blackbox library call / search + swizzle inventor
      one may want to use;
   c. a partial mix of a. and b. (grey-boxing);
6. minor cleanups are applied;
7. mistakenly disabled unit tests are re-enabled (oopsie).
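For intuition, the ordering described in points 1 and 2 can be sketched as follows. This is a minimal, purely illustrative C++ sketch: the Op/VectorizedMap types and the vectorizeLoad/vectorizeCompute names are hypothetical and do not mirror the actual Vectorize.cpp implementation, and the toy "vectorization" just builds strings so the example stays self-contained and runnable.
```
// Hypothetical sketch of the use-def propagation order; names and the toy
// string-based "vectorization" are illustrative only, not the Vectorize.cpp API.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Op {
  std::string name;
  std::vector<Op *> operands;
};
using VectorizedMap = std::map<Op *, std::string>;

// Roots: each scalar load becomes a (hypothetical) vector transfer read.
std::string vectorizeLoad(const Op &load) {
  return "vector_transfer_read(" + load.name + ")";
}

// Interior op: operands already vectorized are propagated; operands defined
// outside the pattern (e.g. a constant at the MLFunction top level) are
// splatted on demand.
std::string vectorizeCompute(const Op &op, const VectorizedMap &done) {
  std::string result = op.name + "(";
  for (Op *operand : op.operands) {
    auto it = done.find(operand);
    result += (it != done.end() ? it->second
                                : "splat(" + operand->name + ")") + ", ";
  }
  return result + ")";
}

int main() {
  Op f1{"f1", {}};               // constant defined at the top level
  Op a{"loadA", {}}, b{"loadB", {}};
  Op add{"addf", {&a, &b}};
  Op add2{"addf", {&add, &f1}};  // uses the non-scoped constant
  Op store{"store", {&add2}};

  VectorizedMap done;
  // 1. Roots (loads) are vectorized eagerly so their values can be propagated.
  for (Op *load : {&a, &b})
    done[load] = vectorizeLoad(*load);
  // 2. Interior computations are vectorized along use-def chains.
  for (Op *op : {&add, &add2})
    done[op] = vectorizeCompute(*op, done);
  // 3. Terminals (stores) are vectorized last, once every value they may
  //    consume (including splatted constants) has a vector counterpart.
  std::printf("vector_transfer_write(%s)\n", vectorizeCompute(store, done).c_str());
  return 0;
}
```
The point is only the ordering: roots (loads) first so their vector values can propagate, interior ops next along use-def chains with on-demand splats for operands not reachable from the pattern's loads, and terminals (stores) last.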
With this CL, this MLIR snippet:
```
mlfunc @vector_add_2d(%M : index, %N : index) -> memref<?x?xf32> {
  %A = alloc (%M, %N) : memref<?x?xf32>
  %B = alloc (%M, %N) : memref<?x?xf32>
  %C = alloc (%M, %N) : memref<?x?xf32>
  %f1 = constant 1.0 : f32
  %f2 = constant 2.0 : f32
  for %i0 = 0 to %M {
    for %i1 = 0 to %N {
      // non-scoped %f1
      store %f1, %A[%i0, %i1] : memref<?x?xf32>
    }
  }
  for %i4 = 0 to %M {
    for %i5 = 0 to %N {
      %a5 = load %A[%i4, %i5] : memref<?x?xf32>
      %b5 = load %B[%i4, %i5] : memref<?x?xf32>
      %s5 = addf %a5, %b5 : f32
      // non-scoped %f1
      %s6 = addf %s5, %f1 : f32
      store %s6, %C[%i4, %i5] : memref<?x?xf32>
    }
  }
  return %C : memref<?x?xf32>
}
```
vectorized with these arguments:
```
-vectorize -virtual-vector-size 256 --test-fastest-varying=0
```
vectorization produces this standard innermost-loop vectorized code:
```
mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> memref<?x?xf32> {
  %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %cst = constant 1.000000e+00 : f32
  %cst_0 = constant 2.000000e+00 : f32
  for %i0 = 0 to %arg0 {
    for %i1 = 0 to %arg1 step 256 {
      %cst_1 = constant splat<vector<256xf32>, 1.000000e+00> : vector<256xf32>
      "vector_transfer_write"(%cst_1, %0, %i0, %i1) : (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
    }
  }
  for %i2 = 0 to %arg0 {
    for %i3 = 0 to %arg1 step 256 {
      %3 = "vector_transfer_read"(%0, %i2, %i3) : (memref<?x?xf32>, index, index) -> vector<256xf32>
      %4 = "vector_transfer_read"(%1, %i2, %i3) : (memref<?x?xf32>, index, index) -> vector<256xf32>
      %5 = addf %3, %4 : vector<256xf32>
      %cst_2 = constant splat<vector<256xf32>, 1.000000e+00> : vector<256xf32>
      %6 = addf %5, %cst_2 : vector<256xf32>
      "vector_transfer_write"(%6, %2, %i2, %i3) : (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
    }
  }
  return %2 : memref<?x?xf32>
}
```
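One caveat about the step-256 loops above: when %arg1 is not a multiple of 256, the last iteration would touch elements past %N. That is exactly the boundary/padding problem that point 5 delegates to the eventual vector_transfer_read/write lowering. Below is a minimal scalar sketch of what a padded read could do at such a loop tail; it assumes a read that fills out-of-bounds elements with a caller-provided padding value, and kVectorSize/paddedRead are hypothetical names, not semantics defined by this CL.
```
// Hypothetical scalar model of a 1-D padded "transfer read" at a loop tail.
#include <algorithm>
#include <array>
#include <cstddef>
#include <vector>

constexpr std::size_t kVectorSize = 256;

// Reads kVectorSize elements starting at `offset`; elements past the end of
// `memref` are filled with `padding`, so the last, partial iteration of a
// step-256 loop stays well defined.
std::array<float, kVectorSize> paddedRead(const std::vector<float> &memref,
                                          std::size_t offset, float padding) {
  std::array<float, kVectorSize> result;
  result.fill(padding);
  if (offset < memref.size()) {
    std::size_t valid = std::min(kVectorSize, memref.size() - offset);
    std::copy_n(memref.begin() + offset, valid, result.begin());
  }
  return result;
}

int main() {
  std::vector<float> B(1000, 1.0f);  // %N = 1000 is not a multiple of 256
  // The last tile starts at 768; only 232 elements are in bounds and the
  // remaining 24 come back as the padding value.
  auto tile = paddedRead(B, 768, 0.0f);
  return (tile[231] == 1.0f && tile[232] == 0.0f) ? 0 : 1;
}
```
The write side would need the dual treatment (a partial or masked store), and whether this is expanded into loops, DMAs, or a library call is the whiteboxing / blackbox / grey-boxing choice listed above.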
Of course, much more intricate n-D imperfectly-nested patterns can be emitted too in a fully declarative fashion, but this is enough for now.
PiperOrigin-RevId: 222280209
Committed by: jpienaar
Parent: 19573e2939
Commit: 87d46aaf4b
@@ -31,6 +31,7 @@ class AffineExpr;
 class ForStmt;
 class MemRefType;
 class MLValue;
+class OperationStmt;
 
 /// Returns the trip count of the loop as an affine expression if the latter is
 /// expressible as an affine expression, and nullptr otherwise. The trip count
File diff suppressed because it is too large
@@ -1,9 +1,9 @@
 // RUN: mlir-opt %s -vectorize -virtual-vector-size 128 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC1D
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=1 | FileCheck %s -check-prefix=VEC2D_T
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D_O
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=2 | FileCheck %s -check-prefix=VEC2D_OT
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 64 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC3D
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=1 | FileCheck %s -check-prefix=VEC2D_T
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D_O
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=2 | FileCheck %s -check-prefix=VEC2D_OT
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 64 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC3D
 
 #map0 = (d0) -> (d0)
 #map1 = (d0, d1) -> (d0, d1)
@@ -269,48 +269,70 @@ mlfunc @vec3d(%A : memref<?x?x?xf32>) {
   return
 }
 
-mlfunc @vector_add_2d() -> f32 {
-  %A = alloc () : memref<32x1024xf32, 0>
-  %B = alloc () : memref<32x1024xf32, 0>
-  %C = alloc () : memref<32x1024xf32, 0>
+mlfunc @vector_add_2d(%M : index, %N : index) -> f32 {
+  %A = alloc (%M, %N) : memref<?x?xf32, 0>
+  %B = alloc (%M, %N) : memref<?x?xf32, 0>
+  %C = alloc (%M, %N) : memref<?x?xf32, 0>
   %f1 = constant 1.0 : f32
   %f2 = constant 2.0 : f32
-  for %i0 = 0 to 32 {
-    for %i1 = 0 to 1024 {
+  for %i0 = 0 to %M {
+    for %i1 = 0 to %N {
       // VEC1D: [[C1:%.*]] = constant splat<vector<128xf32>, 1.000000e+00> : vector<128xf32>
-      // VEC1D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<128xf32>, memref<32x1024xf32>, index, index) -> ()
+      // VEC1D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
       // VEC2D: [[C1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
-      // VEC2D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<32x256xf32>, memref<32x1024xf32>, index, index) -> ()
-      store %f1, %A[%i0, %i1] : memref<32x1024xf32, 0>
+      // VEC2D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
+      // non-scoped %f1
+      store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
     }
   }
-  for %i2 = 0 to 32 {
-    for %i3 = 0 to 1024 {
+  for %i2 = 0 to %M {
+    for %i3 = 0 to %N {
       // VEC1D: [[C3:%.*]] = constant splat<vector<128xf32>, 2.000000e+00> : vector<128xf32>
-      // VEC1D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<128xf32>, memref<32x1024xf32>, index, index) -> ()
+      // VEC1D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
       // VEC2D: [[C3:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
-      // VEC2D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<32x256xf32>, memref<32x1024xf32>, index, index) -> ()
-      store %f2, %B[%i2, %i3] : memref<32x1024xf32, 0>
+      // VEC2D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
+      // non-scoped %f2
+      store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
     }
  }
-  for %i4 = 0 to 32 {
-    for %i5 = 0 to 1024 {
-      // VEC1D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<128xf32>
-      // VEC1D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<128xf32>
+  for %i4 = 0 to %M {
+    for %i5 = 0 to %N {
+      //
+      // VEC1D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<128xf32>
+      // VEC1D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<128xf32>
       // VEC1D: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<128xf32>
-      // VEC1D: "vector_transfer_write"([[S5]], {{.*}}) : (vector<128xf32>, memref<32x1024xf32>, index, index) -> ()
-      // VEC2D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<32x256xf32>
-      // VEC2D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<32x256xf32>
+      // VEC1D: [[SPLAT1:%.*]] = constant splat<vector<128xf32>, 1.000000e+00> : vector<128xf32>
+      // VEC1D: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<128xf32>
+      // VEC1D: [[SPLAT2:%.*]] = constant splat<vector<128xf32>, 2.000000e+00> : vector<128xf32>
+      // VEC1D: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<128xf32>
+      // VEC1D: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<128xf32>
+      // VEC1D: "vector_transfer_write"([[S8]], {{.*}}) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
+      //
+      // VEC2D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
+      // VEC2D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
       // VEC2D: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32>
-      // VEC2D: "vector_transfer_write"([[S5]], {{.*}}) : (vector<32x256xf32>, memref<32x1024xf32>, index, index) -> ()
-      %a5 = load %A[%i4, %i5] : memref<32x1024xf32, 0>
-      %b5 = load %B[%i4, %i5] : memref<32x1024xf32, 0>
+      // VEC2D: [[SPLAT1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
+      // VEC2D: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32>
+      // VEC2D: [[SPLAT2:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
+      // VEC2D: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<32x256xf32>
+      // VEC2D: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<32x256xf32>
+      // VEC2D: "vector_transfer_write"([[S8]], {{.*}}) : (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
+      //
+      %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
+      %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
       %s5 = addf %a5, %b5 : f32
-      store %s5, %C[%i4, %i5] : memref<32x1024xf32, 0>
+      // non-scoped %f1
+      %s6 = addf %s5, %f1 : f32
+      // non-scoped %f2
+      %s7 = addf %s5, %f2 : f32
+      // diamond dependency.
+      %s8 = addf %s7, %s6 : f32
+      store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
     }
   }
   %c7 = constant 7 : index
   %c42 = constant 42 : index
-  %res = load %C[%c7, %c42] : memref<32x1024xf32, 0>
+  %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
   return %res : f32
 }