[MLIR][Vectorize] Refactor Vectorize use-def propagation.

This CL refactors a few things in Vectorize.cpp:
1. a clear distinction is made between:
  a. the LoadOp, which is the root of vectorization and must be vectorized
  eagerly, propagating its value; and
  b. the StoreOp, which is the terminal of vectorization and must be
  vectorized late (i.e. it does not produce a value that needs to be
  propagated).
2. the StoreOp must be vectorized late because, in general, it can store a
value that is not reachable from the subset of loads defined in the
current pattern. One trivial such case is storing a constant defined at the
top level of the MLFunction, which then needs to be turned into a splat (a
minimal sketch of this propagation is given right after this list).
3. a description of the algorithm is given;
4. the implementation matches the algorithm;
5. the last example is made parametric; in practice it will fully rely on the
implementation of vector_transfer_read/write, which will handle boundary
conditions and padding. This will happen by lowering to a lower-level
abstraction either:
  a. directly in MLIR (whether DMA or just loops or any async tasks in the
     future) (whiteboxing);
  b. in LLO/LLVM-IR/whatever blackbox library call / search + swizzle inventor
  one may want to use;
  c. a partial mix of a. and b. (grey-boxing);
6. minor cleanups are applied;
7. mistakenly disabled unit tests are re-enabled (oopsie).
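
The eager-load / late-store ordering described in items 1 and 2 can be
summarized with a minimal, self-contained C++ sketch. The names used here
(VectorizationState, vectorizeLoad, vectorizeAddf, vectorizeStore) are
hypothetical and are not the actual Vectorize.cpp API; the sketch only
illustrates the propagation order and how a value that is not reachable from
the pattern's loads (the "non-scoped" %f1) is materialized as a splat when
its store is finally vectorized.
```
#include <iostream>
#include <map>
#include <string>

// Map from a scalar SSA name to its vectorized replacement, filled eagerly
// starting from the loads (the roots of vectorization).
using VectorizationState = std::map<std::string, std::string>;

// Roots: loads are vectorized eagerly and propagate their vector value.
void vectorizeLoad(const std::string &load, VectorizationState &state) {
  state[load] = "vector_transfer_read(" + load + ")";
}

// Interior ops forward the propagation: their operands have already been
// vectorized by the time they are visited.
void vectorizeAddf(const std::string &result, const std::string &lhs,
                   const std::string &rhs, VectorizationState &state) {
  state[result] = "addf(" + state.at(lhs) + ", " + state.at(rhs) + ")";
}

// Terminals: stores are vectorized *late*. The stored value may not be
// reachable from the pattern's loads (e.g. a constant defined at the top
// level of the MLFunction); in that case a splat is materialized on demand.
void vectorizeStore(const std::string &stored, VectorizationState &state) {
  auto it = state.find(stored);
  std::string vec = it != state.end() ? it->second : "splat(" + stored + ")";
  std::cout << "vector_transfer_write(" << vec << ")\n";
}

int main() {
  VectorizationState state;
  vectorizeLoad("%a5", state);               // root, vectorized eagerly
  vectorizeLoad("%b5", state);               // root, vectorized eagerly
  vectorizeAddf("%s5", "%a5", "%b5", state); // propagated along use-def
  vectorizeStore("%s5", state);              // terminal, vectorized late
  vectorizeStore("%f1", state);              // non-scoped constant -> splat
  return 0;
}
```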

With this CL, this MLIR snippet:
```
mlfunc @vector_add_2d(%M : index, %N : index) -> memref<?x?xf32> {
  %A = alloc (%M, %N) : memref<?x?xf32>
  %B = alloc (%M, %N) : memref<?x?xf32>
  %C = alloc (%M, %N) : memref<?x?xf32>
  %f1 = constant 1.0 : f32
  %f2 = constant 2.0 : f32
  for %i0 = 0 to %M {
    for %i1 = 0 to %N {
      // non-scoped %f1
      store %f1, %A[%i0, %i1] : memref<?x?xf32>
    }
  }
  for %i4 = 0 to %M {
    for %i5 = 0 to %N {
      %a5 = load %A[%i4, %i5] : memref<?x?xf32>
      %b5 = load %B[%i4, %i5] : memref<?x?xf32>
      %s5 = addf %a5, %b5 : f32
      // non-scoped %f1
      %s6 = addf %s5, %f1 : f32
      store %s6, %C[%i4, %i5] : memref<?x?xf32>
    }
  }
  return %C : memref<?x?xf32>
}
```

vectorized with these arguments:
```
-vectorize -virtual-vector-size 256 --test-fastest-varying=0
```

vectorization produces this standard innermost-loop vectorized code:
```
mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> memref<?x?xf32> {
  %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %cst = constant 1.000000e+00 : f32
  %cst_0 = constant 2.000000e+00 : f32
  for %i0 = 0 to %arg0 {
    for %i1 = 0 to %arg1 step 256 {
      %cst_1 = constant splat<vector<256xf32>, 1.000000e+00> : vector<256xf32>
      "vector_transfer_write"(%cst_1, %0, %i0, %i1) : (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
    }
  }
  for %i2 = 0 to %arg0 {
    for %i3 = 0 to %arg1 step 256 {
      %3 = "vector_transfer_read"(%0, %i2, %i3) : (memref<?x?xf32>, index, index) -> vector<256xf32>
      %4 = "vector_transfer_read"(%1, %i2, %i3) : (memref<?x?xf32>, index, index) -> vector<256xf32>
      %5 = addf %3, %4 : vector<256xf32>
      %cst_2 = constant splat<vector<256xf32>, 1.000000e+00> : vector<256xf32>
      %6 = addf %5, %cst_2 : vector<256xf32>
      "vector_transfer_write"(%6, %2, %i2, %i3) : (vector<256xf32>, memref<?x?xf32>, index, index) -> ()
    }
  }
  return %2 : memref<?x?xf32>
}
```
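
As noted in item 5 above, the vector_transfer_read/write operations in this
output are expected to encapsulate boundary conditions and padding when they
are eventually lowered (e.g. for the last iteration of the step-256 loop when
%N is not a multiple of 256). The following C++ sketch only illustrates that
contract under assumed semantics; vectorTransferRead1D/vectorTransferWrite1D
are hypothetical names and this is not the actual lowering.
```
#include <cstddef>
#include <vector>

// Hypothetical scalar model of a 1-D "vector_transfer_read": read up to
// `vectorSize` contiguous elements starting at `offset`, padding past the
// end of the memref with `padding` (the boundary condition).
std::vector<float> vectorTransferRead1D(const std::vector<float> &memref,
                                        std::size_t offset,
                                        std::size_t vectorSize,
                                        float padding) {
  std::vector<float> result(vectorSize, padding);
  for (std::size_t i = 0; i < vectorSize && offset + i < memref.size(); ++i)
    result[i] = memref[offset + i];
  return result;
}

// Hypothetical scalar model of the matching "vector_transfer_write": only
// the in-bounds elements are written back.
void vectorTransferWrite1D(const std::vector<float> &value,
                           std::vector<float> &memref, std::size_t offset) {
  for (std::size_t i = 0; i < value.size() && offset + i < memref.size(); ++i)
    memref[offset + i] = value[i];
}

int main() {
  std::vector<float> A(1000, 1.0f);
  // Tail iteration of `for %i1 = 0 to %N step 256` with %N = 1000: elements
  // 768..999 are read and the remaining 24 lanes are padded with 0.
  std::vector<float> v = vectorTransferRead1D(A, 768, 256, 0.0f);
  vectorTransferWrite1D(v, A, 768);
  return 0;
}
```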

Of course, much more intricate n-D imperfectly-nested patterns can be emitted too in a fully declarative fashion, but this is enough for now.

PiperOrigin-RevId: 222280209
Author: Nicolas Vasilache
Date: 2018-11-20 11:39:32 -08:00
Committed by: jpienaar
Parent: 19573e2939
Commit: 87d46aaf4b
3 changed files with 873 additions and 315 deletions


```
@@ -31,6 +31,7 @@ class AffineExpr;
 class ForStmt;
 class MemRefType;
 class MLValue;
+class OperationStmt;
 /// Returns the trip count of the loop as an affine expression if the latter is
 /// expressible as an affine expression, and nullptr otherwise. The trip count
```

File diff suppressed because it is too large.


```
@@ -1,9 +1,9 @@
 // RUN: mlir-opt %s -vectorize -virtual-vector-size 128 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC1D
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=1 | FileCheck %s -check-prefix=VEC2D_T
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D_O
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=2 | FileCheck %s -check-prefix=VEC2D_OT
-// RUN_: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 64 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC3D
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=1 | FileCheck %s -check-prefix=VEC2D_T
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC2D_O
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=2 | FileCheck %s -check-prefix=VEC2D_OT
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 64 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s -check-prefix=VEC3D
 #map0 = (d0) -> (d0)
 #map1 = (d0, d1) -> (d0, d1)
@@ -269,48 +269,70 @@ mlfunc @vec3d(%A : memref<?x?x?xf32>) {
   return
 }
-mlfunc @vector_add_2d() -> f32 {
-  %A = alloc () : memref<32x1024xf32, 0>
-  %B = alloc () : memref<32x1024xf32, 0>
-  %C = alloc () : memref<32x1024xf32, 0>
+mlfunc @vector_add_2d(%M : index, %N : index) -> f32 {
+  %A = alloc (%M, %N) : memref<?x?xf32, 0>
+  %B = alloc (%M, %N) : memref<?x?xf32, 0>
+  %C = alloc (%M, %N) : memref<?x?xf32, 0>
   %f1 = constant 1.0 : f32
   %f2 = constant 2.0 : f32
-  for %i0 = 0 to 32 {
-    for %i1 = 0 to 1024 {
+  for %i0 = 0 to %M {
+    for %i1 = 0 to %N {
       // VEC1D: [[C1:%.*]] = constant splat<vector<128xf32>, 1.000000e+00> : vector<128xf32>
-      // VEC1D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<128xf32>, memref<32x1024xf32>, index, index) -> ()
+      // VEC1D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
       // VEC2D: [[C1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
-      // VEC2D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<32x256xf32>, memref<32x1024xf32>, index, index) -> ()
-      store %f1, %A[%i0, %i1] : memref<32x1024xf32, 0>
+      // VEC2D: "vector_transfer_write"([[C1]], {{.*}}) : (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
+      // non-scoped %f1
+      store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
     }
   }
-  for %i2 = 0 to 32 {
-    for %i3 = 0 to 1024 {
+  for %i2 = 0 to %M {
+    for %i3 = 0 to %N {
      // VEC1D: [[C3:%.*]] = constant splat<vector<128xf32>, 2.000000e+00> : vector<128xf32>
-      // VEC1D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<128xf32>, memref<32x1024xf32>, index, index) -> ()
+      // VEC1D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
       // VEC2D: [[C3:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
-      // VEC2D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<32x256xf32>, memref<32x1024xf32>, index, index) -> ()
-      store %f2, %B[%i2, %i3] : memref<32x1024xf32, 0>
+      // VEC2D: "vector_transfer_write"([[C3]], {{.*}}) : (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
+      // non-scoped %f2
+      store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
     }
   }
-  for %i4 = 0 to 32 {
-    for %i5 = 0 to 1024 {
-      // VEC1D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<128xf32>
-      // VEC1D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<128xf32>
+  for %i4 = 0 to %M {
+    for %i5 = 0 to %N {
+      //
+      // VEC1D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<128xf32>
+      // VEC1D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<128xf32>
       // VEC1D: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<128xf32>
-      // VEC1D: "vector_transfer_write"([[S5]], {{.*}}) : (vector<128xf32>, memref<32x1024xf32>, index, index) -> ()
-      // VEC2D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<32x256xf32>
-      // VEC2D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<32x1024xf32>, index, index) -> vector<32x256xf32>
+      // VEC1D: [[SPLAT1:%.*]] = constant splat<vector<128xf32>, 1.000000e+00> : vector<128xf32>
+      // VEC1D: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<128xf32>
+      // VEC1D: [[SPLAT2:%.*]] = constant splat<vector<128xf32>, 2.000000e+00> : vector<128xf32>
+      // VEC1D: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<128xf32>
+      // VEC1D: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<128xf32>
+      // VEC1D: "vector_transfer_write"([[S8]], {{.*}}) : (vector<128xf32>, memref<?x?xf32>, index, index) -> ()
+      //
+      // VEC2D: [[A5:%.*]] = "vector_transfer_read"(%0, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
+      // VEC2D: [[B5:%.*]] = "vector_transfer_read"(%1, {{.*}}) : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
       // VEC2D: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32>
-      // VEC2D: "vector_transfer_write"([[S5]], {{.*}}) : (vector<32x256xf32>, memref<32x1024xf32>, index, index) -> ()
-      %a5 = load %A[%i4, %i5] : memref<32x1024xf32, 0>
-      %b5 = load %B[%i4, %i5] : memref<32x1024xf32, 0>
+      // VEC2D: [[SPLAT1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
+      // VEC2D: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32>
+      // VEC2D: [[SPLAT2:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
+      // VEC2D: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<32x256xf32>
+      // VEC2D: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<32x256xf32>
+      // VEC2D: "vector_transfer_write"([[S8]], {{.*}}) : (vector<32x256xf32>, memref<?x?xf32>, index, index) -> ()
+      //
+      %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
+      %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
       %s5 = addf %a5, %b5 : f32
-      store %s5, %C[%i4, %i5] : memref<32x1024xf32, 0>
+      // non-scoped %f1
+      %s6 = addf %s5, %f1 : f32
+      // non-scoped %f2
+      %s7 = addf %s5, %f2 : f32
+      // diamond dependency.
+      %s8 = addf %s7, %s6 : f32
+      store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
     }
   }
   %c7 = constant 7 : index
   %c42 = constant 42 : index
-  %res = load %C[%c7, %c42] : memref<32x1024xf32, 0>
+  %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
   return %res : f32
 }
 }
```