mirror of
https://github.com/intel/llvm.git
synced 2026-01-14 03:50:17 +08:00
[mlir][x86vector] Sink Vector.transfer_reads and vector.load before the consumer (#169333)
Adds a pattern that sinks vector producer ops (`vector.load` and `vector.transfer_read`) forward in a block to their earliest legal use, reducing live ranges and improving scheduling opportunities. **The lowering pattern**: `batch_reduce.matmul` (input) -> register-tiling(M, N) -> Vectorization (to `vector.contract`) -> `unroll` vector.contract (`unit` dims) -> `hoisting` transformation (move `C` loads/store outside batch/k loop) -> **sink vector producers** -> apply `licm`, `canonicalization`, and `bufferize` -> `vector.contract` to `fma` -> **sink vector producers**.
This commit is contained in:
@@ -38,6 +38,17 @@ def ApplyVectorContractToPackedTypeDotProductPatternsOp : Op<Transform_Dialect,
|
||||
let assemblyFormat = "attr-dict";
|
||||
}
|
||||
|
||||
// Transform dialect op `apply_patterns.x86vector.sink_vector_producer_ops`.
// It declares the PatternDescriptorOpInterface methods, takes no operands and
// is printed/parsed as just its attribute dictionary.
def ApplySinkVectorProducerOpsPatternsOp : Op<Transform_Dialect,
|
||||
"apply_patterns.x86vector.sink_vector_producer_ops",
|
||||
[DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
|
||||
let description = [{
|
||||
Collect patterns to sink vector producer operations forward in a block to
|
||||
place them immediately before their first use.
|
||||
}];
|
||||
|
||||
let assemblyFormat = "attr-dict";
|
||||
}
|
||||
|
||||
|
||||
#endif // X86VECTOR_TRANSFORM_OPS
|
||||
|
||||
|
||||
@@ -91,6 +91,10 @@ void populateVectorContractToFMAPatterns(RewritePatternSet &patterns);
|
||||
void populateVectorContractToPackedTypeDotProductPatterns(
|
||||
RewritePatternSet &patterns);
|
||||
|
||||
// Populates `patterns` with rewrites that perform forward scheduling of vector
// producer ops (vector.load and vector.transfer_read) to minimize their live
|
||||
// range by placing each of them right before its first use within the block.
|
||||
void populateSinkVectorProducerOpsPatterns(RewritePatternSet &patterns);
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// Helpers extracted from:
|
||||
/// - clang/lib/Headers/avxintrin.h
|
||||
|
||||
@@ -32,6 +32,11 @@ void mlir::transform::ApplyVectorContractToPackedTypeDotProductPatternsOp::
|
||||
x86vector::populateVectorContractToPackedTypeDotProductPatterns(patterns);
|
||||
}
|
||||
|
||||
// PatternDescriptorOpInterface hook: forwards to
// x86vector::populateSinkVectorProducerOpsPatterns so that the sink patterns
// are added to `patterns`.
void mlir::transform::ApplySinkVectorProducerOpsPatternsOp::populatePatterns(
|
||||
RewritePatternSet &patterns) {
|
||||
x86vector::populateSinkVectorProducerOpsPatterns(patterns);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Transform op registration
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
@@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRX86VectorTransforms
|
||||
LegalizeForLLVMExport.cpp
|
||||
VectorContractToFMA.cpp
|
||||
VectorContractToPackedTypeDotProduct.cpp
|
||||
SinkVectorProducerOps.cpp
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRArithDialect
|
||||
|
||||
148
mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
Normal file
148
mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
Normal file
@@ -0,0 +1,148 @@
|
||||
//===- SinkVectorProducerOps.cpp ------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/Vector/IR/VectorOps.h"
|
||||
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
|
||||
#include "mlir/Dialect/X86Vector/Transforms.h"
|
||||
#include "mlir/Dialect/X86Vector/X86VectorDialect.h"
|
||||
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/Dominance.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::vector;
|
||||
using namespace mlir::x86vector;
|
||||
|
||||
/// Collects the users of every result of `op`.
///
/// Returns failure as soon as a user is found that does not live in the same
/// block as `op`. Otherwise returns the users in use-list order; an operation
/// appears once per use, so duplicates are possible.
static FailureOr<llvm::SmallVector<Operation *>>
getSameBlockUsers(Operation *op) {
  Block *producerBlock = op->getBlock();
  llvm::SmallVector<Operation *> sameBlockUsers;

  for (Value produced : op->getResults()) {
    for (OpOperand &use : produced.getUses()) {
      Operation *consumer = use.getOwner();
      // The producer and each of its users must share a block.
      if (consumer->getBlock() != producerBlock)
        return failure();
      sameBlockUsers.push_back(consumer);
    }
  }

  return sameBlockUsers;
}
|
||||
|
||||
// Prevent pathological looping:
|
||||
// If two/three producers are used by same consumer, will end in looping of
|
||||
// moving the producers.
|
||||
// For example:
|
||||
// %1 = prod1
|
||||
// %2 = prod2
|
||||
// %3 = prod3
|
||||
// %4 = op %1, %2, %3
|
||||
static bool checkLooping(Operation *op) {
|
||||
llvm::SmallVector<Operation *> operations;
|
||||
operations.push_back(op);
|
||||
|
||||
// Retrive the next immediate operation until it is a vector.load or
|
||||
// a vector.transfer_read
|
||||
Operation *nextOp = op->getNextNode();
|
||||
while (nextOp) {
|
||||
if (isa<vector::LoadOp>(nextOp) || isa<vector::TransferReadOp>(nextOp)) {
|
||||
operations.push_back(op);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
nextOp = nextOp->getNextNode();
|
||||
}
|
||||
|
||||
// If all the loads or transfer_reads have same immediate nextOp as its
|
||||
// user, then it loops.
|
||||
for (Operation *op : operations) {
|
||||
FailureOr<llvm::SmallVector<Operation *>> users = getSameBlockUsers(op);
|
||||
if (failed(users))
|
||||
return false;
|
||||
|
||||
if (!llvm::is_contained(*users, nextOp))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Sink vector producers forward to reduce live ranges.
|
||||
/// This pattern applies to ops such as vector.load and vector.transfer_read.
|
||||
template <typename producerOp>
|
||||
struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
|
||||
using OpRewritePattern<producerOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(producerOp op,
|
||||
PatternRewriter &rewriter) const override {
|
||||
|
||||
auto users = getSameBlockUsers(op);
|
||||
if (failed(users))
|
||||
return failure();
|
||||
|
||||
if (checkLooping(op))
|
||||
return failure();
|
||||
|
||||
llvm::DenseMap<Operation *, llvm::SmallVector<Operation *>> prodsAllUsers;
|
||||
llvm::DenseMap<Operation *, Operation *> prodsFirstUser;
|
||||
|
||||
llvm::SmallVector<Operation *> opUsers = *users;
|
||||
prodsAllUsers.try_emplace(op, opUsers);
|
||||
|
||||
// Iterate until the last instruction to find the first users of all
|
||||
// producers within the block.
|
||||
Operation *nextOp = op;
|
||||
|
||||
while ((nextOp = nextOp->getNextNode())) {
|
||||
|
||||
if (isa<vector::LoadOp>(nextOp) || isa<vector::TransferReadOp>(nextOp)) {
|
||||
auto nextUsers = getSameBlockUsers(nextOp);
|
||||
|
||||
if (failed(nextUsers))
|
||||
continue;
|
||||
llvm::SmallVector<Operation *> nextOpUsers = *nextUsers;
|
||||
prodsAllUsers.try_emplace(nextOp, nextOpUsers);
|
||||
} else {
|
||||
llvm::SmallVector<Operation *> operations;
|
||||
|
||||
for (auto &entry : prodsAllUsers) {
|
||||
llvm::SmallVector<Operation *> &users = entry.second;
|
||||
|
||||
if (llvm::is_contained(users, nextOp)) {
|
||||
Operation *operation = entry.first;
|
||||
operations.push_back(operation);
|
||||
prodsFirstUser.try_emplace(operation, nextOp);
|
||||
}
|
||||
}
|
||||
|
||||
for (Operation *op : operations) {
|
||||
prodsAllUsers.erase(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move all the loads or transfer_reads before its first use.
|
||||
for (auto &entry : prodsFirstUser) {
|
||||
Operation *prod = entry.first;
|
||||
Operation *consumer = entry.second;
|
||||
|
||||
prod->moveBefore(consumer);
|
||||
}
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Adds the producer-sinking pattern to `patterns`, instantiated once for
/// vector.transfer_read and once for vector.load.
void x86vector::populateSinkVectorProducerOpsPatterns(
    RewritePatternSet &patterns) {
  MLIRContext *ctx = patterns.getContext();
  patterns.add<SinkVectorProducerOps<vector::TransferReadOp>>(ctx);
  patterns.add<SinkVectorProducerOps<vector::LoadOp>>(ctx);
}
|
||||
199
mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
Normal file
199
mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
Normal file
@@ -0,0 +1,199 @@
|
||||
// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s
|
||||
|
||||
// Four vector.load ops precede two chained vector.fma ops. The expected output
// has each pair of loads directly ahead of the fma that consumes it.
func.func @sink_vector_loads(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c8 = arith.constant 8 : index
|
||||
%0 = vector.load %arg0[%c0, %c0] : memref<16x16xf32>, vector<8xf32>
|
||||
%1 = vector.load %arg0[%c0, %c8] : memref<16x16xf32>, vector<8xf32>
|
||||
%2 = vector.load %arg0[%c8, %c0] : memref<16x16xf32>, vector<8xf32>
|
||||
%3 = vector.load %arg0[%c8, %c8] : memref<16x16xf32>, vector<8xf32>
|
||||
%4 = vector.fma %0, %1, %arg1 : vector<8xf32>
|
||||
%5 = vector.fma %2, %3, %4 : vector<8xf32>
|
||||
return %5 : vector<8xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @sink_vector_loads
|
||||
// CHECK: vector.load
|
||||
// CHECK-NEXT: vector.load
|
||||
// CHECK-NEXT: vector.fma
|
||||
// CHECK-NEXT: vector.load
|
||||
// CHECK-NEXT: vector.load
|
||||
// CHECK-NEXT: vector.fma
|
||||
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
|
||||
transform.apply_patterns to %0 {
|
||||
transform.apply_patterns.x86vector.sink_vector_producer_ops
|
||||
} : !transform.any_op
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// Same scenario as the vector.load case, but the producers are
// vector.transfer_read ops on a memref: each pair of reads is expected
// directly ahead of the fma that consumes it.
func.func @sink_vector_transfer_reads(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c8 = arith.constant 8 : index
|
||||
%0 = ub.poison : f32
|
||||
%1 = vector.transfer_read %arg0[%c0, %c0], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
|
||||
%2 = vector.transfer_read %arg0[%c0, %c8], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
|
||||
%3 = vector.transfer_read %arg0[%c8, %c0], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
|
||||
%4 = vector.transfer_read %arg0[%c8, %c8], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
|
||||
%5 = vector.fma %1, %2, %arg1 : vector<8xf32>
|
||||
%6 = vector.fma %3, %4, %5 : vector<8xf32>
|
||||
return %6 : vector<8xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @sink_vector_transfer_reads
|
||||
// CHECK: vector.transfer_read
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.fma
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.fma
|
||||
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
|
||||
transform.apply_patterns to %0 {
|
||||
transform.apply_patterns.x86vector.sink_vector_producer_ops
|
||||
} : !transform.any_op
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// vector.transfer_read producers reading from a tensor instead of a memref:
// each pair of reads is expected directly ahead of the fma that consumes it.
func.func @sink_vector_transfer_reads_tensor(%arg0: tensor<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c8 = arith.constant 8 : index
|
||||
%0 = ub.poison : f32
|
||||
%1 = vector.transfer_read %arg0[%c0, %c0], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
|
||||
%2 = vector.transfer_read %arg0[%c0, %c8], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
|
||||
%3 = vector.transfer_read %arg0[%c8, %c0], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
|
||||
%4 = vector.transfer_read %arg0[%c8, %c8], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
|
||||
%5 = vector.fma %1, %2, %arg1 : vector<8xf32>
|
||||
%6 = vector.fma %3, %4, %5 : vector<8xf32>
|
||||
return %6 : vector<8xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @sink_vector_transfer_reads_tensor
|
||||
// CHECK: vector.transfer_read
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.fma
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.fma
|
||||
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
|
||||
transform.apply_patterns to %0 {
|
||||
transform.apply_patterns.x86vector.sink_vector_producer_ops
|
||||
} : !transform.any_op
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)>
|
||||
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)>
|
||||
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>
|
||||
|
||||
// Four bf16 vector.transfer_read ops are shared among four chained
// vector.contract ops. Each read is expected right before the first contract
// that uses it; the last contract only reuses values that were read earlier.
func.func @sink_vector_transfer_reads_bf16(%arg0: tensor<4x64x32x2xbf16>, %arg1: tensor<4x32x64x2xbf16>, %arg2: vector<1x16xf32>) -> vector<1x16xf32> {
|
||||
%0 = ub.poison : bf16
|
||||
%c0 = arith.constant 0 : index
|
||||
%c1 = arith.constant 1 : index
|
||||
%c16 = arith.constant 16 : index
|
||||
%extracted_slice = tensor.extract_slice %arg0[%c0, %c0, %c0, 0] [1, 4, 1, 2] [1, 1, 1, 1] : tensor<4x64x32x2xbf16> to tensor<1x4x1x2xbf16>
|
||||
%extracted_slice_0 = tensor.extract_slice %arg1[%c0, %c0, %c0, 0] [1, 1, 32, 2] [1, 1, 1, 1] : tensor<4x32x64x2xbf16> to tensor<1x1x32x2xbf16>
|
||||
%1 = vector.transfer_read %extracted_slice[%c0, %c0, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x2xbf16>, vector<1x1x1x2xbf16>
|
||||
%2 = vector.transfer_read %extracted_slice[%c0, %c1, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x2xbf16>, vector<1x1x1x2xbf16>
|
||||
%3 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x1x32x2xbf16>, vector<1x1x16x2xbf16>
|
||||
%4 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c16, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x1x32x2xbf16>, vector<1x1x16x2xbf16>
|
||||
%5 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %3, %arg2 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
|
||||
%6 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %4, %5 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
|
||||
%7 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %2, %3, %6 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
|
||||
%8 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %2, %4, %7 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
|
||||
return %8 : vector<1x16xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @sink_vector_transfer_reads_bf16
|
||||
// CHECK: vector.transfer_read
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.contract
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.contract
|
||||
// CHECK-NEXT: vector.transfer_read
|
||||
// CHECK-NEXT: vector.contract
|
||||
// CHECK-NEXT: vector.contract
|
||||
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
|
||||
transform.apply_patterns to %0 {
|
||||
transform.apply_patterns.x86vector.sink_vector_producer_ops
|
||||
} : !transform.any_op
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// Negative case: both loads are consumed by the fma that directly follows
// them, so there is nothing left to sink. The expected output keeps the
// original order, and the pattern application has to terminate.
func.func @negative_no_infinite_looping(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c8 = arith.constant 8 : index
|
||||
%0 = vector.load %arg0[%c0, %c0] : memref<16x16xf32>, vector<8xf32>
|
||||
%1 = vector.load %arg0[%c0, %c8] : memref<16x16xf32>, vector<8xf32>
|
||||
%2 = vector.fma %0, %1, %arg1 : vector<8xf32>
|
||||
return %2: vector<8xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @negative_no_infinite_looping
|
||||
// CHECK: vector.load
|
||||
// CHECK-NEXT: vector.load
|
||||
// CHECK-NEXT: vector.fma
|
||||
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
|
||||
transform.apply_patterns to %0 {
|
||||
transform.apply_patterns.x86vector.sink_vector_producer_ops
|
||||
} : !transform.any_op
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// Negative case: the loads are only used inside the scf.if regions, i.e. in
// blocks other than the one defining them. They are expected to stay in front
// of the scf.if.
func.func @negative_no_sink_outside_block(%arg0: memref<8x16xf32>, %arg1: i1) -> vector<8xf32> {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c8 = arith.constant 8 : index
|
||||
%0 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<8xf32>
|
||||
%1 = vector.load %arg0[%c0, %c8] : memref<8x16xf32>, vector<8xf32>
|
||||
%2 = scf.if %arg1 -> (vector<8xf32>) {
|
||||
scf.yield %0 : vector<8xf32>
|
||||
} else {
|
||||
scf.yield %1 : vector<8xf32>
|
||||
}
|
||||
return %2 : vector<8xf32>
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @negative_no_sink_outside_block
|
||||
// CHECK: vector.load
|
||||
// CHECK-NEXT: vector.load
|
||||
// CHECK-NEXT: scf.if
|
||||
// CHECK-NEXT: scf.yield
|
||||
|
||||
module attributes {transform.with_named_sequence} {
|
||||
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
|
||||
%0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
|
||||
transform.apply_patterns to %0 {
|
||||
transform.apply_patterns.x86vector.sink_vector_producer_ops
|
||||
} : !transform.any_op
|
||||
transform.yield
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user