[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
//===- LowerVectorTransfers.cpp - LowerVectorTransfers Pass Impl *- C++ -*-===//
|
|
|
|
|
//
|
|
|
|
|
// Copyright 2019 The MLIR Authors.
|
|
|
|
|
//
|
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
|
//
|
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
//
|
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
|
// limitations under the License.
|
|
|
|
|
// =============================================================================
|
|
|
|
|
//
|
|
|
|
|
// This file implements target-dependent lowering of vector transfer operations.
|
|
|
|
|
//
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
|
|
#include "mlir/Analysis/AffineAnalysis.h"
|
|
|
|
|
#include "mlir/Analysis/MLFunctionMatcher.h"
|
|
|
|
|
#include "mlir/Analysis/Utils.h"
|
|
|
|
|
#include "mlir/Analysis/VectorAnalysis.h"
|
|
|
|
|
#include "mlir/IR/AffineExpr.h"
|
|
|
|
|
#include "mlir/IR/AffineMap.h"
|
|
|
|
|
#include "mlir/IR/Attributes.h"
|
|
|
|
|
#include "mlir/IR/Builders.h"
|
|
|
|
|
#include "mlir/IR/BuiltinOps.h"
|
|
|
|
|
#include "mlir/IR/Location.h"
|
2018-12-17 14:11:31 -08:00
|
|
|
#include "mlir/IR/Matchers.h"
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
#include "mlir/IR/OperationSupport.h"
|
2018-12-17 14:11:31 -08:00
|
|
|
#include "mlir/IR/PatternMatch.h"
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
#include "mlir/IR/Types.h"
|
|
|
|
|
#include "mlir/Pass.h"
|
|
|
|
|
#include "mlir/StandardOps/StandardOps.h"
|
2018-12-14 09:31:17 -08:00
|
|
|
#include "mlir/SuperVectorOps/SuperVectorOps.h"
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
#include "mlir/Support/Functional.h"
|
2018-12-17 14:11:31 -08:00
|
|
|
#include "mlir/Transforms/MLPatternLoweringPass.h"
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
#include "mlir/Transforms/Passes.h"
|
|
|
|
|
|
|
|
|
|
#include "llvm/ADT/SetVector.h"
|
|
|
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
|
#include <type_traits>
|
|
|
|
|
|
|
|
|
|
///
|
|
|
|
|
/// Implements lowering of VectorTransferReadOp and VectorTransferWriteOp to a
|
|
|
|
|
/// proper abstraction for the hardware.
|
|
|
|
|
///
|
|
|
|
|
/// For now only a simple loop nest is emitted.
|
|
|
|
|
///
|
|
|
|
|
|
|
|
|
|
using llvm::dbgs;
|
|
|
|
|
using llvm::SetVector;
|
|
|
|
|
|
|
|
|
|
using namespace mlir;
|
|
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "lower-vector-transfers"
|
|
|
|
|
|
2018-12-27 14:35:10 -08:00
|
|
|
/// Creates the Value for the sum of `a` and `b` without building a
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
/// full-fledged AffineMap for all indices.
|
|
|
|
|
///
|
|
|
|
|
/// Prerequisites:
|
|
|
|
|
/// `a` and `b` must be of IndexType.
|
2018-12-27 15:06:22 -08:00
|
|
|
static Value *add(FuncBuilder *b, Location loc, Value *v, Value *w) {
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
assert(v->getType().isa<IndexType>() && "v must be of IndexType");
|
|
|
|
|
assert(w->getType().isa<IndexType>() && "w must be of IndexType");
|
|
|
|
|
auto *context = b->getContext();
|
|
|
|
|
auto d0 = getAffineDimExpr(0, context);
|
|
|
|
|
auto d1 = getAffineDimExpr(1, context);
|
|
|
|
|
auto map = AffineMap::get(2, 0, {d0 + d1}, {});
|
2018-12-27 14:35:10 -08:00
|
|
|
return b->create<AffineApplyOp>(loc, map, ArrayRef<mlir::Value *>{v, w})
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
->getResult(0);
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-17 14:11:31 -08:00
|
|
|
namespace {
|
|
|
|
|
struct LowerVectorTransfersState : public MLFuncGlobalLoweringState {
|
|
|
|
|
// Top of the function constant zero index.
|
2018-12-27 14:35:10 -08:00
|
|
|
Value *zero;
|
2018-12-17 14:11:31 -08:00
|
|
|
};
|
|
|
|
|
} // namespace
|
|
|
|
|
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
/// Performs simple lowering into a combination of:
|
|
|
|
|
/// 1. local memory allocation,
|
|
|
|
|
/// 2. vector_load/vector_store from/to local buffer
|
|
|
|
|
/// 3. perfect loop nest over scalar loads/stores from/to remote memory.
|
|
|
|
|
///
|
|
|
|
|
/// This is a simple sketch for now but does the job.
|
|
|
|
|
// TODO(ntv): This function has a lot of code conditioned on the template
|
|
|
|
|
// argument being one of the two types. Extract the common behavior into helper
|
|
|
|
|
// functions and detemplatizing it.
|
|
|
|
|
template <typename VectorTransferOpTy>
|
2018-12-17 14:11:31 -08:00
|
|
|
static void rewriteAsLoops(VectorTransferOpTy *transfer,
|
|
|
|
|
MLFuncLoweringRewriter *rewriter,
|
|
|
|
|
LowerVectorTransfersState *state) {
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
static_assert(
|
|
|
|
|
std::is_same<VectorTransferOpTy, VectorTransferReadOp>::value ||
|
|
|
|
|
std::is_same<VectorTransferOpTy, VectorTransferWriteOp>::value,
|
|
|
|
|
"Must be called on either VectorTransferReadOp or VectorTransferWriteOp");
|
|
|
|
|
auto vectorType = transfer->getVectorType();
|
|
|
|
|
auto vectorShape = vectorType.getShape();
|
|
|
|
|
// tmpMemRefType is used for staging the transfer in a local scalar buffer.
|
|
|
|
|
auto tmpMemRefType =
|
|
|
|
|
MemRefType::get(vectorShape, vectorType.getElementType(), {}, 0);
|
|
|
|
|
// vectorMemRefType is a view of tmpMemRefType as one vector.
|
|
|
|
|
auto vectorMemRefType = MemRefType::get({1}, vectorType, {}, 0);
|
|
|
|
|
|
2018-12-17 14:11:31 -08:00
|
|
|
// Get the ML function builder.
|
2018-12-28 08:48:09 -08:00
|
|
|
// We need access to the Function builder stored internally in the
|
2018-12-17 14:11:31 -08:00
|
|
|
// MLFunctionLoweringRewriter general rewriting API does not provide
|
2018-12-28 16:05:35 -08:00
|
|
|
// ML-specific functions (ForInst and Block manipulation). While we could
|
2018-12-17 14:11:31 -08:00
|
|
|
// forward them or define a whole rewriting chain based on MLFunctionBuilder
|
|
|
|
|
// instead of Builer, the code for it would be duplicate boilerplate. As we
|
|
|
|
|
// go towards unifying ML and CFG functions, this separation will disappear.
|
2018-12-27 15:06:22 -08:00
|
|
|
FuncBuilder &b = *rewriter->getBuilder();
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
|
|
|
|
|
// 1. First allocate the local buffer in fast memory.
|
|
|
|
|
// TODO(ntv): CL memory space.
|
|
|
|
|
// TODO(ntv): Allocation padding for potential bank conflicts (e.g. GPUs).
|
|
|
|
|
auto tmpScalarAlloc = b.create<AllocOp>(transfer->getLoc(), tmpMemRefType);
|
2018-12-17 14:10:52 -08:00
|
|
|
auto vecView = b.create<VectorTypeCastOp>(
|
|
|
|
|
transfer->getLoc(), tmpScalarAlloc->getResult(), vectorMemRefType);
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
|
|
|
|
|
// 2. Store the vector to local storage in case of a vector_transfer_write.
|
|
|
|
|
// TODO(ntv): This vector_store operation should be further lowered in the
|
|
|
|
|
// case of GPUs.
|
|
|
|
|
if (std::is_same<VectorTransferOpTy, VectorTransferWriteOp>::value) {
|
|
|
|
|
b.create<StoreOp>(vecView->getLoc(), transfer->getVector(),
|
2018-12-27 14:35:10 -08:00
|
|
|
vecView->getResult(),
|
|
|
|
|
ArrayRef<mlir::Value *>{state->zero});
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 3. Emit the loop-nest.
|
|
|
|
|
// TODO(ntv): Invert the mapping and indexing contiguously in the remote
|
|
|
|
|
// memory.
|
|
|
|
|
// TODO(ntv): Handle broadcast / slice properly.
|
|
|
|
|
auto permutationMap = transfer->getPermutationMap();
|
2018-12-28 16:05:35 -08:00
|
|
|
SetVector<ForInst *> loops;
|
2018-12-27 14:35:10 -08:00
|
|
|
SmallVector<Value *, 8> accessIndices(transfer->getIndices());
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
for (auto it : llvm::enumerate(transfer->getVectorType().getShape())) {
|
|
|
|
|
auto composed = composeWithUnboundedMap(
|
|
|
|
|
getAffineDimExpr(it.index(), b.getContext()), permutationMap);
|
2018-12-28 16:05:35 -08:00
|
|
|
auto *forInst = b.createFor(transfer->getLoc(), 0, it.value());
|
|
|
|
|
loops.insert(forInst);
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
// Setting the insertion point to the innermost loop achieves nesting.
|
2018-12-23 08:17:48 -08:00
|
|
|
b.setInsertionPointToStart(loops.back()->getBody());
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
if (composed == getAffineConstantExpr(0, b.getContext())) {
|
|
|
|
|
transfer->emitWarning(
|
|
|
|
|
"Redundant copy can be implemented as a vector broadcast");
|
|
|
|
|
} else {
|
|
|
|
|
auto dim = composed.template cast<AffineDimExpr>();
|
|
|
|
|
assert(accessIndices.size() > dim.getPosition());
|
|
|
|
|
accessIndices[dim.getPosition()] =
|
|
|
|
|
::add(&b, transfer->getLoc(), accessIndices[dim.getPosition()],
|
|
|
|
|
loops.back());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 4. Emit memory operations within the loops.
|
|
|
|
|
// TODO(ntv): SelectOp + padding value for load out-of-bounds.
|
|
|
|
|
if (std::is_same<VectorTransferOpTy, VectorTransferReadOp>::value) {
|
|
|
|
|
// VectorTransferReadOp.
|
|
|
|
|
// a. read scalar from remote;
|
|
|
|
|
// b. write scalar to local.
|
|
|
|
|
auto scalarLoad = b.create<LoadOp>(transfer->getLoc(),
|
|
|
|
|
transfer->getMemRef(), accessIndices);
|
2018-12-27 14:35:10 -08:00
|
|
|
b.create<StoreOp>(transfer->getLoc(), scalarLoad->getResult(),
|
|
|
|
|
tmpScalarAlloc->getResult(),
|
|
|
|
|
functional::map([](Value *val) { return val; }, loops));
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
} else {
|
|
|
|
|
// VectorTransferWriteOp.
|
|
|
|
|
// a. read scalar from local;
|
|
|
|
|
// b. write scalar to remote.
|
|
|
|
|
auto scalarLoad = b.create<LoadOp>(
|
|
|
|
|
transfer->getLoc(), tmpScalarAlloc->getResult(),
|
2018-12-27 14:35:10 -08:00
|
|
|
functional::map([](Value *val) { return val; }, loops));
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
b.create<StoreOp>(transfer->getLoc(), scalarLoad->getResult(),
|
|
|
|
|
transfer->getMemRef(), accessIndices);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 5. Read the vector from local storage in case of a vector_transfer_read.
|
|
|
|
|
// TODO(ntv): This vector_load operation should be further lowered in the
|
|
|
|
|
// case of GPUs.
|
2018-12-27 14:35:10 -08:00
|
|
|
llvm::SmallVector<Value *, 1> newResults = {};
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
if (std::is_same<VectorTransferOpTy, VectorTransferReadOp>::value) {
|
2018-12-28 04:14:52 -08:00
|
|
|
b.setInsertionPoint(cast<OperationInst>(transfer->getInstruction()));
|
2018-12-17 14:10:52 -08:00
|
|
|
auto *vector = b.create<LoadOp>(transfer->getLoc(), vecView->getResult(),
|
2018-12-27 14:35:10 -08:00
|
|
|
ArrayRef<Value *>{state->zero})
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
->getResult();
|
2018-12-17 14:11:31 -08:00
|
|
|
newResults.push_back(vector);
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 6. Free the local buffer.
|
2018-12-28 04:14:52 -08:00
|
|
|
b.setInsertionPoint(transfer->getInstruction());
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
b.create<DeallocOp>(transfer->getLoc(), tmpScalarAlloc);
|
|
|
|
|
|
2018-12-28 16:05:35 -08:00
|
|
|
// 7. It is now safe to erase the instruction.
|
2018-12-28 04:14:52 -08:00
|
|
|
rewriter->replaceOp(transfer->getInstruction(), newResults);
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
}
|
|
|
|
|
|
2018-12-17 14:11:31 -08:00
|
|
|
namespace {
|
|
|
|
|
template <typename VectorTransferOpTy>
|
|
|
|
|
class VectorTransferExpander : public MLLoweringPattern {
|
|
|
|
|
public:
|
|
|
|
|
explicit VectorTransferExpander(MLIRContext *context)
|
|
|
|
|
: MLLoweringPattern(VectorTransferOpTy::getOperationName(), 1, context) {}
|
|
|
|
|
|
2018-12-27 21:21:41 -08:00
|
|
|
PatternMatchResult match(OperationInst *op) const override {
|
2018-12-17 14:11:31 -08:00
|
|
|
if (m_Op<VectorTransferOpTy>().match(op))
|
|
|
|
|
return matchSuccess();
|
|
|
|
|
return matchFailure();
|
2018-12-07 19:00:25 -08:00
|
|
|
}
|
|
|
|
|
|
2018-12-28 16:05:35 -08:00
|
|
|
void rewriteOpInst(OperationInst *op,
|
2018-12-27 21:21:41 -08:00
|
|
|
MLFuncGlobalLoweringState *funcWiseState,
|
2018-12-17 14:11:31 -08:00
|
|
|
std::unique_ptr<PatternState> opState,
|
|
|
|
|
MLFuncLoweringRewriter *rewriter) const override {
|
|
|
|
|
rewriteAsLoops(&*op->dyn_cast<VectorTransferOpTy>(), rewriter,
|
|
|
|
|
static_cast<LowerVectorTransfersState *>(funcWiseState));
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
}
|
2018-12-17 14:11:31 -08:00
|
|
|
};
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
|
namespace {
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
|
2018-12-17 14:11:31 -08:00
|
|
|
struct LowerVectorTransfersPass
|
|
|
|
|
: public MLPatternLoweringPass<
|
|
|
|
|
VectorTransferExpander<VectorTransferReadOp>,
|
|
|
|
|
VectorTransferExpander<VectorTransferWriteOp>> {
|
|
|
|
|
LowerVectorTransfersPass()
|
|
|
|
|
: MLPatternLoweringPass(&LowerVectorTransfersPass::passID) {}
|
|
|
|
|
|
|
|
|
|
std::unique_ptr<MLFuncGlobalLoweringState>
|
2018-12-28 08:48:09 -08:00
|
|
|
makeFuncWiseState(Function *f) const override {
|
2018-12-17 14:11:31 -08:00
|
|
|
auto state = llvm::make_unique<LowerVectorTransfersState>();
|
2018-12-27 15:06:22 -08:00
|
|
|
auto builder = FuncBuilder(f);
|
2018-12-17 14:11:31 -08:00
|
|
|
state->zero = builder.create<ConstantIndexOp>(builder.getUnknownLoc(), 0);
|
|
|
|
|
return state;
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
}
|
|
|
|
|
|
2018-12-17 14:11:31 -08:00
|
|
|
static char passID;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
} // end anonymous namespace
|
|
|
|
|
|
|
|
|
|
char LowerVectorTransfersPass::passID = 0;
|
[MLIR] Add LowerVectorTransfersPass
This CL adds a pass that lowers VectorTransferReadOp and VectorTransferWriteOp
to a simple loop nest via local buffer allocations.
This is an MLIR->MLIR lowering based on builders.
A few TODOs are left to address in particular:
1. invert the permutation map so the accesses to the remote memref are coalesced;
2. pad the alloc for bank conflicts in local memory (e.g. GPUs shared_memory);
3. support broadcast / avoid copies when permutation_map is not of full column rank
4. add a proper "element_cast" op
One notable limitation is this does not plan on supporting boundary conditions.
It should be significantly easier to use pre-baked MLIR functions to handle such paddings.
This is left for future consideration.
Therefore the current CL only works properly for full-tile cases atm.
This CL also adds 2 simple tests:
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N step 4 {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
vector_transfer_write %f1, %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : vector<5x4x3xf32>, memref<?x?x?x?xf32, 0>, index, index, index, index
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 step 4 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
store %cst, %2[%c0] : memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i1, %i5)
for %i6 = 0 to 3 {
%5 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%6 = load %1[%i4, %i5, %i6] : memref<5x4x3xf32>
store %6, %0[%5, %4, %i2, %3] : memref<?x?x?x?xf32>
dealloc %1 : memref<5x4x3xf32>
```
and
```mlir
for %i0 = 0 to %M step 3 {
for %i1 = 0 to %N {
for %i2 = 0 to %O {
for %i3 = 0 to %P step 5 {
%f = vector_transfer_read %A, %i0, %i1, %i2, %i3 {permutation_map: (d0, d1, d2, d3) -> (d3, 0, d0)} : (memref<?x?x?x?xf32, 0>, index, index, index, index) -> vector<5x4x3xf32>
```
lowers into:
```mlir
for %i0 = 0 to %arg0 step 3 {
for %i1 = 0 to %arg1 {
for %i2 = 0 to %arg2 {
for %i3 = 0 to %arg3 step 5 {
%1 = alloc() : memref<5x4x3xf32>
%2 = "element_type_cast"(%1) : (memref<5x4x3xf32>) -> memref<1xvector<5x4x3xf32>>
for %i4 = 0 to 5 {
%3 = affine_apply (d0, d1) -> (d0 + d1) (%i3, %i4)
for %i5 = 0 to 4 {
for %i6 = 0 to 3 {
%4 = affine_apply (d0, d1) -> (d0 + d1) (%i0, %i6)
%5 = load %0[%4, %i1, %i2, %3] : memref<?x?x?x?xf32>
store %5, %1[%i4, %i5, %i6] : memref<5x4x3xf32>
%6 = load %2[%c0] : memref<1xvector<5x4x3xf32>>
dealloc %1 : memref<5x4x3xf32>
```
PiperOrigin-RevId: 224552717
2018-12-07 11:48:54 -08:00
|
|
|
|
|
|
|
|
FunctionPass *mlir::createLowerVectorTransfersPass() {
|
|
|
|
|
return new LowerVectorTransfersPass();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static PassRegistration<LowerVectorTransfersPass>
|
|
|
|
|
pass("lower-vector-transfers", "Materializes vector transfer ops to a "
|
|
|
|
|
"proper abstraction for the hardware");
|
|
|
|
|
|
|
|
|
|
#undef DEBUG_TYPE
|