Loop invariant code motion.

--

PiperOrigin-RevId: 244043679
This commit is contained in:
Amit Sabne
2019-04-17 12:18:37 -07:00
committed by Mehdi Amini
parent c9f21cf355
commit 7905da656e
4 changed files with 324 additions and 0 deletions

View File

@@ -83,6 +83,10 @@ FunctionPassBase *createLoopFusionPass(unsigned fastMemorySpace = 0,
uint64_t localBufSizeThreshold = 0,
bool maximalFusion = false);
/// Creates a loop invariant code motion pass that hoists loop invariant
/// instructions out of the loop.
FunctionPassBase *createLoopInvariantCodeMotionPass();
/// Creates a pass to pipeline explicit movement of data across levels of the
/// memory hierarchy.
FunctionPassBase *createPipelineDataTransferPass();

View File

@@ -5,6 +5,7 @@ add_llvm_library(MLIRTransforms
DialectConversion.cpp
DmaGeneration.cpp
LoopFusion.cpp
LoopInvariantCodeMotion.cpp
LoopTiling.cpp
LoopUnrollAndJam.cpp
LoopUnroll.cpp

View File

@@ -0,0 +1,130 @@
//===- LoopInvariantCodeMotion.cpp - Loop invariant code motion -----------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements loop invariant code motion.
//
//===----------------------------------------------------------------------===//
#include <iomanip>
#include <sstream>
#include "mlir/AffineOps/AffineOps.h"
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/StandardOps/Ops.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "licm"
using llvm::SetVector;
using namespace mlir;
namespace {
/// Loop invariant code motion (LICM) pass.
///
/// Hoists operations that do not depend on an affine.for loop's induction
/// variable out of that loop. Loops are processed innermost-first (see
/// runOnFunction), so an op hoisted into an enclosing loop can be hoisted
/// again when that loop is processed.
/// TODO(asabne) : The pass is missing zero-trip tests.
/// TODO(asabne) : Check for the presence of side effects before hoisting.
struct LoopInvariantCodeMotion : public FunctionPass<LoopInvariantCodeMotion> {
  void runOnFunction() override;
  void runOnAffineForOp(AffineForOp forOp);
  // Worklist of all affine.for ops in the function, gathered by runOnFunction
  // in innermost-loop-first order before any hoisting is done.
  std::vector<AffineForOp> forOps;
};
} // end anonymous namespace
/// Factory for the loop invariant code motion pass. Ownership of the returned
/// pass transfers to the caller.
FunctionPassBase *mlir::createLoopInvariantCodeMotionPass() {
  auto *licmPass = new LoopInvariantCodeMotion();
  return licmPass;
}
/// Hoists loop-invariant operations out of `forOp`.
///
/// The forward slice of `forOp` contains every op that (transitively) depends
/// on the loop induction variable; those ops must stay in the loop. Any other
/// op directly in the loop body — excluding nested affine.for ops (handled by
/// their own invocation of this function) and the terminator — is moved to
/// just before `forOp`, preserving relative order. If the body is then left
/// with only its terminator, the empty loop is erased.
void LoopInvariantCodeMotion::runOnAffineForOp(AffineForOp forOp) {
  auto *loopBody = forOp.getBody();

  // Ops defined in terms of the loop induction variable; these are NOT loop
  // invariant.
  SetVector<Operation *> loopDefinedOps;
  getForwardSlice(forOp, &loopDefinedOps);
  for (auto i : loopDefinedOps) {
    LLVM_DEBUG(i->print(llvm::dbgs() << "\nLoop-dependent op\n"));
  }

  // Collect the invariant ops first: moving them while iterating over the
  // body would invalidate the iteration.
  SmallVector<Operation *, 8> opsToMove;
  for (auto &op : *loopBody) {
    // If the operation is loop invariant, insert it into opsToMove.
    if (!op.isa<AffineForOp>() && !op.isa<AffineTerminatorOp>() &&
        loopDefinedOps.count(&op) == 0) {
      LLVM_DEBUG(op.print(llvm::dbgs() << "\nLICM'ing op\n"));
      opsToMove.push_back(&op);
    }
  }

  // For all instructions that we found to be invariant, place them
  // sequentially right before the for loop.
  for (auto *op : opsToMove)
    op->moveBefore(forOp);

  LLVM_DEBUG(forOp.getOperation()->print(llvm::dbgs() << "\nModified loop\n"));

  // If the for loop body has a single operation (the terminator), erase it.
  if (forOp.getBody()->getOperations().size() == 1) {
    assert(forOp.getBody()->getOperations().front().isa<AffineTerminatorOp>());
    forOp.erase();
  }
}
/// Runs LICM over every affine.for loop in the current function.
void LoopInvariantCodeMotion::runOnFunction() {
  forOps.clear();
  // Collect all loops in the function in innermost-loop-first order. LICM'ing
  // from the inner loop outward lets ops placed into an outer loop be further
  // LICM'ed when that loop is processed, and avoids re-iterating over inner
  // loop operations while LICM'ing through the outer loop.
  getFunction().walk<AffineForOp>(
      [&](AffineForOp forOp) { forOps.push_back(forOp); });
  // Hoist from the gathered list rather than during the walk itself, so the
  // moves and erasures cannot mess the walk's iterators up.
  for (auto loop : forOps) {
    LLVM_DEBUG(
        loop.getOperation()->print(llvm::dbgs() << "\nOriginal loop\n"));
    runOnAffineForOp(loop);
  }
}
// Registers the pass with the global registry so it can be invoked as
// `mlir-opt -loop-invariant-code-motion`.
static PassRegistration<LoopInvariantCodeMotion>
    pass("loop-invariant-code-motion",
         "Hoist loop invariant instructions outside of the loop");

View File

@@ -0,0 +1,189 @@
// RUN: mlir-opt %s -loop-invariant-code-motion -split-input-file -verify | FileCheck %s
// Checks that %v0 (invariant to both loops) is hoisted above the outer loop,
// and that the store — invariant to the inner loop only — is hoisted into the
// outer loop, leaving the inner loop empty so it is erased.
// CHECK-LABEL: func @nested_loops_both_having_invariant_code
func @nested_loops_both_having_invariant_code() {
  %m = alloc() : memref<10xf32>
  %cf7 = constant 7.0 : f32
  %cf8 = constant 8.0 : f32
  affine.for %i0 = 0 to 10 {
    %v0 = addf %cf7, %cf8 : f32
    affine.for %i1 = 0 to 10 {
      store %v0, %m[%i0] : memref<10xf32>
    }
  }

  // CHECK: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: %cst = constant 7.000000e+00 : f32
  // CHECK-NEXT: %cst_0 = constant 8.000000e+00 : f32
  // CHECK-NEXT: %1 = addf %cst, %cst_0 : f32
  // CHECK-NEXT: affine.for %i0 = 0 to 10 {
  // CHECK-NEXT: store %1, %0[%i0] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}
// Checks that nothing inside the loop is hoisted: the affine.apply uses the
// induction variable directly, and the store uses the apply's result.
// CHECK-LABEL: func @store_affine_apply
func @store_affine_apply() -> memref<10xf32> {
  %cf7 = constant 7.0 : f32
  %m = alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    // %t0 depends on %i0, and the store depends on %t0, so both must remain
    // inside the loop.
    %t0 = affine.apply (d1) -> (d1 + 1)(%i0)
    store %cf7, %m[%t0] : memref<10xf32>
  }
  return %m : memref<10xf32>
  // CHECK: %cst = constant 7.000000e+00 : f32
  // CHECK-NEXT: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: affine.for %i0 = 0 to 10 {
  // CHECK-NEXT: %1 = affine.apply #map2(%i0)
  // CHECK-NEXT: store %cst, %0[%1] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return %0 : memref<10xf32>
}
// Checks that an op invariant to both loops is hoisted all the way out, and
// that both loops — empty after the hoist — are erased.
// CHECK-LABEL: func @nested_loops_code_invariant_to_both
func @nested_loops_code_invariant_to_both() {
  %m = alloc() : memref<10xf32>
  %cf7 = constant 7.0 : f32
  %cf8 = constant 8.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %v0 = addf %cf7, %cf8 : f32
    }
  }

  // CHECK: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: %cst = constant 7.000000e+00 : f32
  // CHECK-NEXT: %cst_0 = constant 8.000000e+00 : f32
  // CHECK-NEXT: %1 = addf %cst, %cst_0 : f32
  // CHECK-NEXT: return

  return
}
// Checks that no op is hoisted when every op in the body depends (directly or
// transitively) on the induction variable.
// CHECK-LABEL: func @single_loop_nothing_invariant
func @single_loop_nothing_invariant() {
  %m1 = alloc() : memref<10xf32>
  %m2 = alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    %v0 = load %m1[%i0] : memref<10xf32>
    %v1 = load %m2[%i0] : memref<10xf32>
    %v2 = addf %v0, %v1 : f32
    store %v2, %m1[%i0] : memref<10xf32>
  }

  // CHECK: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: %1 = alloc() : memref<10xf32>
  // CHECK-NEXT: affine.for %i0 = 0 to 10 {
  // CHECK-NEXT: %2 = load %0[%i0] : memref<10xf32>
  // CHECK-NEXT: %3 = load %1[%i0] : memref<10xf32>
  // CHECK-NEXT: %4 = addf %2, %3 : f32
  // CHECK-NEXT: store %4, %0[%i0] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}
// Checks that the affine.if stays in the loop (it uses %t0, which depends on
// the IV) and that ops nested inside the if region are not hoisted — the pass
// only examines ops directly in the loop body.
// CHECK-LABEL: func @invariant_code_inside_affine_if
func @invariant_code_inside_affine_if() {
  %m = alloc() : memref<10xf32>
  %cf8 = constant 8.0 : f32
  affine.for %i0 = 0 to 10 {
    %t0 = affine.apply (d1) -> (d1 + 1)(%i0)
    affine.if (d0, d1) : (d1 - d0 >= 0) (%i0, %t0) {
      %cf9 = addf %cf8, %cf8 : f32
      store %cf9, %m[%i0] : memref<10xf32>
    }
  }

  // CHECK: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: %cst = constant 8.000000e+00 : f32
  // CHECK-NEXT: affine.for %i0 = 0 to 10 {
  // CHECK-NEXT: %1 = affine.apply #map2(%i0)
  // CHECK-NEXT: affine.if #set0(%i0, %1) {
  // CHECK-NEXT: %2 = addf %cst, %cst : f32
  // CHECK-NEXT: store %2, %0[%i0] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}
// Checks that ops invariant to both loops (%v0, %v1) end up above the outer
// loop, while each store remains in the innermost loop whose IV it uses.
// CHECK-LABEL: func @nested_loops_with_common_and_uncommon_invariant_code
func @nested_loops_with_common_and_uncommon_invariant_code() {
  %m = alloc() : memref<10xf32>
  %cf7 = constant 7.0 : f32
  %cf8 = constant 8.0 : f32
  affine.for %i0 = 0 to 10 {
    %v0 = addf %cf7, %cf8 : f32
    affine.for %i1 = 0 to 10 {
      %v1 = addf %cf7, %cf7 : f32
      store %v0, %m[%i1] : memref<10xf32>
      store %v0, %m[%i0] : memref<10xf32>
    }
  }

  // CHECK: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: %cst = constant 7.000000e+00 : f32
  // CHECK-NEXT: %cst_0 = constant 8.000000e+00 : f32
  // CHECK-NEXT: %1 = addf %cst, %cst_0 : f32
  // CHECK-NEXT: %2 = addf %cst, %cst : f32
  // CHECK-NEXT: affine.for %i0 = 0 to 10 {
  // CHECK-NEXT: store %1, %0[%i0] : memref<10xf32>
  // CHECK-NEXT: affine.for %i1 = 0 to 10 {
  // CHECK-NEXT: store %1, %0[%i1] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}
// Checks that an affine.if using only %i0 is hoisted out of the inner loop
// (which becomes empty and is erased) but stays inside the outer loop whose
// IV it depends on.
// CHECK-LABEL: func @invariant_affine_if
func @invariant_affine_if() {
  %m = alloc() : memref<10xf32>
  %cf8 = constant 8.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.if (d0, d1) : (d1 - d0 >= 0) (%i0, %i0) {
        %cf9 = addf %cf8, %cf8 : f32
        store %cf9, %m[%i0] : memref<10xf32>
      }
    }
  }

  // CHECK: %0 = alloc() : memref<10xf32>
  // CHECK-NEXT: %cst = constant 8.000000e+00 : f32
  // CHECK-NEXT: affine.for %i0 = 0 to 10 {
  // CHECK-NEXT: affine.if #set0(%i0, %i0) {
  // CHECK-NEXT: %1 = addf %cst, %cst : f32
  // CHECK-NEXT: store %1, %0[%i0] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}
// Checks that the constant index and the load through it — neither of which
// depends on the IV — are hoisted, while the IV-indexed store stays.
// CHECK-LABEL: func @invariant_constant_and_load
func @invariant_constant_and_load() {
  %m = alloc() : memref<100xf32>
  affine.for %i0 = 0 to 5 {
    %c0 = constant 0 : index
    %v = load %m[%c0] : memref<100xf32>
    store %v, %m[%i0] : memref<100xf32>
  }

  // CHECK: %0 = alloc() : memref<100xf32>
  // CHECK-NEXT: %c0 = constant 0 : index
  // CHECK-NEXT: %1 = load %0[%c0] : memref<100xf32>
  // CHECK-NEXT: affine.for %i0 = 0 to 5 {
  // CHECK-NEXT: store %1, %0[%i0] : memref<100xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}