NFC - Move explicit copy/dma generation utility out of pass and into LoopUtils
- Turn the copy/DMA generation method into a utility in LoopUtils, allowing it to be reused elsewhere.
- No functional/logic change to the pass/utility.
- Trim down header includes in affected files.

Signed-off-by: Uday Bondhugula <uday@polymagelabs.com>

Closes tensorflow/mlir#124

COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/124 from bondhugula:datacopy 9f346e62e5bd9dd1986720a30a35f302eb4d3252
PiperOrigin-RevId: 269106088
Committed by: A. Unique TensorFlower
Parent: 1366467a3b
Commit: 4f32ae61b4
@@ -24,11 +24,11 @@
|
||||
#ifndef MLIR_TRANSFORMS_LOOP_UTILS_H
|
||||
#define MLIR_TRANSFORMS_LOOP_UTILS_H
|
||||
|
||||
#include "mlir/IR/Block.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "mlir/Support/LogicalResult.h"
|
||||
|
||||
namespace mlir {
|
||||
class AffineMap;
|
||||
class AffineForOp;
|
||||
class FuncOp;
|
||||
class OpBuilder;
|
||||
@@ -159,6 +159,34 @@ Loops tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value *> sizes,
|
||||
/// Returns the newly created intra-tile loops.
|
||||
Loops tilePerfectlyNested(loop::ForOp rootForOp, ArrayRef<Value *> sizes);
|
||||
|
||||
/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
|
||||
struct AffineCopyOptions {
|
||||
// True if DMAs should be generated instead of point-wise copies.
|
||||
bool generateDma;
|
||||
// The slower memory space from which data is to be moved.
|
||||
unsigned slowMemorySpace;
|
||||
// Memory space of the faster one (typically a scratchpad).
|
||||
unsigned fastMemorySpace;
|
||||
// Memory space to place tags in: only meaningful for DMAs.
|
||||
unsigned tagMemorySpace;
|
||||
// Capacity of the fast memory space in bytes.
|
||||
uint64_t fastMemCapacityBytes;
|
||||
};
|
||||
|
||||
/// Performs explicit copying for the contiguous sequence of operations in the
|
||||
/// block iterator range [`begin', `end'), where `end' can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). Returns the total size of fast memory space
|
||||
/// buffers used. `copyOptions` provides various parameters, and the output
|
||||
/// argument `copyNests` is the set of all copy nests inserted, each represented
|
||||
/// by its root affine.for. Since we generate alloc's and dealloc's for all fast
|
||||
/// buffers (before and after the range of operations resp. or at a hoisted
|
||||
/// position), all of the fast memory capacity is assumed to be available for
|
||||
/// processing this block range.
|
||||
uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
|
||||
const AffineCopyOptions ©Options,
|
||||
DenseSet<Operation *> ©Nests);
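For illustration only, a caller could drive the new utility like the sketch below; the wrapper name, memory-space numbers, and capacity are assumptions, not part of the API:

// Hypothetical helper sketch: generate point-wise copies for an entire block,
// stopping short of the terminator as the contract above requires.
static uint64_t generateCopiesForBlock(Block *block) {
  if (block->empty())
    return 0;
  AffineCopyOptions options = {/*generateDma=*/false,
                               /*slowMemorySpace=*/0,
                               /*fastMemorySpace=*/1,  // assumed scratchpad space
                               /*tagMemorySpace=*/0,   // unused for point-wise copies
                               /*fastMemCapacityBytes=*/32 * 1024};
  DenseSet<Operation *> copyNests;
  return affineDataCopyGenerate(block->begin(), std::prev(block->end()),
                                options, copyNests);
}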
|
||||
|
||||
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
|
||||
/// parametric tile sizes that the outer loops have a fixed number of iterations
|
||||
/// as defined in `sizes`.
|
||||
|
||||
@@ -28,10 +28,8 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Analysis/AffineStructures.h"
|
||||
#include "mlir/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/AffineOps/AffineOps.h"
|
||||
#include "mlir/Dialect/StandardOps/Ops.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
@@ -113,30 +111,7 @@ struct AffineDataCopyGeneration
|
||||
skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}
|
||||
|
||||
void runOnFunction() override;
|
||||
LogicalResult runOnBlock(Block *block);
|
||||
uint64_t runOnBlock(Block::iterator begin, Block::iterator end);
|
||||
|
||||
LogicalResult generateCopy(const MemRefRegion ®ion, Block *block,
|
||||
Block::iterator begin, Block::iterator end,
|
||||
Block *copyPlacementBlock,
|
||||
Block::iterator copyInPlacementStart,
|
||||
Block::iterator copyOutPlacementStart,
|
||||
uint64_t *sizeInBytes, Block::iterator *nBegin,
|
||||
Block::iterator *nEnd);
|
||||
|
||||
// List of memory regions to copy for. We need a map vector to have a
|
||||
// guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
|
||||
// since the alloc's for example are identical except for the SSA id.
|
||||
SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> readRegions;
|
||||
SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> writeRegions;
|
||||
|
||||
// Nests that are copy in's or copy out's; the root AffineForOp of that
|
||||
// nest is stored herein.
|
||||
DenseSet<Operation *> copyNests;
|
||||
|
||||
// Map from original memref's to the fast buffers that their accesses are
|
||||
// replaced with.
|
||||
DenseMap<Value *, Value *> fastBufferMap;
|
||||
LogicalResult runOnBlock(Block *block, DenseSet<Operation *> ©Nests);
|
||||
|
||||
// Slow memory space associated with copies.
|
||||
const unsigned slowMemorySpace;
|
||||
@@ -173,427 +148,20 @@ std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
|
||||
fastMemCapacityBytes);
|
||||
}
|
||||
|
||||
// Info comprising stride and number of elements transferred every stride.
|
||||
struct StrideInfo {
|
||||
int64_t stride;
|
||||
int64_t numEltPerStride;
|
||||
};
|
||||
|
||||
/// Returns striding information for a copy/transfer of this region with
|
||||
/// potentially multiple striding levels from outermost to innermost. For an
|
||||
/// n-dimensional region, there can be at most n-1 levels of striding
|
||||
/// successively nested.
|
||||
// TODO(bondhugula): make this work with non-identity layout maps.
|
||||
static void getMultiLevelStrides(const MemRefRegion ®ion,
|
||||
ArrayRef<int64_t> bufferShape,
|
||||
SmallVectorImpl<StrideInfo> *strideInfos) {
|
||||
if (bufferShape.size() <= 1)
|
||||
return;
|
||||
|
||||
int64_t numEltPerStride = 1;
|
||||
int64_t stride = 1;
|
||||
for (int d = bufferShape.size() - 1; d >= 1; d--) {
|
||||
int64_t dimSize = region.memref->getType().cast<MemRefType>().getDimSize(d);
|
||||
stride *= dimSize;
|
||||
numEltPerStride *= bufferShape[d];
|
||||
// A stride is needed only if the region has a shorter extent than the
|
||||
// memref along the dimension *and* has an extent greater than one along the
|
||||
// next major dimension.
|
||||
if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
|
||||
strideInfos->push_back({stride, numEltPerStride});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct the memref region to just include the entire memref. Returns false
/// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
|
||||
/// enclosing loop IVs of opInst (starting from the outermost) that the region
|
||||
/// is parametric on.
|
||||
static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs,
|
||||
MemRefRegion *region) {
|
||||
unsigned rank;
|
||||
if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
|
||||
rank = loadOp.getMemRefType().getRank();
|
||||
region->memref = loadOp.getMemRef();
|
||||
region->setWrite(false);
|
||||
} else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
|
||||
rank = storeOp.getMemRefType().getRank();
|
||||
region->memref = storeOp.getMemRef();
|
||||
region->setWrite(true);
|
||||
} else {
|
||||
assert(false && "expected load or store op");
|
||||
return false;
|
||||
}
|
||||
auto memRefType = region->memref->getType().cast<MemRefType>();
|
||||
if (!memRefType.hasStaticShape())
|
||||
return false;
|
||||
|
||||
auto *regionCst = region->getConstraints();
|
||||
|
||||
// Just get the first numSymbols IVs, which the memref region is parametric
|
||||
// on.
|
||||
SmallVector<AffineForOp, 4> ivs;
|
||||
getLoopIVs(*opInst, &ivs);
|
||||
ivs.resize(numParamLoopIVs);
|
||||
SmallVector<Value *, 4> symbols;
|
||||
extractForInductionVars(ivs, &symbols);
|
||||
regionCst->reset(rank, numParamLoopIVs, 0);
|
||||
regionCst->setIdValues(rank, rank + numParamLoopIVs, symbols);
|
||||
|
||||
// Memref dim sizes provide the bounds.
|
||||
for (unsigned d = 0; d < rank; d++) {
|
||||
auto dimSize = memRefType.getDimSize(d);
|
||||
assert(dimSize > 0 && "filtered dynamic shapes above");
|
||||
regionCst->addConstantLowerBound(d, 0);
|
||||
regionCst->addConstantUpperBound(d, dimSize - 1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
|
||||
emitRemarkForBlock(Block &block) {
|
||||
return block.getParentOp()->emitRemark();
|
||||
}
|
||||
|
||||
/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
|
||||
/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart'
|
||||
/// holds the lower coordinates of the region in the original memref to copy
|
||||
/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in.
|
||||
static AffineForOp generatePointWiseCopy(Location loc, Value *memref,
|
||||
Value *fastMemRef,
|
||||
ArrayRef<Value *> memIndicesStart,
|
||||
ArrayRef<int64_t> fastBufferShape,
|
||||
bool isCopyOut, OpBuilder b) {
|
||||
assert(!memIndicesStart.empty() && "only 1-d or more memrefs");
|
||||
|
||||
// The copy-in nest is generated as follows as an example for a 2-d region:
|
||||
// for x = ...
|
||||
// for y = ...
|
||||
// fast_buf[x][y] = buf[mem_x + x][mem_y + y]
|
||||
|
||||
SmallVector<Value *, 4> fastBufIndices, memIndices;
|
||||
AffineForOp copyNestRoot;
|
||||
for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) {
|
||||
auto forOp = b.create<AffineForOp>(loc, 0, fastBufferShape[d]);
|
||||
if (d == 0)
|
||||
copyNestRoot = forOp;
|
||||
b = forOp.getBodyBuilder();
|
||||
fastBufIndices.push_back(forOp.getInductionVar());
|
||||
// Construct the subscript for the slow memref being copied.
|
||||
SmallVector<Value *, 2> operands = {memIndicesStart[d],
|
||||
forOp.getInductionVar()};
|
||||
auto memIndex = b.create<AffineApplyOp>(
|
||||
loc,
|
||||
b.getAffineMap(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)),
|
||||
operands);
|
||||
memIndices.push_back(memIndex);
|
||||
}
|
||||
|
||||
if (!isCopyOut) {
|
||||
// Copy in.
|
||||
auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
|
||||
b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufIndices);
|
||||
return copyNestRoot;
|
||||
}
|
||||
|
||||
// Copy out.
|
||||
auto load = b.create<AffineLoadOp>(loc, fastMemRef, fastBufIndices);
|
||||
b.create<AffineStoreOp>(loc, load, memref, memIndices);
|
||||
return copyNestRoot;
|
||||
}
|
||||
|
||||
/// Creates a buffer in the faster memory space for the specified memref region;
|
||||
/// generates a copy from the lower memory space to this one, and replaces all
|
||||
/// loads/stores in the block range [`begin', `end') of `block' so that they load/store
|
||||
/// from that buffer. Returns failure if copies could not be generated due to
|
||||
/// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
|
||||
/// in copyPlacementBlock specify the insertion points where the incoming copies
|
||||
/// and outgoing copies, respectively, should be inserted (the insertion happens
|
||||
/// right before the insertion point). Since `begin` can itself be invalidated
|
||||
/// due to the memref rewriting done from this method, the output argument
|
||||
/// `nBegin` is set to its replacement (set to `begin` if no invalidation
|
||||
/// happens). Since outgoing copies could have been inserted at `end`, the
|
||||
/// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
|
||||
/// size of the fast buffer allocated.
|
||||
LogicalResult AffineDataCopyGeneration::generateCopy(
|
||||
const MemRefRegion ®ion, Block *block, Block::iterator begin,
|
||||
Block::iterator end, Block *copyPlacementBlock,
|
||||
Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
|
||||
uint64_t *sizeInBytes, Block::iterator *nBegin, Block::iterator *nEnd) {
|
||||
*nBegin = begin;
|
||||
*nEnd = end;
|
||||
|
||||
if (begin == end)
|
||||
return success();
|
||||
|
||||
// Is the copy out point at the end of the block where we are doing
|
||||
// explicit copying.
|
||||
bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
|
||||
|
||||
// Copies for read regions are going to be inserted at 'begin'.
|
||||
OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
|
||||
// Copies for write regions are going to be inserted at 'end'.
|
||||
OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
|
||||
OpBuilder &b = region.isWrite() ? epilogue : prologue;
|
||||
|
||||
// Builder to create constants at the top level.
|
||||
auto func = copyPlacementBlock->getParent()->getParentOfType<FuncOp>();
|
||||
OpBuilder top(func.getBody());
|
||||
|
||||
auto loc = region.loc;
|
||||
auto *memref = region.memref;
|
||||
auto memRefType = memref->getType().cast<MemRefType>();
|
||||
|
||||
auto layoutMaps = memRefType.getAffineMaps();
|
||||
if (layoutMaps.size() > 1 ||
|
||||
(layoutMaps.size() == 1 && !layoutMaps[0].isIdentity())) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
|
||||
return failure();
|
||||
}
|
||||
|
||||
// Indices to use for the copying.
|
||||
// Indices for the original memref being copied from/to.
|
||||
SmallVector<Value *, 4> memIndices;
|
||||
// Indices for the faster buffer being copied into/from.
|
||||
SmallVector<Value *, 4> bufIndices;
|
||||
|
||||
unsigned rank = memRefType.getRank();
|
||||
SmallVector<int64_t, 4> fastBufferShape;
|
||||
|
||||
// Compute the extents of the buffer.
|
||||
std::vector<SmallVector<int64_t, 4>> lbs;
|
||||
SmallVector<int64_t, 8> lbDivisors;
|
||||
lbs.reserve(rank);
|
||||
Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
|
||||
&fastBufferShape, &lbs, &lbDivisors);
|
||||
if (!numElements.hasValue()) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
|
||||
return failure();
|
||||
}
|
||||
|
||||
if (numElements.getValue() == 0) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
|
||||
*sizeInBytes = 0;
|
||||
return success();
|
||||
}
|
||||
|
||||
const FlatAffineConstraints *cst = region.getConstraints();
|
||||
// 'regionSymbols' holds values that this memory region is symbolic/parametric
|
||||
// on; these typically include loop IVs surrounding the level at which the
|
||||
// copy generation is being done or other valid symbols in MLIR.
|
||||
SmallVector<Value *, 8> regionSymbols;
|
||||
cst->getIdValues(rank, cst->getNumIds(), ®ionSymbols);
|
||||
|
||||
// Construct the index expressions for the fast memory buffer. The index
|
||||
// expression for a particular dimension of the fast buffer is obtained by
|
||||
// subtracting out the lower bound on the original memref's data region
|
||||
// along the corresponding dimension.
|
||||
|
||||
// Index start offsets for faster memory buffer relative to the original.
|
||||
SmallVector<AffineExpr, 4> offsets;
|
||||
offsets.reserve(rank);
|
||||
for (unsigned d = 0; d < rank; d++) {
|
||||
assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
|
||||
|
||||
AffineExpr offset = top.getAffineConstantExpr(0);
|
||||
for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
|
||||
offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
|
||||
}
|
||||
assert(lbDivisors[d] > 0);
|
||||
offset =
|
||||
(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
|
||||
|
||||
// Set copy start location for this dimension in the lower memory space
|
||||
// memref.
|
||||
if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
|
||||
auto indexVal = caf.getValue();
|
||||
if (indexVal == 0) {
|
||||
memIndices.push_back(zeroIndex);
|
||||
} else {
|
||||
memIndices.push_back(
|
||||
top.create<ConstantIndexOp>(loc, indexVal).getResult());
|
||||
}
|
||||
} else {
|
||||
// The coordinate for the start location is just the lower bound along the
|
||||
// corresponding dimension on the memory region (stored in 'offset').
|
||||
auto map = top.getAffineMap(
|
||||
cst->getNumDimIds() + cst->getNumSymbolIds() - rank, 0, offset);
|
||||
memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
|
||||
}
|
||||
// The fast buffer is copied into at location zero; addressing is relative.
|
||||
bufIndices.push_back(zeroIndex);
|
||||
|
||||
// Record the offsets since they are needed to remap the memory accesses of
|
||||
// the original memref further below.
|
||||
offsets.push_back(offset);
|
||||
}
|
||||
|
||||
// The faster memory space buffer.
|
||||
Value *fastMemRef;
|
||||
|
||||
// Check if a buffer was already created.
|
||||
bool existingBuf = fastBufferMap.count(memref) > 0;
|
||||
if (!existingBuf) {
|
||||
auto fastMemRefType = top.getMemRefType(
|
||||
fastBufferShape, memRefType.getElementType(), {}, fastMemorySpace);
|
||||
|
||||
// Create the fast memory space buffer just before the 'affine.for'
|
||||
// operation.
|
||||
fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType).getResult();
|
||||
// Record it.
|
||||
fastBufferMap[memref] = fastMemRef;
|
||||
// fastMemRefType is a constant shaped memref.
|
||||
*sizeInBytes = getMemRefSizeInBytes(fastMemRefType).getValue();
|
||||
LLVM_DEBUG(emitRemarkForBlock(*block)
|
||||
<< "Creating fast buffer of type " << fastMemRefType
|
||||
<< " and size " << llvm::divideCeil(*sizeInBytes, 1024)
|
||||
<< " KiB\n");
|
||||
} else {
|
||||
// Reuse the one already created.
|
||||
fastMemRef = fastBufferMap[memref];
|
||||
*sizeInBytes = 0;
|
||||
}
|
||||
|
||||
auto numElementsSSA =
|
||||
top.create<ConstantIndexOp>(loc, numElements.getValue());
|
||||
|
||||
SmallVector<StrideInfo, 4> strideInfos;
|
||||
getMultiLevelStrides(region, fastBufferShape, &strideInfos);
|
||||
|
||||
// TODO(bondhugula): use all stride levels once DmaStartOp is extended for
|
||||
// multi-level strides.
|
||||
if (strideInfos.size() > 1) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
|
||||
return failure();
|
||||
}
|
||||
|
||||
Value *stride = nullptr;
|
||||
Value *numEltPerStride = nullptr;
|
||||
if (!strideInfos.empty()) {
|
||||
stride = top.create<ConstantIndexOp>(loc, strideInfos[0].stride);
|
||||
numEltPerStride =
|
||||
top.create<ConstantIndexOp>(loc, strideInfos[0].numEltPerStride);
|
||||
}
|
||||
|
||||
// Record the last operation where we want the memref replacement to end. We
|
||||
// later do the memref replacement only in [begin, postDomFilter] so
|
||||
// that the original memref's used in the data movement code themselves don't
|
||||
// get replaced.
|
||||
auto postDomFilter = std::prev(end);
|
||||
|
||||
// Create fully composed affine maps for each memref.
|
||||
auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
|
||||
fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
|
||||
auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
|
||||
fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
|
||||
|
||||
if (!generateDma) {
|
||||
// Point-wise copy generation.
|
||||
auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memIndices,
|
||||
fastBufferShape,
|
||||
/*isCopyOut=*/region.isWrite(), b);
|
||||
|
||||
// Record this so that we can skip it from yet another copy.
|
||||
copyNests.insert(copyNest);
|
||||
|
||||
// Since new ops are being appended (for copy out's), adjust the end to
|
||||
// mark end of block range being processed if necessary.
|
||||
if (region.isWrite() && isCopyOutAtEndOfBlock)
|
||||
*nEnd = Block::iterator(copyNest.getOperation());
|
||||
} else {
|
||||
// DMA generation.
|
||||
// Create a tag (single element 1-d memref) for the DMA.
|
||||
auto tagMemRefType =
|
||||
top.getMemRefType({1}, top.getIntegerType(32), {}, tagMemorySpace);
|
||||
auto tagMemRef = prologue.create<AllocOp>(loc, tagMemRefType);
|
||||
|
||||
SmallVector<Value *, 4> tagIndices({zeroIndex});
|
||||
auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
|
||||
fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
|
||||
if (!region.isWrite()) {
|
||||
// DMA non-blocking read from original buffer to fast buffer.
|
||||
b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
|
||||
fastMemRef, bufAffineMap, bufIndices,
|
||||
tagMemRef, tagAffineMap, tagIndices,
|
||||
numElementsSSA, stride, numEltPerStride);
|
||||
} else {
|
||||
// DMA non-blocking write from fast buffer to the original memref.
|
||||
auto op = b.create<AffineDmaStartOp>(
|
||||
loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
|
||||
memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
|
||||
stride, numEltPerStride);
|
||||
// Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
|
||||
// end to mark end of block range being processed.
|
||||
if (isCopyOutAtEndOfBlock)
|
||||
*nEnd = Block::iterator(op.getOperation());
|
||||
}
|
||||
|
||||
// Matching DMA wait to block on completion; tag always has a 0 index.
|
||||
b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
|
||||
numElementsSSA);
|
||||
|
||||
// Generate dealloc for the tag.
|
||||
auto tagDeallocOp = epilogue.create<DeallocOp>(loc, tagMemRef);
|
||||
if (*nEnd == end && isCopyOutAtEndOfBlock)
|
||||
// Since new ops are being appended (for outgoing DMAs), adjust the end to
|
||||
// mark end of range of the original.
|
||||
*nEnd = Block::iterator(tagDeallocOp.getOperation());
|
||||
}
|
||||
|
||||
// Generate dealloc for the buffer.
|
||||
if (!existingBuf) {
|
||||
auto bufDeallocOp = epilogue.create<DeallocOp>(loc, fastMemRef);
|
||||
// When generating pointwise copies, `nEnd' has to be set to deallocOp on
|
||||
// the fast buffer (since it marks the new end insertion point).
|
||||
if (!generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
|
||||
*nEnd = Block::iterator(bufDeallocOp.getOperation());
|
||||
}
|
||||
|
||||
// Replace all uses of the old memref with the faster one while remapping
|
||||
// access indices (subtracting out lower bound offsets for each dimension).
|
||||
// Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
|
||||
// index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
|
||||
// i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
|
||||
// and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
|
||||
// d2, d3 correspond to the original indices (%i, %j).
|
||||
SmallVector<AffineExpr, 4> remapExprs;
|
||||
remapExprs.reserve(rank);
|
||||
for (unsigned i = 0; i < rank; i++) {
|
||||
// The starting operands of indexRemap will be regionSymbols (the symbols on
|
||||
// which the memref region is parametric); then those corresponding to
|
||||
// the memref's original indices follow.
|
||||
auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
|
||||
remapExprs.push_back(dimExpr - offsets[i]);
|
||||
}
|
||||
auto indexRemap = b.getAffineMap(regionSymbols.size() + rank, 0, remapExprs);
|
||||
|
||||
// Record the begin since it may be invalidated by memref replacement.
|
||||
Block::iterator prevOfBegin;
|
||||
bool isBeginAtStartOfBlock = (begin == block->begin());
|
||||
if (!isBeginAtStartOfBlock)
|
||||
prevOfBegin = std::prev(begin);
|
||||
|
||||
// *Only* those uses within the range [begin, end) of 'block' are replaced.
|
||||
if (failed(replaceAllMemRefUsesWith(memref, fastMemRef,
|
||||
/*extraIndices=*/{}, indexRemap,
|
||||
/*extraOperands=*/regionSymbols,
|
||||
/*domInstFilter=*/&*begin,
|
||||
/*postDomInstFilter=*/&*postDomFilter)))
|
||||
llvm_unreachable("memref replacement guaranteed to succeed here");
|
||||
|
||||
*nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Generate copies for this block. The block is partitioned into separate
|
||||
/// ranges: each range is either a sequence of one or more operations starting
|
||||
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for ops nested within).
|
||||
LogicalResult AffineDataCopyGeneration::runOnBlock(Block *block) {
|
||||
LogicalResult
|
||||
AffineDataCopyGeneration::runOnBlock(Block *block,
|
||||
DenseSet<Operation *> ©Nests) {
|
||||
if (block->empty())
|
||||
return success();
|
||||
|
||||
AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
|
||||
fastMemorySpace, tagMemorySpace,
|
||||
fastMemCapacityBytes};
|
||||
|
||||
// Every affine.for op in the block starts and ends a block range for copying;
|
||||
// in addition, a contiguous sequence of operations starting with a
|
||||
// load/store op but not including any copy nests themselves is also
|
||||
@@ -620,7 +188,8 @@ LogicalResult AffineDataCopyGeneration::runOnBlock(Block *block) {
|
||||
// If you hit a non-copy for loop, we will split there.
|
||||
if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
|
||||
// Perform the copying up until this 'for' op first.
|
||||
runOnBlock(/*begin=*/curBegin, /*end=*/it);
|
||||
affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
|
||||
copyNests);
|
||||
|
||||
// Returns true if the footprint is known to exceed capacity.
|
||||
auto exceedsCapacity = [&](AffineForOp forOp) {
|
||||
@@ -643,7 +212,7 @@ LogicalResult AffineDataCopyGeneration::runOnBlock(Block *block) {
|
||||
if (recurseInner) {
|
||||
// We'll recurse and do the copies at an inner level for 'forInst'.
|
||||
// Recurse onto the body of this loop.
|
||||
runOnBlock(forOp.getBody());
|
||||
runOnBlock(forOp.getBody(), copyNests);
|
||||
} else {
|
||||
// We have enough capacity, i.e., copies will be computed for the
|
||||
// portion of the block until 'it', and for 'it', which is 'forOp'. Note
|
||||
@@ -653,7 +222,8 @@ LogicalResult AffineDataCopyGeneration::runOnBlock(Block *block) {
|
||||
// Inner loop copies have their own scope - we don't thus update
|
||||
// consumed capacity. The footprint check above guarantees this inner
|
||||
// loop's footprint fits.
|
||||
runOnBlock(/*begin=*/it, /*end=*/std::next(it));
|
||||
affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
|
||||
copyNests);
|
||||
}
|
||||
// Get to the next load or store op after 'forOp'.
|
||||
curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
|
||||
@@ -675,243 +245,27 @@ LogicalResult AffineDataCopyGeneration::runOnBlock(Block *block) {
|
||||
// Can't be a terminator because it would have been skipped above.
|
||||
assert(!curBegin->isKnownTerminator() && "can't be a terminator");
|
||||
// Exclude the affine terminator - hence, the std::prev.
|
||||
runOnBlock(/*begin=*/curBegin, /*end=*/std::prev(block->end()));
|
||||
affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
|
||||
copyOptions, copyNests);
|
||||
}
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Given a memref region, determine the lowest depth at which transfers can be
|
||||
/// placed for it, and return the corresponding block, start and end positions
|
||||
/// in the block for placing incoming (read) and outgoing (write) copies
|
||||
/// respectively. The lowest depth depends on whether the region being accessed
|
||||
/// is hoistable with respect to one or more immediately surrounding loops.
|
||||
static void
|
||||
findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block,
|
||||
Block::iterator &begin, Block::iterator &end,
|
||||
Block **copyPlacementBlock,
|
||||
Block::iterator *copyInPlacementStart,
|
||||
Block::iterator *copyOutPlacementStart) {
|
||||
const auto *cst = region.getConstraints();
|
||||
SmallVector<Value *, 4> symbols;
|
||||
cst->getIdValues(cst->getNumDimIds(), cst->getNumDimAndSymbolIds(), &symbols);
|
||||
|
||||
SmallVector<AffineForOp, 4> enclosingFors;
|
||||
getLoopIVs(*block.begin(), &enclosingFors);
|
||||
// Walk up loop parents till we find an IV on which this region is
|
||||
// symbolic/variant.
|
||||
auto it = enclosingFors.rbegin();
|
||||
for (auto e = enclosingFors.rend(); it != e; ++it) {
|
||||
// TODO(bondhugula): also need to be checking this for regions symbols that
|
||||
// aren't loop IVs, whether we are within their resp. defs' dominance scope.
|
||||
if (llvm::is_contained(symbols, it->getInductionVar()))
|
||||
break;
|
||||
}
|
||||
|
||||
if (it != enclosingFors.rbegin()) {
|
||||
auto lastInvariantIV = *std::prev(it);
|
||||
*copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
|
||||
*copyOutPlacementStart = std::next(*copyInPlacementStart);
|
||||
*copyPlacementBlock = lastInvariantIV.getOperation()->getBlock();
|
||||
} else {
|
||||
*copyInPlacementStart = begin;
|
||||
*copyOutPlacementStart = end;
|
||||
*copyPlacementBlock = █
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates copies for a contiguous sequence of operations in `block` in the
|
||||
/// iterator range [`begin', `end'), where `end' can't be past the terminator of
|
||||
/// the block (since additional operations are potentially inserted right before
|
||||
/// `end'). Returns the total size of the fast buffers used.
|
||||
// Since we generate alloc's and dealloc's for all fast buffers (before and
|
||||
// after the range of operations resp.), all of the fast memory capacity is
|
||||
// assumed to be available for processing this block range.
|
||||
uint64_t AffineDataCopyGeneration::runOnBlock(Block::iterator begin,
|
||||
Block::iterator end) {
|
||||
if (begin == end)
|
||||
return 0;
|
||||
|
||||
assert(begin->getBlock() == std::prev(end)->getBlock() &&
|
||||
"Inconsistent block begin/end args");
|
||||
assert(end != end->getBlock()->end() && "end can't be the block terminator");
|
||||
|
||||
Block *block = begin->getBlock();
|
||||
|
||||
// Copies will be generated for this depth, i.e., symbolic in all loops
|
||||
// surrounding this block range.
|
||||
unsigned copyDepth = getNestingDepth(*begin);
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
|
||||
<< "\n");
|
||||
LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
|
||||
LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
|
||||
|
||||
readRegions.clear();
|
||||
writeRegions.clear();
|
||||
fastBufferMap.clear();
|
||||
|
||||
// To check for errors when walking the block.
|
||||
bool error = false;
|
||||
|
||||
// Walk this range of operations to gather all memory regions.
|
||||
block->walk(begin, end, [&](Operation *opInst) {
|
||||
// Gather regions to allocate to buffers in faster memory space.
|
||||
if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
|
||||
if (loadOp.getMemRefType().getMemorySpace() != slowMemorySpace)
|
||||
return;
|
||||
} else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
|
||||
if (storeOp.getMemRefType().getMemorySpace() != slowMemorySpace)
|
||||
return;
|
||||
} else {
|
||||
// Neither a load nor a store op.
|
||||
return;
|
||||
}
|
||||
|
||||
// Compute the MemRefRegion accessed.
|
||||
auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
|
||||
if (failed(region->compute(opInst, copyDepth))) {
|
||||
LLVM_DEBUG(llvm::dbgs()
|
||||
<< "Error obtaining memory region: semi-affine maps?\n");
|
||||
LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
|
||||
if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
|
||||
LLVM_DEBUG(
|
||||
opInst->emitError("Non-constant memref sizes not yet supported"));
|
||||
error = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Each memref has a single buffer associated with it irrespective of how
|
||||
// many load's and store's happen on it.
|
||||
// TODO(bondhugula): in the future, when regions don't intersect and satisfy
|
||||
// other properties (based on load/store regions), we could consider
|
||||
// multiple buffers per memref.
|
||||
|
||||
// Add to the appropriate region if it's not already in it, or take a
|
||||
// bounding box union with the existing one if it's already in there.
|
||||
// Note that a memref may have both read and write regions - so update the
|
||||
// region in the other list if one exists (write in case of read and vice
|
||||
// versa) since there is a single bounding box for a memref across all reads
|
||||
// and writes that happen on it.
|
||||
|
||||
// Attempts to update; returns true if 'region' exists in targetRegions.
|
||||
auto updateRegion =
|
||||
[&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4>
|
||||
&targetRegions) {
|
||||
auto it = targetRegions.find(region->memref);
|
||||
if (it == targetRegions.end())
|
||||
return false;
|
||||
|
||||
// Perform a union with the existing region.
|
||||
if (failed(it->second->unionBoundingBox(*region))) {
|
||||
LLVM_DEBUG(llvm::dbgs()
|
||||
<< "Memory region bounding box failed; "
|
||||
"over-approximating to the entire memref\n");
|
||||
// If the union fails, we will overapproximate.
|
||||
if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
|
||||
LLVM_DEBUG(opInst->emitError(
|
||||
"Non-constant memref sizes not yet supported"));
|
||||
error = true;
|
||||
return true;
|
||||
}
|
||||
it->second->getConstraints()->clearAndCopyFrom(
|
||||
*region->getConstraints());
|
||||
} else {
|
||||
// Union was computed and stored in 'it->second': copy to 'region'.
|
||||
region->getConstraints()->clearAndCopyFrom(
|
||||
*it->second->getConstraints());
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
bool existsInRead = updateRegion(readRegions);
|
||||
if (error)
|
||||
return;
|
||||
bool existsInWrite = updateRegion(writeRegions);
|
||||
if (error)
|
||||
return;
|
||||
|
||||
// Finally add it to the region list.
|
||||
if (region->isWrite() && !existsInWrite) {
|
||||
writeRegions[region->memref] = std::move(region);
|
||||
} else if (!region->isWrite() && !existsInRead) {
|
||||
readRegions[region->memref] = std::move(region);
|
||||
}
|
||||
});
|
||||
|
||||
if (error) {
|
||||
begin->emitError(
|
||||
"copy generation failed for one or more memref's in this block\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t totalCopyBuffersSizeInBytes = 0;
|
||||
bool ret = true;
|
||||
auto processRegions =
|
||||
[&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4>
|
||||
®ions) {
|
||||
for (const auto ®ionEntry : regions) {
|
||||
// For each region, hoist copy in/out past all hoistable
|
||||
// 'affine.for's.
|
||||
Block::iterator copyInPlacementStart, copyOutPlacementStart;
|
||||
Block *copyPlacementBlock;
|
||||
findHighestBlockForPlacement(
|
||||
*regionEntry.second, *block, begin, end, ©PlacementBlock,
|
||||
©InPlacementStart, ©OutPlacementStart);
|
||||
|
||||
uint64_t sizeInBytes;
|
||||
Block::iterator nBegin, nEnd;
|
||||
LogicalResult iRet =
|
||||
generateCopy(*regionEntry.second, block, begin, end,
|
||||
copyPlacementBlock, copyInPlacementStart,
|
||||
copyOutPlacementStart, &sizeInBytes, &nBegin, &nEnd);
|
||||
if (succeeded(iRet)) {
|
||||
// begin/end could have been invalidated, and need update.
|
||||
begin = nBegin;
|
||||
end = nEnd;
|
||||
totalCopyBuffersSizeInBytes += sizeInBytes;
|
||||
}
|
||||
ret = ret & succeeded(iRet);
|
||||
}
|
||||
};
|
||||
processRegions(readRegions);
|
||||
processRegions(writeRegions);
|
||||
|
||||
if (!ret) {
|
||||
begin->emitError(
|
||||
"copy generation failed for one or more memref's in this block\n");
|
||||
return totalCopyBuffersSizeInBytes;
|
||||
}
|
||||
|
||||
// For a range of operations, a note will be emitted at the caller.
|
||||
AffineForOp forOp;
|
||||
uint64_t sizeInKib = llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024);
|
||||
if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
|
||||
forOp.emitRemark()
|
||||
<< sizeInKib
|
||||
<< " KiB of copy buffers in fast memory space for this block\n";
|
||||
}
|
||||
|
||||
if (totalCopyBuffersSizeInBytes > fastMemCapacityBytes) {
|
||||
StringRef str = "Total size of all copy buffers' for this block "
|
||||
"exceeds fast memory capacity\n";
|
||||
block->getParentOp()->emitError(str);
|
||||
}
|
||||
|
||||
return totalCopyBuffersSizeInBytes;
|
||||
}
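Note that because generateCopy consults fastBufferMap, a memref appearing in both readRegions and writeRegions is given a single fast buffer: the second call finds the existing buffer, reuses it, and reports a size of zero, so its capacity is only counted once.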
|
||||
|
||||
void AffineDataCopyGeneration::runOnFunction() {
|
||||
FuncOp f = getFunction();
|
||||
OpBuilder topBuilder(f.getBody());
|
||||
zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);
|
||||
|
||||
// Nests that are copy-in's or copy-out's; the root AffineForOps of those
|
||||
// nests are stored herein.
|
||||
DenseSet<Operation *> copyNests;
|
||||
|
||||
// Clear recorded copy nests.
|
||||
copyNests.clear();
|
||||
|
||||
for (auto &block : f)
|
||||
runOnBlock(&block);
|
||||
runOnBlock(&block, copyNests);
|
||||
|
||||
// Promote any single iteration loops in the copy nests.
|
||||
for (auto nest : copyNests) {
|
||||
|
||||
@@ -22,29 +22,29 @@
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
||||
#include "mlir/Analysis/AffineAnalysis.h"
|
||||
#include "mlir/Analysis/AffineStructures.h"
|
||||
#include "mlir/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Analysis/SliceAnalysis.h"
|
||||
#include "mlir/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/AffineOps/AffineOps.h"
|
||||
#include "mlir/Dialect/LoopOps/LoopOps.h"
|
||||
#include "mlir/Dialect/StandardOps/Ops.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/IR/Function.h"
|
||||
#include "mlir/IR/Module.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
#include "mlir/Transforms/RegionUtils.h"
|
||||
#include "mlir/Transforms/Utils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/MapVector.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#define DEBUG_TYPE "LoopUtils"
|
||||
|
||||
using namespace mlir;
|
||||
using llvm::MapVector;
|
||||
using llvm::SetVector;
|
||||
using llvm::SmallMapVector;
|
||||
|
||||
/// Computes the cleanup loop lower bound of the loop being unrolled with
|
||||
/// the specified unroll factor; this bound will also be upper bound of the main
|
||||
@@ -1130,3 +1130,665 @@ void mlir::mapLoopToProcessorIds(loop::ForOp forOp,
|
||||
step = b.create<MulIOp>(loc, step, numProcs);
|
||||
forOp.setStep(step);
|
||||
}
|
||||
|
||||
/// Given a memref region, determine the lowest depth at which transfers can be
|
||||
/// placed for it, and return the corresponding block, start and end positions
|
||||
/// in the block for placing incoming (read) and outgoing (write) copies
|
||||
/// respectively. The lowest depth depends on whether the region being accessed
|
||||
/// is hoistable with respect to one or more immediately surrounding loops.
|
||||
static void
|
||||
findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block,
|
||||
Block::iterator &begin, Block::iterator &end,
|
||||
Block **copyPlacementBlock,
|
||||
Block::iterator *copyInPlacementStart,
|
||||
Block::iterator *copyOutPlacementStart) {
|
||||
const auto *cst = region.getConstraints();
|
||||
SmallVector<Value *, 4> symbols;
|
||||
cst->getIdValues(cst->getNumDimIds(), cst->getNumDimAndSymbolIds(), &symbols);
|
||||
|
||||
SmallVector<AffineForOp, 4> enclosingFors;
|
||||
getLoopIVs(*block.begin(), &enclosingFors);
|
||||
// Walk up loop parents till we find an IV on which this region is
|
||||
// symbolic/variant.
|
||||
auto it = enclosingFors.rbegin();
|
||||
for (auto e = enclosingFors.rend(); it != e; ++it) {
|
||||
// TODO(bondhugula): also need to be checking this for regions symbols that
|
||||
// aren't loop IVs, whether we are within their resp. defs' dominance scope.
|
||||
if (llvm::is_contained(symbols, it->getInductionVar()))
|
||||
break;
|
||||
}
|
||||
|
||||
if (it != enclosingFors.rbegin()) {
|
||||
auto lastInvariantIV = *std::prev(it);
|
||||
*copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
|
||||
*copyOutPlacementStart = std::next(*copyInPlacementStart);
|
||||
*copyPlacementBlock = lastInvariantIV.getOperation()->getBlock();
|
||||
} else {
|
||||
*copyInPlacementStart = begin;
|
||||
*copyOutPlacementStart = end;
|
||||
*copyPlacementBlock = █
|
||||
}
|
||||
}
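As an illustration (loop names assumed): if the block being processed is the body of an affine.for %j that is itself nested in an affine.for %i, and the region's constraints involve %i but not %j, the walk above stops at %i; the copy-in is then placed just before the %j loop, the copy-out just after it, and the placement block becomes the body of %i, i.e. the transfers are hoisted out of %j.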
|
||||
|
||||
// Info comprising stride and number of elements transferred every stride.
|
||||
struct StrideInfo {
|
||||
int64_t stride;
|
||||
int64_t numEltPerStride;
|
||||
};
|
||||
|
||||
/// Returns striding information for a copy/transfer of this region with
|
||||
/// potentially multiple striding levels from outermost to innermost. For an
|
||||
/// n-dimensional region, there can be at most n-1 levels of striding
|
||||
/// successively nested.
|
||||
// TODO(bondhugula): make this work with non-identity layout maps.
|
||||
static void getMultiLevelStrides(const MemRefRegion ®ion,
|
||||
ArrayRef<int64_t> bufferShape,
|
||||
SmallVectorImpl<StrideInfo> *strideInfos) {
|
||||
if (bufferShape.size() <= 1)
|
||||
return;
|
||||
|
||||
int64_t numEltPerStride = 1;
|
||||
int64_t stride = 1;
|
||||
for (int d = bufferShape.size() - 1; d >= 1; d--) {
|
||||
int64_t dimSize = region.memref->getType().cast<MemRefType>().getDimSize(d);
|
||||
stride *= dimSize;
|
||||
numEltPerStride *= bufferShape[d];
|
||||
// A stride is needed only if the region has a shorter extent than the
|
||||
// memref along the dimension *and* has an extent greater than one along the
|
||||
// next major dimension.
|
||||
if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
|
||||
strideInfos->push_back({stride, numEltPerStride});
|
||||
}
|
||||
}
|
||||
}
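For a concrete (assumed) case: a 32x64 region of a memref<256x256xf32> yields a single entry {stride = 256, numEltPerStride = 64}, since the inner extent 64 is shorter than the memref's trailing dimension 256 while the outer extent 32 is greater than one; a full-width 32x256 region would yield no stride entries.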
|
||||
|
||||
/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
|
||||
/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart'
|
||||
/// holds the lower coordinates of the region in the original memref to copy
|
||||
/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in.
|
||||
static AffineForOp generatePointWiseCopy(Location loc, Value *memref,
|
||||
Value *fastMemRef,
|
||||
AffineMap memAffineMap,
|
||||
ArrayRef<Value *> memIndicesStart,
|
||||
ArrayRef<int64_t> fastBufferShape,
|
||||
bool isCopyOut, OpBuilder b) {
|
||||
assert(!memIndicesStart.empty() && "only 1-d or more memrefs");
|
||||
|
||||
// The copy-in nest is generated as follows as an example for a 2-d region:
|
||||
// for x = ...
|
||||
// for y = ...
|
||||
// fast_buf[x][y] = buf[mem_x + x][mem_y + y]
|
||||
|
||||
SmallVector<Value *, 4> fastBufIndices, memIndices;
|
||||
AffineForOp copyNestRoot;
|
||||
for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) {
|
||||
auto forOp = b.create<AffineForOp>(loc, 0, fastBufferShape[d]);
|
||||
if (d == 0)
|
||||
copyNestRoot = forOp;
|
||||
b = forOp.getBodyBuilder();
|
||||
fastBufIndices.push_back(forOp.getInductionVar());
|
||||
|
||||
Value *memBase =
|
||||
(memAffineMap == b.getMultiDimIdentityMap(memAffineMap.getNumDims()))
|
||||
? memIndicesStart[d]
|
||||
: b.create<AffineApplyOp>(
|
||||
loc,
|
||||
b.getAffineMap(memAffineMap.getNumDims(),
|
||||
memAffineMap.getNumSymbols(),
|
||||
memAffineMap.getResult(d)),
|
||||
memIndicesStart);
|
||||
|
||||
// Construct the subscript for the slow memref being copied.
|
||||
SmallVector<Value *, 2> operands = {memBase, forOp.getInductionVar()};
|
||||
auto memIndex = b.create<AffineApplyOp>(
|
||||
loc,
|
||||
b.getAffineMap(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)),
|
||||
operands);
|
||||
memIndices.push_back(memIndex);
|
||||
}
|
||||
|
||||
if (!isCopyOut) {
|
||||
// Copy in.
|
||||
auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
|
||||
b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufIndices);
|
||||
return copyNestRoot;
|
||||
}
|
||||
|
||||
// Copy out.
|
||||
auto load = b.create<AffineLoadOp>(loc, fastMemRef, fastBufIndices);
|
||||
b.create<AffineStoreOp>(loc, load, memref, memIndices);
|
||||
return copyNestRoot;
|
||||
}
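Note that this LoopUtils version additionally threads the fully composed `memAffineMap`: when that map is not the multi-dimensional identity, the d-th start coordinate is first materialized via an affine.apply of the map's d-th result over `memIndicesStart` before the induction variable is added to it.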
|
||||
|
||||
static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
|
||||
emitRemarkForBlock(Block &block) {
|
||||
return block.getParentOp()->emitRemark();
|
||||
}
|
||||
|
||||
/// Creates a buffer in the faster memory space for the specified memref region;
|
||||
/// generates a copy from the lower memory space to this one, and replaces all
|
||||
/// loads/stores in the block range [`begin', `end') of `block' so that they load/store
|
||||
/// from that buffer. Returns failure if copies could not be generated due to
|
||||
/// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
|
||||
/// in copyPlacementBlock specify the insertion points where the incoming copies
|
||||
/// and outgoing copies, respectively, should be inserted (the insertion happens
|
||||
/// right before the insertion point). Since `begin` can itself be invalidated
|
||||
/// due to the memref rewriting done from this method, the output argument
|
||||
/// `nBegin` is set to its replacement (set to `begin` if no invalidation
|
||||
/// happens). Since outgoing copies could have been inserted at `end`, the
|
||||
/// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
|
||||
/// size of the fast buffer allocated.
|
||||
static LogicalResult generateCopy(
|
||||
const MemRefRegion ®ion, Block *block, Block::iterator begin,
|
||||
Block::iterator end, Block *copyPlacementBlock,
|
||||
Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
|
||||
AffineCopyOptions copyOptions, DenseMap<Value *, Value *> &fastBufferMap,
|
||||
DenseSet<Operation *> ©Nests, uint64_t *sizeInBytes,
|
||||
Block::iterator *nBegin, Block::iterator *nEnd) {
|
||||
*nBegin = begin;
|
||||
*nEnd = end;
|
||||
|
||||
FuncOp f = begin->getParentOfType<FuncOp>();
|
||||
OpBuilder topBuilder(f.getBody());
|
||||
Value *zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);
|
||||
|
||||
if (begin == end)
|
||||
return success();
|
||||
|
||||
// Is the copy out point at the end of the block where we are doing
|
||||
// explicit copying.
|
||||
bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
|
||||
|
||||
// Copies for read regions are going to be inserted at 'begin'.
|
||||
OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
|
||||
// Copies for write regions are going to be inserted at 'end'.
|
||||
OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
|
||||
OpBuilder &b = region.isWrite() ? epilogue : prologue;
|
||||
|
||||
// Builder to create constants at the top level.
|
||||
auto func = copyPlacementBlock->getParent()->getParentOfType<FuncOp>();
|
||||
OpBuilder top(func.getBody());
|
||||
|
||||
auto loc = region.loc;
|
||||
auto *memref = region.memref;
|
||||
auto memRefType = memref->getType().cast<MemRefType>();
|
||||
|
||||
auto layoutMaps = memRefType.getAffineMaps();
|
||||
if (layoutMaps.size() > 1 ||
|
||||
(layoutMaps.size() == 1 && !layoutMaps[0].isIdentity())) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
|
||||
return failure();
|
||||
}
|
||||
|
||||
// Indices to use for the copying.
|
||||
// Indices for the original memref being copied from/to.
|
||||
SmallVector<Value *, 4> memIndices;
|
||||
// Indices for the faster buffer being copied into/from.
|
||||
SmallVector<Value *, 4> bufIndices;
|
||||
|
||||
unsigned rank = memRefType.getRank();
|
||||
SmallVector<int64_t, 4> fastBufferShape;
|
||||
|
||||
// Compute the extents of the buffer.
|
||||
std::vector<SmallVector<int64_t, 4>> lbs;
|
||||
SmallVector<int64_t, 8> lbDivisors;
|
||||
lbs.reserve(rank);
|
||||
Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
|
||||
&fastBufferShape, &lbs, &lbDivisors);
|
||||
if (!numElements.hasValue()) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
|
||||
return failure();
|
||||
}
|
||||
|
||||
if (numElements.getValue() == 0) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
|
||||
*sizeInBytes = 0;
|
||||
return success();
|
||||
}
|
||||
|
||||
const FlatAffineConstraints *cst = region.getConstraints();
|
||||
// 'regionSymbols' holds values that this memory region is symbolic/parametric
|
||||
// on; these typically include loop IVs surrounding the level at which the
|
||||
// copy generation is being done or other valid symbols in MLIR.
|
||||
SmallVector<Value *, 8> regionSymbols;
|
||||
cst->getIdValues(rank, cst->getNumIds(), ®ionSymbols);
|
||||
|
||||
// Construct the index expressions for the fast memory buffer. The index
|
||||
// expression for a particular dimension of the fast buffer is obtained by
|
||||
// subtracting out the lower bound on the original memref's data region
|
||||
// along the corresponding dimension.
|
||||
|
||||
// Index start offsets for faster memory buffer relative to the original.
|
||||
SmallVector<AffineExpr, 4> offsets;
|
||||
offsets.reserve(rank);
|
||||
for (unsigned d = 0; d < rank; d++) {
|
||||
assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
|
||||
|
||||
AffineExpr offset = top.getAffineConstantExpr(0);
|
||||
for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
|
||||
offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
|
||||
}
|
||||
assert(lbDivisors[d] > 0);
|
||||
offset =
|
||||
(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
|
||||
|
||||
// Set copy start location for this dimension in the lower memory space
|
||||
// memref.
|
||||
if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
|
||||
auto indexVal = caf.getValue();
|
||||
if (indexVal == 0) {
|
||||
memIndices.push_back(zeroIndex);
|
||||
} else {
|
||||
memIndices.push_back(
|
||||
top.create<ConstantIndexOp>(loc, indexVal).getResult());
|
||||
}
|
||||
} else {
|
||||
// The coordinate for the start location is just the lower bound along the
|
||||
// corresponding dimension on the memory region (stored in 'offset').
|
||||
auto map = top.getAffineMap(
|
||||
cst->getNumDimIds() + cst->getNumSymbolIds() - rank, 0, offset);
|
||||
memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
|
||||
}
|
||||
// The fast buffer is copied into at location zero; addressing is relative.
|
||||
bufIndices.push_back(zeroIndex);
|
||||
|
||||
// Record the offsets since they are needed to remap the memory accesses of
|
||||
// the original memref further below.
|
||||
offsets.push_back(offset);
|
||||
}
|
||||
|
||||
// The faster memory space buffer.
|
||||
Value *fastMemRef;
|
||||
|
||||
// Check if a buffer was already created.
|
||||
bool existingBuf = fastBufferMap.count(memref) > 0;
|
||||
if (!existingBuf) {
|
||||
AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
|
||||
auto fastMemRefType =
|
||||
top.getMemRefType(fastBufferShape, memRefType.getElementType(),
|
||||
fastBufferLayout, copyOptions.fastMemorySpace);
|
||||
|
||||
// Create the fast memory space buffer just before the 'affine.for'
|
||||
// operation.
|
||||
fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType).getResult();
|
||||
// Record it.
|
||||
fastBufferMap[memref] = fastMemRef;
|
||||
// fastMemRefType is a constant shaped memref.
|
||||
*sizeInBytes = getMemRefSizeInBytes(fastMemRefType).getValue();
|
||||
LLVM_DEBUG(emitRemarkForBlock(*block)
|
||||
<< "Creating fast buffer of type " << fastMemRefType
|
||||
<< " and size " << llvm::divideCeil(*sizeInBytes, 1024)
|
||||
<< " KiB\n");
|
||||
} else {
|
||||
// Reuse the one already created.
|
||||
fastMemRef = fastBufferMap[memref];
|
||||
*sizeInBytes = 0;
|
||||
}
|
||||
|
||||
auto numElementsSSA =
|
||||
top.create<ConstantIndexOp>(loc, numElements.getValue());
|
||||
|
||||
SmallVector<StrideInfo, 4> strideInfos;
|
||||
getMultiLevelStrides(region, fastBufferShape, &strideInfos);
|
||||
|
||||
// TODO(bondhugula): use all stride levels once DmaStartOp is extended for
|
||||
// multi-level strides.
|
||||
if (strideInfos.size() > 1) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
|
||||
return failure();
|
||||
}
|
||||
|
||||
Value *stride = nullptr;
|
||||
Value *numEltPerStride = nullptr;
|
||||
if (!strideInfos.empty()) {
|
||||
stride = top.create<ConstantIndexOp>(loc, strideInfos[0].stride);
|
||||
numEltPerStride =
|
||||
top.create<ConstantIndexOp>(loc, strideInfos[0].numEltPerStride);
|
||||
}
|
||||
|
||||
// Record the last operation where we want the memref replacement to end. We
|
||||
// later do the memref replacement only in [begin, postDomFilter] so
|
||||
// that the original memref's used in the data movement code themselves don't
|
||||
// get replaced.
|
||||
auto postDomFilter = std::prev(end);
|
||||
|
||||
// Create fully composed affine maps for each memref.
|
||||
auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
|
||||
fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
|
||||
auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
|
||||
fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
|
||||
|
||||
if (!copyOptions.generateDma) {
|
||||
// Point-wise copy generation.
|
||||
auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memAffineMap,
|
||||
memIndices, fastBufferShape,
|
||||
/*isCopyOut=*/region.isWrite(), b);
|
||||
|
||||
// Record this so that we can skip it from yet another copy.
|
||||
copyNests.insert(copyNest);
|
||||
|
||||
// Since new ops are being appended (for copy out's), adjust the end to
|
||||
// mark end of block range being processed if necessary.
|
||||
if (region.isWrite() && isCopyOutAtEndOfBlock)
|
||||
*nEnd = Block::iterator(copyNest.getOperation());
|
||||
} else {
|
||||
// DMA generation.
|
||||
// Create a tag (single element 1-d memref) for the DMA.
|
||||
auto tagMemRefType = top.getMemRefType({1}, top.getIntegerType(32), {},
|
||||
copyOptions.tagMemorySpace);
|
||||
auto tagMemRef = prologue.create<AllocOp>(loc, tagMemRefType);
|
||||
|
||||
SmallVector<Value *, 4> tagIndices({zeroIndex});
|
||||
auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
|
||||
fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
|
||||
if (!region.isWrite()) {
|
||||
// DMA non-blocking read from original buffer to fast buffer.
|
||||
b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
|
||||
fastMemRef, bufAffineMap, bufIndices,
|
||||
tagMemRef, tagAffineMap, tagIndices,
|
||||
numElementsSSA, stride, numEltPerStride);
|
||||
} else {
|
||||
// DMA non-blocking write from fast buffer to the original memref.
|
||||
auto op = b.create<AffineDmaStartOp>(
|
||||
loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
|
||||
memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
|
||||
stride, numEltPerStride);
|
||||
// Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
|
||||
// end to mark end of block range being processed.
|
||||
if (isCopyOutAtEndOfBlock)
|
||||
*nEnd = Block::iterator(op.getOperation());
|
||||
}
|
||||
|
||||
// Matching DMA wait to block on completion; tag always has a 0 index.
|
||||
b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
|
||||
numElementsSSA);
|
||||
|
||||
// Generate dealloc for the tag.
|
||||
auto tagDeallocOp = epilogue.create<DeallocOp>(loc, tagMemRef);
|
||||
if (*nEnd == end && isCopyOutAtEndOfBlock)
|
||||
// Since new ops are being appended (for outgoing DMAs), adjust the end to
|
||||
// mark end of range of the original.
|
||||
*nEnd = Block::iterator(tagDeallocOp.getOperation());
|
||||
}
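// Note: affine.dma_start is non-blocking; the matching affine.dma_wait above
// on the same single-element tag memref blocks until the transfer completes
// (before the processed range for copy-ins, after it for copy-outs).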

  // Generate dealloc for the buffer.
  if (!existingBuf) {
    auto bufDeallocOp = epilogue.create<DeallocOp>(loc, fastMemRef);
    // When generating pointwise copies, `nEnd' has to be set to deallocOp on
    // the fast buffer (since it marks the new end insertion point).
    if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(bufDeallocOp.getOperation());
  }

  // Replace all uses of the old memref with the faster one while remapping
  // access indices (subtracting out lower bound offsets for each dimension).
  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
  // and (%iT, %jT) will be the 'extraOperands' for 'replaceAllMemRefUsesWith'.
  // d2, d3 correspond to the original indices (%i, %j).
  SmallVector<AffineExpr, 4> remapExprs;
  remapExprs.reserve(rank);
  for (unsigned i = 0; i < rank; i++) {
    // The starting operands of indexRemap will be regionSymbols (the symbols on
    // which the memref region is parametric); then those corresponding to
    // the memref's original indices follow.
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - offsets[i]);
  }
  auto indexRemap = b.getAffineMap(regionSymbols.size() + rank, 0, remapExprs);

  // Record the begin since it may be invalidated by memref replacement.
  Block::iterator prevOfBegin;
  bool isBeginAtStartOfBlock = (begin == block->begin());
  if (!isBeginAtStartOfBlock)
    prevOfBegin = std::prev(begin);

  // *Only* those uses within the range [begin, end) of 'block' are replaced.
  replaceAllMemRefUsesWith(memref, fastMemRef,
                           /*extraIndices=*/{}, indexRemap,
                           /*extraOperands=*/regionSymbols,
                           /*domInstFilter=*/&*begin,
                           /*postDomInstFilter=*/&*postDomFilter);

  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);

  return success();
}

/// Construct the memref region to just include the entire memref. Returns
/// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the
/// number of enclosing loop IVs of opInst (starting from the outermost) that
/// the region is parametric on.
static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs,
                                  MemRefRegion *region) {
  unsigned rank;
  if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
    rank = loadOp.getMemRefType().getRank();
    region->memref = loadOp.getMemRef();
    region->setWrite(false);
  } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
    rank = storeOp.getMemRefType().getRank();
    region->memref = storeOp.getMemRef();
    region->setWrite(true);
  } else {
    assert(false && "expected load or store op");
    return false;
  }
  auto memRefType = region->memref->getType().cast<MemRefType>();
  if (!memRefType.hasStaticShape())
    return false;

  auto *regionCst = region->getConstraints();

  // Just get the first `numParamLoopIVs` IVs, which the memref region is
  // parametric on.
  SmallVector<AffineForOp, 4> ivs;
  getLoopIVs(*opInst, &ivs);
  ivs.resize(numParamLoopIVs);
  SmallVector<Value *, 4> symbols;
  extractForInductionVars(ivs, &symbols);
  regionCst->reset(rank, numParamLoopIVs, 0);
  regionCst->setIdValues(rank, rank + numParamLoopIVs, symbols);

  // Memref dim sizes provide the bounds.
  for (unsigned d = 0; d < rank; d++) {
    auto dimSize = memRefType.getDimSize(d);
    assert(dimSize > 0 && "filtered dynamic shapes above");
    regionCst->addConstantLowerBound(d, 0);
    regionCst->addConstantUpperBound(d, dimSize - 1);
  }
  return true;
}
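
// Illustrative example (hypothetical shapes and values): for an affine.load on
// a memref<256x32xf32> with numParamLoopIVs = 1, the region constructed above
// spans the entire memref, i.e., 0 <= d0 <= 255 and 0 <= d1 <= 31, with the
// single outermost loop IV attached as a symbolic identifier.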

/// Generates copies for a contiguous sequence of operations in `block' in the
/// iterator range [`begin', `end'), where `end' can't be past the terminator of
/// the block (since additional operations are potentially inserted right before
/// `end'). Returns the total size of the fast buffers used.
// Since we generate alloc's and dealloc's for all fast buffers (before and
// after the range of operations resp.), all of the fast memory capacity is
// assumed to be available for processing this block range.
uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
                                      Block::iterator end,
                                      const AffineCopyOptions &copyOptions,
                                      DenseSet<Operation *> &copyNests) {
  if (begin == end)
    return 0;

  assert(begin->getBlock() == std::prev(end)->getBlock() &&
         "Inconsistent block begin/end args");
  assert(end != end->getBlock()->end() && "end can't be the block terminator");

  Block *block = begin->getBlock();

  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
  unsigned copyDepth = getNestingDepth(*begin);

  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
                          << "\n");
  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");

  // List of memory regions to copy for. We need a map vector to have a
  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
  // since the alloc's for example are identical except for the SSA id.
  SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> readRegions;
  SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4> writeRegions;

  // Map from original memref's to the fast buffers that their accesses are
  // replaced with.
  DenseMap<Value *, Value *> fastBufferMap;

  // To check for errors when walking the block.
  bool error = false;

  // Walk this range of operations to gather all memory regions.
  block->walk(begin, end, [&](Operation *opInst) {
    // Gather regions to allocate to buffers in faster memory space.
    if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
      if (loadOp.getMemRefType().getMemorySpace() !=
          copyOptions.slowMemorySpace)
        return;
    } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
      if (storeOp.getMemRefType().getMemorySpace() !=
          copyOptions.slowMemorySpace)
        return;
    } else {
      // Neither a load nor a store op.
      return;
    }

    // Compute the MemRefRegion accessed.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth))) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Error obtaining memory region: semi-affine maps?\n");
      LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
        LLVM_DEBUG(
            opInst->emitError("Non-constant memref sizes not yet supported"));
        error = true;
        return;
      }
    }

    // Each memref has a single buffer associated with it irrespective of how
    // many load's and store's happen on it.
    // TODO(bondhugula): in the future, when regions don't intersect and satisfy
    // other properties (based on load/store regions), we could consider
    // multiple buffers per memref.

    // Add to the appropriate region if it's not already in it, or take a
    // bounding box union with the existing one if it's already in there.
    // Note that a memref may have both read and write regions - so update the
    // region in the other list if one exists (write in case of read and vice
    // versa) since there is a single bounding box for a memref across all reads
    // and writes that happen on it.

    // Attempts to update; returns true if 'region' exists in targetRegions.
    auto updateRegion =
        [&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4>
                &targetRegions) {
          auto it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
            // If the union fails, we will overapproximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LLVM_DEBUG(opInst->emitError(
                  "Non-constant memref sizes not yet supported"));
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // Union was computed and stored in 'it->second': copy to 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
          return true;
        };

    bool existsInRead = updateRegion(readRegions);
    if (error)
      return;
    bool existsInWrite = updateRegion(writeRegions);
    if (error)
      return;

    // Finally add it to the region list.
    if (region->isWrite() && !existsInWrite) {
      writeRegions[region->memref] = std::move(region);
    } else if (!region->isWrite() && !existsInRead) {
      readRegions[region->memref] = std::move(region);
    }
  });

  if (error) {
    begin->emitError(
        "copy generation failed for one or more memref's in this block\n");
    return 0;
  }

  uint64_t totalCopyBuffersSizeInBytes = 0;
  bool ret = true;
  auto processRegions =
      [&](const SmallMapVector<Value *, std::unique_ptr<MemRefRegion>, 4>
              &regions) {
        for (const auto &regionEntry : regions) {
          // For each region, hoist copy in/out past all hoistable
          // 'affine.for's.
          Block::iterator copyInPlacementStart, copyOutPlacementStart;
          Block *copyPlacementBlock;
          findHighestBlockForPlacement(
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);

          uint64_t sizeInBytes;
          Block::iterator nBegin, nEnd;
          LogicalResult iRet = generateCopy(
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
            // begin/end could have been invalidated, and need update.
            begin = nBegin;
            end = nEnd;
            totalCopyBuffersSizeInBytes += sizeInBytes;
          }
          ret = ret & succeeded(iRet);
        }
      };
  processRegions(readRegions);
  processRegions(writeRegions);

  if (!ret) {
    begin->emitError(
        "copy generation failed for one or more memref's in this block\n");
    return totalCopyBuffersSizeInBytes;
  }

  // For a range of operations, a note will be emitted at the caller.
  AffineForOp forOp;
  uint64_t sizeInKib = llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024);
  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
    forOp.emitRemark()
        << sizeInKib
        << " KiB of copy buffers in fast memory space for this block\n";
  }

  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
    StringRef str = "Total size of all copy buffers for this block "
                    "exceeds fast memory capacity\n";
    block->getParentOp()->emitError(str);
  }

  return totalCopyBuffersSizeInBytes;
}
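
// Illustrative usage sketch (not part of this utility): a caller could drive
// affineDataCopyGenerate over all non-terminator operations of a block roughly
// as below. The helper name and all option values here are hypothetical
// placeholders chosen only for exposition.
static uint64_t runCopyGenerateOnBlock(Block &block,
                                       DenseSet<Operation *> &copyNests) {
  AffineCopyOptions copyOptions;
  copyOptions.generateDma = false;      // point-wise copies instead of DMAs
  copyOptions.slowMemorySpace = 0;      // hypothetical slow (default) space
  copyOptions.fastMemorySpace = 1;      // hypothetical scratchpad space
  copyOptions.tagMemorySpace = 0;       // only meaningful when generating DMAs
  copyOptions.fastMemCapacityBytes = 32 * 1024;
  // `end' must not be past the block terminator; stop right at it so the
  // range [begin, end) covers every operation except the terminator.
  return affineDataCopyGenerate(block.begin(), std::prev(block.end()),
                                copyOptions, copyNests);
}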