llvm/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
Alex Zinenko c25b20c0f6 [mlir] NFC: Rename LoopOps dialect to SCF (Structured Control Flow)
This dialect contains various structured control flow operations, not only
loops; the new name reflects that. Drop the Ops suffix for consistency with
other dialects.

Note that this only moves the files and changes the C++ namespace from 'loop'
to 'scf'. The visible IR prefix remains the same and will be updated
separately. The conversions will also be updated separately.
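For example, C++ code that used loop::ForOp now refers to scf::ForOp, while the
operation still prints as loop.for until the IR prefix is renamed.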

Differential Revision: https://reviews.llvm.org/D79578
2020-05-11 15:04:27 +02:00

//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
static StringRef getDimName(unsigned dim) {
if (dim == 0)
return "x";
if (dim == 1)
return "y";
if (dim == 2)
return "z";
llvm_unreachable("dimension ID overflow");
}

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
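///
/// As an illustration (a sketch of the shape of the result, not a verbatim
/// dump of the generated ops), copying a rank-1 memref produces a nest roughly
/// of the form
///
///   for %i = <thread-id-z> to 1 step <block-dim-z> {
///     for %j = <thread-id-y> to 1 step <block-dim-y> {
///       for %k = <thread-id-x> to <dim 0 of %from> step <block-dim-x> {
///         %0 = load %from[%k]
///         store %0, %to[%k]
///       }
///     }
///   }
///
/// where the two single-iteration loops absorb the unused thread dimensions,
/// so only threads with zero "y" and "z" ids execute the copy.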
static void insertCopyLoops(OpBuilder &builder, Location loc,
MemRefBoundsCapture &bounds, Value from, Value to) {
// Create EDSC handles for bounds.
unsigned rank = bounds.rank();
SmallVector<Value, 4> lbs, ubs, steps;
// Make sure we have enough loops to use all thread dimensions; these trivial
// loops should be outermost and therefore inserted first.
if (rank < GPUDialect::getNumWorkgroupDimensions()) {
unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
Value zero = std_constant_index(0);
Value one = std_constant_index(1);
lbs.resize(extraLoops, zero);
ubs.resize(extraLoops, one);
steps.resize(extraLoops, one);
}
// Add existing bounds.
lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
// Emit constant operations for steps.
steps.reserve(lbs.size());
llvm::transform(bounds.getSteps(), std::back_inserter(steps),
[](int64_t step) { return std_constant_index(step); });
// Obtain thread identifiers and block sizes, necessary to map to them.
auto indexType = builder.getIndexType();
SmallVector<Value, 3> threadIds, blockDims;
for (unsigned i = 0; i < 3; ++i) {
auto dimName = builder.getStringAttr(getDimName(i));
threadIds.push_back(
builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
blockDims.push_back(
builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
}
// Produce the loop nest with copies.
SmallVector<Value, 8> ivs(lbs.size());
LoopNestBuilder(ivs, lbs, ubs, steps)([&]() {
auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
StdIndexedValue fromHandle(from), toHandle(to);
toHandle(activeIvs) = fromHandle(activeIvs);
});
// Map the innermost loops to threads in reverse order.
for (auto en :
llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
GPUDialect::getNumWorkgroupDimensions())))) {
Value v = en.value();
auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
mapLoopToProcessorIds(loop, {threadIds[en.index()]},
{blockDims[en.index()]});
}
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the region's first block. The region is expected
/// to have a single block. This boils down to the following structure:
///
/// ^bb(...):
/// <loop-bound-computation>
/// for %arg0 = ... to ... step ... {
/// ...
/// for %argN = <thread-id-x> to ... step <block-dim-x> {
/// %0 = load %from[%arg0, ..., %argN]
/// store %0, %to[%arg0, ..., %argN]
/// }
/// ...
/// }
/// gpu.barrier
/// <... original body ...>
/// gpu.barrier
/// for %arg0 = ... to ... step ... {
/// ...
/// for %argN = <thread-id-x> to ... step <block-dim-x> {
/// %1 = load %to[%arg0, ..., %argN]
/// store %1, %from[%arg0, ..., %argN]
/// }
/// ...
/// }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values that other threads read. An analysis would be required to eliminate
/// barriers in cases where a value is only used by the thread that copies it.
/// Both copies are likewise inserted unconditionally; an analysis would be
/// required to copy only live-in and live-out values when necessary. This
/// copies the entire memref pointed to by "from". If a smaller block would be
/// sufficient, the caller can create a subview of the memref and promote that
/// instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
auto fromType = from.getType().cast<MemRefType>();
auto toType = to.getType().cast<MemRefType>();
(void)fromType;
(void)toType;
assert(fromType.getShape() == toType.getShape());
assert(fromType.getRank() != 0);
assert(llvm::hasSingleElement(region) &&
"unstructured control flow not supported");
OpBuilder builder(region.getContext());
builder.setInsertionPointToStart(&region.front());
ScopedContext edscContext(builder, loc);
MemRefBoundsCapture fromBoundsCapture(from);
insertCopyLoops(builder, loc, fromBoundsCapture, from, to);
builder.create<gpu::BarrierOp>(loc);
builder.setInsertionPoint(&region.front().back());
builder.create<gpu::BarrierOp>(loc);
insertCopyLoops(builder, loc, fromBoundsCapture, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
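///
/// As an illustrative sketch (names and the exact printed form are
/// hypothetical; workgroup memory is assumed to live in address space 3),
/// promoting argument 0 of
///
///   gpu.func @compute(%arg0 : memref<32xf32>) kernel { ... }
///
/// yields roughly
///
///   gpu.func @compute(%arg0 : memref<32xf32>)
///       workgroup(%promoted : memref<32xf32, 3>) kernel {
///     <copy loops from %arg0 into %promoted>
///     gpu.barrier
///     <original body, now using %promoted instead of %arg0>
///     gpu.barrier
///     <copy loops from %promoted back into %arg0>
///     gpu.return
///   }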
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
Value value = op.getArgument(arg);
auto type = value.getType().dyn_cast<MemRefType>();
assert(type && type.hasStaticShape() && "can only promote memrefs");
Value attribution =
op.addWorkgroupAttribution(type.getShape(), type.getElementType());
// Replace the uses first since only the original uses are currently present.
// Then insert the copies.
value.replaceAllUsesWith(attribution);
insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
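
// A minimal sketch (not part of the upstream utility) of how a pass could
// drive this function. The pass name and the policy of promoting every
// statically shaped memref argument are hypothetical; a real pass would
// typically select arguments via attributes or pass options.
namespace {
struct PromoteAllArgsToWorkgroupMemoryPass
    : public PassWrapper<PromoteAllArgsToWorkgroupMemoryPass,
                         OperationPass<gpu::GPUFuncOp>> {
  void runOnOperation() override {
    gpu::GPUFuncOp func = getOperation();
    // Capture the argument count up front; promotion adds workgroup
    // attributions but leaves the original arguments in place.
    for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
      // promoteToWorkgroupMemory asserts on non-memref and dynamically shaped
      // arguments, so filter those out.
      auto type = func.getArgument(i).getType().dyn_cast<MemRefType>();
      if (type && type.hasStaticShape())
        promoteToWorkgroupMemory(func, i);
    }
  }
};
} // namespace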