//===- GPUDialect.cpp - MLIR Dialect for GPU Kernels implementation -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU kernel-related dialect and its operations.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/FunctionImplementation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
using namespace mlir;
using namespace mlir::gpu;
#include "mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc"
//===----------------------------------------------------------------------===//
// GPU Device Mapping Attributes
//===----------------------------------------------------------------------===//
int64_t GPUBlockMappingAttr::getMappingId() const {
return static_cast<int64_t>(getBlock());
}
bool GPUBlockMappingAttr::isLinearMapping() const {
return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
}
int64_t GPUBlockMappingAttr::getRelativeIndex() const {
return isLinearMapping()
? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
: getMappingId();
}
int64_t GPUWarpgroupMappingAttr::getMappingId() const {
return static_cast<int64_t>(getWarpgroup());
}
bool GPUWarpgroupMappingAttr::isLinearMapping() const {
return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
}
int64_t GPUWarpgroupMappingAttr::getRelativeIndex() const {
return isLinearMapping()
? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
: getMappingId();
}
int64_t GPUWarpMappingAttr::getMappingId() const {
return static_cast<int64_t>(getWarp());
}
bool GPUWarpMappingAttr::isLinearMapping() const {
return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
}
int64_t GPUWarpMappingAttr::getRelativeIndex() const {
return isLinearMapping()
? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
: getMappingId();
}
int64_t GPUThreadMappingAttr::getMappingId() const {
return static_cast<int64_t>(getThread());
}
bool GPUThreadMappingAttr::isLinearMapping() const {
return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
}
int64_t GPUThreadMappingAttr::getRelativeIndex() const {
return isLinearMapping()
? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
: getMappingId();
}
int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
return static_cast<int64_t>(getAddressSpace());
}
bool GPUMemorySpaceMappingAttr::isLinearMapping() const {
llvm_unreachable("GPUMemorySpaceMappingAttr does not support linear mapping");
}
int64_t GPUMemorySpaceMappingAttr::getRelativeIndex() const {
llvm_unreachable("GPUMemorySpaceMappingAttr does not support relative index");
}
//===----------------------------------------------------------------------===//
// MMAMatrixType
//===----------------------------------------------------------------------===//
MMAMatrixType MMAMatrixType::get(ArrayRef<int64_t> shape, Type elementType,
StringRef operand) {
return Base::get(elementType.getContext(), shape, elementType, operand);
}
MMAMatrixType
MMAMatrixType::getChecked(function_ref<InFlightDiagnostic()> emitError,
ArrayRef<int64_t> shape, Type elementType,
StringRef operand) {
return Base::getChecked(emitError, elementType.getContext(), shape,
elementType, operand);
}
unsigned MMAMatrixType::getNumDims() const { return getImpl()->numDims; }
ArrayRef<int64_t> MMAMatrixType::getShape() const {
return getImpl()->getShape();
}
Type MMAMatrixType::getElementType() const { return getImpl()->elementType; }
StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); }
bool MMAMatrixType::isValidElementType(Type elementType) {
return elementType.isF16() || elementType.isF32() ||
elementType.isUnsignedInteger(8) || elementType.isSignedInteger(8) ||
elementType.isInteger(32);
}
LogicalResult
MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,
ArrayRef<int64_t> shape, Type elementType,
StringRef operand) {
if (!operand.equals("AOp") && !operand.equals("BOp") &&
!operand.equals("COp"))
return emitError() << "operand expected to be one of AOp, BOp or COp";
if (shape.size() != 2)
return emitError() << "MMAMatrixType must have exactly two dimensions";
if (!MMAMatrixType::isValidElementType(elementType))
return emitError()
<< "MMAMatrixType elements must be SI8, UI8, I32, F16, or F32";
return success();
}
//===----------------------------------------------------------------------===//
// GPUDialect
//===----------------------------------------------------------------------===//
bool GPUDialect::isWorkgroupMemoryAddressSpace(Attribute memorySpace) {
if (!memorySpace)
return false;
if (auto gpuAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
return gpuAttr.getValue() == getWorkgroupAddressSpace();
return false;
}
bool GPUDialect::hasWorkgroupMemoryAddressSpace(MemRefType type) {
Attribute memorySpace = type.getMemorySpace();
return isWorkgroupMemoryAddressSpace(memorySpace);
}
bool GPUDialect::isKernel(Operation *op) {
UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName());
return static_cast<bool>(isKernelAttr);
}
namespace {
/// This class defines the interface for handling inlining with gpu
/// operations.
struct GPUInlinerInterface : public DialectInlinerInterface {
using DialectInlinerInterface::DialectInlinerInterface;
/// All gpu dialect ops can be inlined.
bool isLegalToInline(Operation *, Region *, bool, IRMapping &) const final {
return true;
}
};
} // namespace
void GPUDialect::initialize() {
addTypes<AsyncTokenType>();
addTypes<MMAMatrixType>();
addTypes<SparseDnTensorHandleType>();
addTypes<SparseSpMatHandleType>();
addTypes<SparseSpGEMMOpHandleType>();
addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"
>();
addAttributes<
#define GET_ATTRDEF_LIST
#include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
>();
addInterfaces<GPUInlinerInterface>();
}
static std::string getSparseHandleKeyword(SparseHandleKind kind) {
switch (kind) {
case SparseHandleKind::DnTensor:
return "sparse.dntensor_handle";
case SparseHandleKind::SpMat:
return "sparse.spmat_handle";
case SparseHandleKind::SpGEMMOp:
return "sparse.spgemmop_handle";
}
llvm_unreachable("unknown sparse handle kind");
return "";
}
Type GPUDialect::parseType(DialectAsmParser &parser) const {
// Parse the main keyword for the type.
StringRef keyword;
if (parser.parseKeyword(&keyword))
return Type();
MLIRContext *context = getContext();
// Handle 'async token' types.
if (keyword == "async.token")
return AsyncTokenType::get(context);
if (keyword == "mma_matrix") {
SMLoc beginLoc = parser.getNameLoc();
// Parse '<'.
if (parser.parseLess())
return nullptr;
// Parse the size and elementType.
SmallVector<int64_t> shape;
Type elementType;
if (parser.parseDimensionList(shape, /*allowDynamic=*/false) ||
parser.parseType(elementType))
return nullptr;
// Parse ','
if (parser.parseComma())
return nullptr;
// Parse operand.
std::string operand;
if (failed(parser.parseOptionalString(&operand)))
return nullptr;
// Parse '>'.
if (parser.parseGreater())
return nullptr;
return MMAMatrixType::getChecked(mlir::detail::getDefaultDiagnosticEmitFn(
parser.getEncodedSourceLoc(beginLoc)),
shape, elementType, operand);
}
if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor))
return SparseDnTensorHandleType::get(context);
if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
return SparseSpMatHandleType::get(context);
if (keyword == getSparseHandleKeyword(SparseHandleKind::SpGEMMOp))
return SparseSpGEMMOpHandleType::get(context);
parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
return Type();
}
// TODO: print refined type here. Note that this should correspond to the
// parser.
void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
TypeSwitch<Type>(type)
.Case<AsyncTokenType>([&](Type) { os << "async.token"; })
.Case<SparseDnTensorHandleType>([&](Type) {
os << getSparseHandleKeyword(SparseHandleKind::DnTensor);
})
.Case<SparseSpMatHandleType>(
[&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
.Case<SparseSpGEMMOpHandleType>([&](Type) {
os << getSparseHandleKeyword(SparseHandleKind::SpGEMMOp);
})
.Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
os << "mma_matrix<";
auto shape = fragTy.getShape();
for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim)
os << *dim << 'x';
os << shape.back() << 'x' << fragTy.getElementType();
os << ", \"" << fragTy.getOperand() << "\"" << '>';
})
.Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
}
LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
NamedAttribute attr) {
if (!llvm::isa<UnitAttr>(attr.getValue()) ||
attr.getName() != getContainerModuleAttrName())
return success();
auto module = dyn_cast<ModuleOp>(op);
if (!module)
return op->emitError("expected '")
<< getContainerModuleAttrName() << "' attribute to be attached to '"
<< ModuleOp::getOperationName() << '\'';
auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult {
// Ignore launches that are nested more or less deeply than functions in the
// module we are currently checking.
if (!launchOp->getParentOp() ||
launchOp->getParentOp()->getParentOp() != module)
return success();
// Ignore launch ops with missing attributes here. The errors will be
// reported by the verifiers of those ops.
if (!launchOp->getAttrOfType<SymbolRefAttr>(
LaunchFuncOp::getKernelAttrName(launchOp->getName())))
return success();
// Check that `launch_func` refers to a well-formed GPU kernel container.
StringAttr kernelContainerName = launchOp.getKernelModuleName();
Operation *kernelContainer = module.lookupSymbol(kernelContainerName);
if (!kernelContainer)
return launchOp.emitOpError()
<< "kernel container '" << kernelContainerName.getValue()
<< "' is undefined";
// If the container is a GPU binary op return success.
if (isa<BinaryOp>(kernelContainer))
return success();
auto kernelModule = dyn_cast<GPUModuleOp>(kernelContainer);
if (!kernelModule)
return launchOp.emitOpError()
<< "kernel module '" << kernelContainerName.getValue()
<< "' is undefined";
// Check that `launch_func` refers to a well-formed kernel function.
Operation *kernelFunc = module.lookupSymbol(launchOp.getKernelAttr());
if (!kernelFunc)
return launchOp.emitOpError("kernel function '")
<< launchOp.getKernel() << "' is undefined";
auto kernelConvertedFunction = dyn_cast<FunctionOpInterface>(kernelFunc);
if (!kernelConvertedFunction) {
InFlightDiagnostic diag = launchOp.emitOpError()
<< "referenced kernel '" << launchOp.getKernel()
<< "' is not a function";
diag.attachNote(kernelFunc->getLoc()) << "see the kernel definition here";
return diag;
}
if (!kernelFunc->getAttrOfType<mlir::UnitAttr>(
GPUDialect::getKernelFuncAttrName()))
return launchOp.emitOpError("kernel function is missing the '")
<< GPUDialect::getKernelFuncAttrName() << "' attribute";
// TODO: If the kernel isn't a GPU function (which happens during separate
// compilation), do not check type correspondence as it would require the
// verifier to be aware of the type conversion.
auto kernelGPUFunction = dyn_cast<gpu::GPUFuncOp>(kernelFunc);
if (!kernelGPUFunction)
return success();
unsigned actualNumArguments = launchOp.getNumKernelOperands();
unsigned expectedNumArguments = kernelGPUFunction.getNumArguments();
if (expectedNumArguments != actualNumArguments)
return launchOp.emitOpError("got ")
<< actualNumArguments << " kernel operands but expected "
<< expectedNumArguments;
auto functionType = kernelGPUFunction.getFunctionType();
for (unsigned i = 0; i < expectedNumArguments; ++i) {
if (launchOp.getKernelOperand(i).getType() != functionType.getInput(i)) {
return launchOp.emitOpError("type of function argument ")
<< i << " does not match";
}
}
return success();
});
return walkResult.wasInterrupted() ? failure() : success();
}
/// Parses an optional list of async operands with an optional leading keyword.
/// (`async`)? (`[` ssa-id-list `]`)?
///
/// This method is used by the tablegen assembly format for async ops as well.
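///
/// Illustrative example (SSA names are arbitrary): the dependency list in
/// `%t2 = gpu.wait async [%t0, %t1]` is parsed by this function.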
static ParseResult parseAsyncDependencies(
OpAsmParser &parser, Type &asyncTokenType,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) {
auto loc = parser.getCurrentLocation();
if (succeeded(parser.parseOptionalKeyword("async"))) {
if (parser.getNumResults() == 0)
return parser.emitError(loc, "needs to be named when marked 'async'");
asyncTokenType = parser.getBuilder().getType<AsyncTokenType>();
}
return parser.parseOperandList(asyncDependencies,
OpAsmParser::Delimiter::OptionalSquare);
}
/// Prints optional async dependencies with its leading keyword.
/// (`async`)? (`[` ssa-id-list `]`)?
// Used by the tablegen assembly format for several async ops.
static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
Type asyncTokenType,
OperandRange asyncDependencies) {
if (asyncTokenType)
printer << "async";
if (asyncDependencies.empty())
return;
if (asyncTokenType)
printer << ' ';
printer << '[';
llvm::interleaveComma(asyncDependencies, printer);
printer << ']';
}
// GPU memory attribution functions shared by LaunchOp and GPUFuncOp.
/// Parses a GPU function memory attribution.
///
/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
/// (`private` `(` ssa-id-and-type-list `)`)?
///
/// Note that this function parses only one of the two similar parts, with the
/// keyword provided as argument.
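///
/// Illustrative example (names and types are arbitrary):
///   workgroup(%buf : memref<32xf32, #gpu.address_space<workgroup>>)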
static ParseResult
parseAttributions(OpAsmParser &parser, StringRef keyword,
SmallVectorImpl<OpAsmParser::Argument> &args) {
// If we could not parse the keyword, just assume empty list and succeed.
if (failed(parser.parseOptionalKeyword(keyword)))
return success();
return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
/*allowType=*/true);
}
/// Prints a GPU function memory attribution.
static void printAttributions(OpAsmPrinter &p, StringRef keyword,
ArrayRef<BlockArgument> values) {
if (values.empty())
return;
p << ' ' << keyword << '(';
llvm::interleaveComma(
values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
p << ')';
}
/// Verifies a GPU function memory attribution.
static LogicalResult verifyAttributions(Operation *op,
ArrayRef<BlockArgument> attributions,
gpu::AddressSpace memorySpace) {
for (Value v : attributions) {
auto type = llvm::dyn_cast<MemRefType>(v.getType());
if (!type)
return op->emitOpError() << "expected memref type in attribution";
// We can only verify the address space if it hasn't already been lowered
// from the AddressSpaceAttr to a target-specific numeric value.
auto addressSpace =
llvm::dyn_cast_or_null<gpu::AddressSpaceAttr>(type.getMemorySpace());
if (!addressSpace)
continue;
if (addressSpace.getValue() != memorySpace)
return op->emitOpError()
<< "expected memory space " << stringifyAddressSpace(memorySpace)
<< " in attribution";
}
return success();
}
//===----------------------------------------------------------------------===//
// AllReduceOp
//===----------------------------------------------------------------------===//
static bool verifyReduceOpAndType(gpu::AllReduceOperation opName,
Type resType) {
return (opName != gpu::AllReduceOperation::AND &&
opName != gpu::AllReduceOperation::OR &&
opName != gpu::AllReduceOperation::XOR) ||
llvm::isa<IntegerType>(resType);
}
LogicalResult gpu::AllReduceOp::verifyRegions() {
if (getBody().empty() != getOp().has_value())
return emitError("expected either an op attribute or a non-empty body");
if (!getBody().empty()) {
if (getBody().getNumArguments() != 2)
return emitError("expected two region arguments");
for (auto argument : getBody().getArguments()) {
if (argument.getType() != getType())
return emitError("incorrect region argument type");
}
unsigned yieldCount = 0;
for (Block &block : getBody()) {
if (auto yield = dyn_cast<gpu::YieldOp>(block.getTerminator())) {
if (yield.getNumOperands() != 1)
return emitError("expected one gpu.yield operand");
if (yield.getOperand(0).getType() != getType())
return emitError("incorrect gpu.yield type");
++yieldCount;
}
}
if (yieldCount == 0)
return emitError("expected gpu.yield op in region");
} else {
gpu::AllReduceOperation opName = *getOp();
if (!verifyReduceOpAndType(opName, getType())) {
return emitError()
<< '`' << gpu::stringifyAllReduceOperation(opName)
<< "` accumulator is only compatible with Integer type";
}
}
return success();
}
static bool canMakeGroupOpUniform(Operation *op) {
auto launchOp = dyn_cast<gpu::LaunchOp>(op->getParentOp());
if (!launchOp)
return false;
Region &body = launchOp.getBody();
assert(!body.empty() && "Invalid region");
// Only convert ops in gpu::launch entry block for now.
return op->getBlock() == &body.front();
}
OpFoldResult gpu::AllReduceOp::fold(FoldAdaptor /*adaptor*/) {
if (!getUniform() && canMakeGroupOpUniform(*this)) {
setUniform(true);
return getResult();
}
return nullptr;
}
// TODO: Support optional custom attributes (without dialect prefix).
static ParseResult parseAllReduceOperation(AsmParser &parser,
AllReduceOperationAttr &attr) {
StringRef enumStr;
if (!parser.parseOptionalKeyword(&enumStr)) {
std::optional<AllReduceOperation> op =
gpu::symbolizeAllReduceOperation(enumStr);
if (!op)
return parser.emitError(parser.getCurrentLocation(), "invalid op kind");
attr = AllReduceOperationAttr::get(parser.getContext(), *op);
}
return success();
}
static void printAllReduceOperation(AsmPrinter &printer, Operation *op,
AllReduceOperationAttr attr) {
if (attr)
attr.print(printer);
}
//===----------------------------------------------------------------------===//
// SubgroupReduceOp
//===----------------------------------------------------------------------===//
LogicalResult gpu::SubgroupReduceOp::verify() {
gpu::AllReduceOperation opName = getOp();
if (!verifyReduceOpAndType(opName, getType())) {
return emitError() << '`' << gpu::stringifyAllReduceOperation(opName)
<< "` accumulator is only compatible with Integer type";
}
return success();
}
OpFoldResult gpu::SubgroupReduceOp::fold(FoldAdaptor /*adaptor*/) {
if (!getUniform() && canMakeGroupOpUniform(*this)) {
setUniform(true);
return getResult();
}
return nullptr;
}
//===----------------------------------------------------------------------===//
// AsyncOpInterface
//===----------------------------------------------------------------------===//
void gpu::addAsyncDependency(Operation *op, Value token) {
op->insertOperands(0, {token});
if (!op->template hasTrait<OpTrait::AttrSizedOperandSegments>())
return;
auto attrName =
OpTrait::AttrSizedOperandSegments<void>::getOperandSegmentSizeAttr();
auto sizeAttr = op->template getAttrOfType<DenseI32ArrayAttr>(attrName);
// Async dependencies are the only variadic operand.
if (!sizeAttr)
return;
SmallVector<int32_t, 8> sizes(sizeAttr.asArrayRef());
++sizes.front();
op->setAttr(attrName, Builder(op->getContext()).getDenseI32ArrayAttr(sizes));
}
//===----------------------------------------------------------------------===//
// LaunchOp
//===----------------------------------------------------------------------===//
void LaunchOp::build(OpBuilder &builder, OperationState &result,
Value gridSizeX, Value gridSizeY, Value gridSizeZ,
Value getBlockSizeX, Value getBlockSizeY,
Value getBlockSizeZ, Value dynamicSharedMemorySize,
Type asyncTokenType, ValueRange asyncDependencies,
TypeRange workgroupAttributions,
TypeRange privateAttributions) {
// Add a WorkGroup attribution attribute. This attribute is required to
// identify private attributions in the list of block arguments.
result.addAttribute(getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(workgroupAttributions.size()));
// Add Op operands.
result.addOperands(asyncDependencies);
if (asyncTokenType)
result.types.push_back(builder.getType<AsyncTokenType>());
// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
getBlockSizeY, getBlockSizeZ});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
// Create a kernel body region with kNumConfigRegionAttributes + N memory
// attributions, where the first kNumConfigRegionAttributes arguments have
// `index` type and the rest have the same types as the data operands.
Region *kernelRegion = result.addRegion();
Block *body = new Block();
// TODO: Allow passing in proper locations here.
for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
body->addArgument(builder.getIndexType(), result.location);
// Add WorkGroup & Private attributions to the region arguments.
for (Type argTy : workgroupAttributions)
body->addArgument(argTy, result.location);
for (Type argTy : privateAttributions)
body->addArgument(argTy, result.location);
kernelRegion->push_back(body);
// Fill OperandSegmentSize Attribute.
SmallVector<int32_t, 8> segmentSizes(8, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
result.addAttribute(getOperandSegmentSizeAttr(),
builder.getDenseI32ArrayAttr(segmentSizes));
}
KernelDim3 LaunchOp::getBlockIds() {
assert(!getBody().empty() && "LaunchOp body must not be empty.");
auto args = getBody().getArguments();
return KernelDim3{args[0], args[1], args[2]};
}
KernelDim3 LaunchOp::getThreadIds() {
assert(!getBody().empty() && "LaunchOp body must not be empty.");
auto args = getBody().getArguments();
return KernelDim3{args[3], args[4], args[5]};
}
KernelDim3 LaunchOp::getGridSize() {
assert(!getBody().empty() && "LaunchOp body must not be empty.");
auto args = getBody().getArguments();
return KernelDim3{args[6], args[7], args[8]};
}
KernelDim3 LaunchOp::getBlockSize() {
assert(!getBody().empty() && "LaunchOp body must not be empty.");
auto args = getBody().getArguments();
return KernelDim3{args[9], args[10], args[11]};
}
KernelDim3 LaunchOp::getGridSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
}
KernelDim3 LaunchOp::getBlockSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[3], operands[4], operands[5]};
}
LogicalResult LaunchOp::verifyRegions() {
// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments
// for block/thread identifiers and grid/block sizes.
if (!getBody().empty()) {
if (getBody().getNumArguments() <
kNumConfigRegionAttributes + getNumWorkgroupAttributions())
return emitOpError("unexpected number of region arguments");
}
// Verify Attributions Address Spaces.
if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
GPUDialect::getWorkgroupAddressSpace())) ||
failed(verifyAttributions(getOperation(), getPrivateAttributions(),
GPUDialect::getPrivateAddressSpace())))
return failure();
// Block terminators without successors are expected to exit the kernel region
// and must be `gpu.terminator`.
for (Block &block : getBody()) {
if (block.empty())
continue;
if (block.back().getNumSuccessors() != 0)
continue;
if (!isa<gpu::TerminatorOp>(&block.back())) {
return block.back()
.emitError()
.append("expected '", gpu::TerminatorOp::getOperationName(),
"' or a terminator with successors")
.attachNote(getLoc())
.append("in '", LaunchOp::getOperationName(), "' body region");
}
}
if (getNumResults() == 0 && getAsyncToken())
return emitOpError("needs to be named when async keyword is specified");
return success();
}
// Pretty-print the kernel grid/block size assignment as
// (%iter-x, %iter-y, %iter-z) in
// (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)
// where %size-* and %iter-* will correspond to the body region arguments.
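// For instance (illustrative SSA names):
//   (%bx, %by, %bz) in (%grid_x = %0, %grid_y = %1, %grid_z = %2)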
static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
KernelDim3 operands, KernelDim3 ids) {
p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";
p << size.x << " = " << operands.x << ", ";
p << size.y << " = " << operands.y << ", ";
p << size.z << " = " << operands.z << ')';
}
void LaunchOp::print(OpAsmPrinter &p) {
if (getAsyncToken()) {
p << " async";
if (!getAsyncDependencies().empty())
p << " [" << getAsyncDependencies() << ']';
}
// Print the launch configuration.
p << ' ' << getBlocksKeyword();
printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
getBlockIds());
p << ' ' << getThreadsKeyword();
printSizeAssignment(p, getBlockSize(), getBlockSizeOperandValues(),
getThreadIds());
if (getDynamicSharedMemorySize())
p << ' ' << getDynamicSharedMemorySizeKeyword() << ' '
<< getDynamicSharedMemorySize();
printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions());
printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
p << ' ';
p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
LaunchOp::getOperandSegmentSizeAttr(),
getNumWorkgroupAttributionsAttrName()});
}
// Parse the size assignment blocks for blocks and threads. These have the form
// (%region_arg, %region_arg, %region_arg) in
// (%region_arg = %operand, %region_arg = %operand, %region_arg = %operand)
// where %region_arg are percent-identifiers for the region arguments to be
// introduced further (SSA defs), and %operand are percent-identifiers for the
// SSA value uses.
static ParseResult
parseSizeAssignment(OpAsmParser &parser,
MutableArrayRef<OpAsmParser::UnresolvedOperand> sizes,
MutableArrayRef<OpAsmParser::UnresolvedOperand> regionSizes,
MutableArrayRef<OpAsmParser::UnresolvedOperand> indices) {
assert(indices.size() == 3 && "space for three indices expected");
SmallVector<OpAsmParser::UnresolvedOperand, 3> args;
if (parser.parseOperandList(args, OpAsmParser::Delimiter::Paren,
/*allowResultNumber=*/false) ||
parser.parseKeyword("in") || parser.parseLParen())
return failure();
std::move(args.begin(), args.end(), indices.begin());
for (int i = 0; i < 3; ++i) {
if (i != 0 && parser.parseComma())
return failure();
if (parser.parseOperand(regionSizes[i], /*allowResultNumber=*/false) ||
parser.parseEqual() || parser.parseOperand(sizes[i]))
return failure();
}
return parser.parseRParen();
}
/// Parses a Launch operation.
/// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
/// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
/// memory-attribution
/// region attr-dict?
/// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
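///
/// Illustrative example (SSA names and sizes are arbitrary):
///   gpu.launch blocks(%bx, %by, %bz) in (%gx = %c2, %gy = %c2, %gz = %c1)
///              threads(%tx, %ty, %tz) in (%sx = %c8, %sy = %c8, %sz = %c1) {
///     gpu.terminator
///   }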
ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
// Sizes of the grid and block.
SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands>
sizes(LaunchOp::kNumConfigOperands);
MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
// Actual (data) operands passed to the kernel.
SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands;
// Region arguments to be created.
SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs(
LaunchOp::kNumConfigRegionAttributes);
MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
// Parse optional async dependencies.
SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
Type asyncTokenType;
if (failed(
parseAsyncDependencies(parser, asyncTokenType, asyncDependencies)) ||
parser.resolveOperands(asyncDependencies, asyncTokenType,
result.operands))
return failure();
if (parser.getNumResults() > 0)
result.types.push_back(asyncTokenType);
// Parse the size assignment segments: the first segment assigns grid sizes
// and defines values for block identifiers; the second segment assigns block
// sizes and defines values for thread identifiers. In the region argument
// list, identifiers precede sizes, and block-related values precede
// thread-related values.
if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) ||
parseSizeAssignment(parser, sizesRef.take_front(3),
regionArgsRef.slice(6, 3),
regionArgsRef.slice(0, 3)) ||
parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) ||
parseSizeAssignment(parser, sizesRef.drop_front(3),
regionArgsRef.slice(9, 3),
regionArgsRef.slice(3, 3)) ||
parser.resolveOperands(sizes, parser.getBuilder().getIndexType(),
result.operands))
return failure();
OpAsmParser::UnresolvedOperand dynamicSharedMemorySize;
bool hasDynamicSharedMemorySize = false;
if (!parser.parseOptionalKeyword(
LaunchOp::getDynamicSharedMemorySizeKeyword())) {
hasDynamicSharedMemorySize = true;
if (parser.parseOperand(dynamicSharedMemorySize) ||
parser.resolveOperand(dynamicSharedMemorySize,
parser.getBuilder().getI32Type(),
result.operands))
return failure();
}
// Create the region arguments, it has kNumConfigRegionAttributes arguments
// that correspond to block/thread identifiers and grid/block sizes, all
// having `index` type, a variadic number of WorkGroup Attributions and
// a variadic number of Private Attributions. The number of WorkGroup
// Attributions is stored in the attr with name:
// LaunchOp::getNumWorkgroupAttributionsAttrName().
Type index = parser.getBuilder().getIndexType();
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
LaunchOp::kNumConfigRegionAttributes, index);
SmallVector<OpAsmParser::Argument> regionArguments;
for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
OpAsmParser::Argument arg;
arg.ssaName = std::get<0>(ssaValueAndType);
arg.type = std::get<1>(ssaValueAndType);
regionArguments.push_back(arg);
}
Builder &builder = parser.getBuilder();
// Parse workgroup memory attributions.
if (failed(parseAttributions(parser, LaunchOp::getWorkgroupKeyword(),
regionArguments)))
return failure();
// Store the number of operands we just parsed as the number of workgroup
// memory attributions.
unsigned numWorkgroupAttrs =
regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(numWorkgroupAttrs));
// Parse private memory attributions.
if (failed(parseAttributions(parser, LaunchOp::getPrivateKeyword(),
regionArguments)))
return failure();
// Introduce the body region and parse it. The region has
// kNumConfigRegionAttributes arguments that correspond to
// block/thread identifiers and grid/block sizes, all having `index` type.
Region *body = result.addRegion();
if (parser.parseRegion(*body, regionArguments) ||
parser.parseOptionalAttrDict(result.attributes))
return failure();
SmallVector<int32_t, 8> segmentSizes(8, 1);
segmentSizes.front() = asyncDependencies.size();
segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
return success();
}
/// Simplify the gpu.launch when the range of a thread or block ID is
/// trivially known to be one.
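///
/// For example (illustrative names), if the launch has
/// `threads(%tx, %ty, %tz) in (%sx = %c1, ...)`, every use of `%tx` in the
/// body can be replaced by a constant zero.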
struct FoldLaunchArguments : public OpRewritePattern<LaunchOp> {
using OpRewritePattern<LaunchOp>::OpRewritePattern;
LogicalResult matchAndRewrite(LaunchOp op,
PatternRewriter &rewriter) const override {
// If the range implies a single value for `id`, replace `id`'s uses by
// zero.
Value zero;
bool simplified = false;
auto constPropIdUses = [&](Value id, Value size) {
// Check if size is trivially one.
if (!matchPattern(size, m_One()))
return;
if (id.getUses().empty())
return;
if (!simplified) {
// Create a zero value the first time.
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointToStart(&op.getBody().front());
zero =
rewriter.create<arith::ConstantIndexOp>(op.getLoc(), /*value=*/0);
}
rewriter.replaceAllUsesWith(id, zero);
simplified = true;
};
constPropIdUses(op.getBlockIds().x, op.getGridSizeX());
constPropIdUses(op.getBlockIds().y, op.getGridSizeY());
constPropIdUses(op.getBlockIds().z, op.getGridSizeZ());
constPropIdUses(op.getThreadIds().x, op.getBlockSizeX());
constPropIdUses(op.getThreadIds().y, op.getBlockSizeY());
constPropIdUses(op.getThreadIds().z, op.getBlockSizeZ());
return success(simplified);
}
};
void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites,
MLIRContext *context) {
rewrites.add<FoldLaunchArguments>(context);
}
/// Adds a new block argument that corresponds to buffers located in
/// workgroup memory.
BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
auto attrName = getNumWorkgroupAttributionsAttrName();
auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName);
(*this)->setAttr(attrName,
IntegerAttr::get(attr.getType(), attr.getValue() + 1));
return getBody().insertArgument(
LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
}
/// Adds a new block argument that corresponds to buffers located in
/// private memory.
BlockArgument LaunchOp::addPrivateAttribution(Type type, Location loc) {
  // Buffers in private memory always come after buffers in workgroup memory.
return getBody().addArgument(type, loc);
}
//===----------------------------------------------------------------------===//
// LaunchFuncOp
//===----------------------------------------------------------------------===//
void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
                         GPUFuncOp kernelFunc, KernelDim3 gridSize,
                         KernelDim3 blockSize, Value dynamicSharedMemorySize,
                         ValueRange kernelOperands, Type asyncTokenType,
                         ValueRange asyncDependencies,
                         std::optional<KernelDim3> clusterSize) {
  result.addOperands(asyncDependencies);
  if (asyncTokenType)
    result.types.push_back(builder.getType<AsyncTokenType>());
  // Add grid and block sizes as op operands, followed by the data operands.
  result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x,
                      blockSize.y, blockSize.z});
if (clusterSize.has_value())
result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
result.addOperands(kernelOperands);
auto kernelModule = kernelFunc->getParentOfType<GPUModuleOp>();
auto kernelSymbol =
SymbolRefAttr::get(kernelModule.getNameAttr(),
{SymbolRefAttr::get(kernelFunc.getNameAttr())});
Properties &prop = result.getOrAddProperties<Properties>();
prop.kernel = kernelSymbol;
size_t segmentSizesLen = std::size(prop.operandSegmentSizes);
// Initialize the segment sizes to 1.
for (auto &sz : prop.operandSegmentSizes)
sz = 1;
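  // The segment layout mirrors the operand order above: async dependencies,
  // grid sizes (x, y, z), block sizes (x, y, z), optional cluster sizes
  // (x, y, z), the optional dynamic shared memory size, the kernel operands,
  // and the async object (always empty in this overload).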
prop.operandSegmentSizes[0] = asyncDependencies.size();
if (!clusterSize.has_value()) {
prop.operandSegmentSizes[segmentSizesLen - 4] = 0;
prop.operandSegmentSizes[segmentSizesLen - 5] = 0;
prop.operandSegmentSizes[segmentSizesLen - 6] = 0;
}
prop.operandSegmentSizes[segmentSizesLen - 3] =
dynamicSharedMemorySize ? 1 : 0;
prop.operandSegmentSizes[segmentSizesLen - 2] =
static_cast<int32_t>(kernelOperands.size());
prop.operandSegmentSizes[segmentSizesLen - 1] = 0;
}
void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
                         SymbolRefAttr kernel, KernelDim3 gridSize,
                         KernelDim3 blockSize, Value dynamicSharedMemorySize,
                         ValueRange kernelOperands, Value asyncObject,
                         std::optional<KernelDim3> clusterSize) {
  // Add grid and block sizes as op operands, followed by the data operands.
  result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x,
                      blockSize.y, blockSize.z});
if (clusterSize.has_value())
result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
result.addOperands(kernelOperands);
if (asyncObject)
result.addOperands(asyncObject);
Properties &prop = result.getOrAddProperties<Properties>();
prop.kernel = kernel;
size_t segmentSizesLen = std::size(prop.operandSegmentSizes);
// Initialize the segment sizes to 1.
for (auto &sz : prop.operandSegmentSizes)
sz = 1;
prop.operandSegmentSizes[0] = 0;
if (!clusterSize.has_value()) {
prop.operandSegmentSizes[segmentSizesLen - 4] = 0;
prop.operandSegmentSizes[segmentSizesLen - 5] = 0;
prop.operandSegmentSizes[segmentSizesLen - 6] = 0;
}
prop.operandSegmentSizes[segmentSizesLen - 3] =
dynamicSharedMemorySize ? 1 : 0;
prop.operandSegmentSizes[segmentSizesLen - 2] =
static_cast<int32_t>(kernelOperands.size());
prop.operandSegmentSizes[segmentSizesLen - 1] = asyncObject ? 1 : 0;
}
StringAttr LaunchFuncOp::getKernelModuleName() {
return getKernel().getRootReference();
}
StringAttr LaunchFuncOp::getKernelName() {
return getKernel().getLeafReference();
}
unsigned LaunchFuncOp::getNumKernelOperands() {
return getKernelOperands().size();
}
Value LaunchFuncOp::getKernelOperand(unsigned i) {
return getKernelOperands()[i];
}
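// The launch-configuration operands follow the async dependencies in the
// order gridSize{x,y,z}, blockSize{x,y,z} and, when present,
// clusterSize{x,y,z}; the accessors below index into that layout.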
KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
}
KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[3], operands[4], operands[5]};
}
KernelDim3 LaunchFuncOp::getClusterSizeOperandValues() {
assert(hasClusterSize() &&
"cluster size is not set, check hasClusterSize() first");
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[6], operands[7], operands[8]};
}
LogicalResult LaunchFuncOp::verify() {
auto module = (*this)->getParentOfType<ModuleOp>();
if (!module)
return emitOpError("expected to belong to a module");
if (!module->getAttrOfType<UnitAttr>(
GPUDialect::getContainerModuleAttrName()))
return emitOpError("expected the closest surrounding module to have the '" +
GPUDialect::getContainerModuleAttrName() +
"' attribute");
if (hasClusterSize()) {
if (getClusterSizeY().getType() != getClusterSizeX().getType() ||
getClusterSizeZ().getType() != getClusterSizeX().getType())
      return emitOpError()
             << "expects types of the cluster dimensions to be the same";
}
return success();
}
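/// Parses the optional type of the launch dimension values. The type defaults
/// to `index` when no `:` is present. If a cluster-size operand was parsed,
/// its x/y/z types are set to the same dimension type.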
static ParseResult
parseLaunchDimType(OpAsmParser &parser, Type &dimTy,
std::optional<OpAsmParser::UnresolvedOperand> clusterValue,
Type &clusterXTy, Type &clusterYTy, Type &clusterZTy) {
if (succeeded(parser.parseOptionalColon())) {
if (parser.parseType(dimTy))
return failure();
} else {
dimTy = IndexType::get(parser.getContext());
}
if (clusterValue.has_value()) {
clusterXTy = clusterYTy = clusterZTy = dimTy;
}
return success();
}
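/// Prints the launch dimension type, eliding it when it is the default
/// `index` type.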
static void printLaunchDimType(OpAsmPrinter &printer, Operation *op, Type dimTy,
Value clusterValue, Type clusterXTy,
Type clusterYTy, Type clusterZTy) {
if (!dimTy.isIndex())
printer << ": " << dimTy;
}
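/// Parses the optional kernel argument list of gpu.launch_func, e.g.
/// `args(%arg0 : f32, %arg1 : memref<?xf32>)`. Succeeds with empty argument
/// lists when the `args` keyword is absent.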
static ParseResult parseLaunchFuncOperands(
OpAsmParser &parser,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &argNames,
SmallVectorImpl<Type> &argTypes) {
if (parser.parseOptionalKeyword("args"))
return success();
auto parseElement = [&]() -> ParseResult {
return failure(parser.parseOperand(argNames.emplace_back()) ||
parser.parseColonType(argTypes.emplace_back()));
};
return parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Paren,
parseElement, " in argument list");
}
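/// Prints the kernel argument list in the `args(...)` form; nothing is
/// printed when the list is empty.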
static void printLaunchFuncOperands(OpAsmPrinter &printer, Operation *,
OperandRange operands, TypeRange types) {
if (operands.empty())
return;
printer << "args(";
llvm::interleaveComma(llvm::zip(operands, types), printer,
[&](const auto &pair) {
printer.printOperand(std::get<0>(pair));
printer << " : ";
printer.printType(std::get<1>(pair));
});
printer << ")";
}
//===----------------------------------------------------------------------===//
// ShuffleOp
//===----------------------------------------------------------------------===//
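/// Builds a shuffle op from constant offset and width values, materializing
/// them as `arith.constant` i32 operands.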
void ShuffleOp::build(OpBuilder &builder, OperationState &result, Value value,
int32_t offset, int32_t width, ShuffleMode mode) {
build(builder, result, value,
builder.create<arith::ConstantOp>(result.location,
builder.getI32IntegerAttr(offset)),
builder.create<arith::ConstantOp>(result.location,
builder.getI32IntegerAttr(width)),
mode);
}
//===----------------------------------------------------------------------===//
// BarrierOp
//===----------------------------------------------------------------------===//
namespace {
/// Erase a gpu.barrier that is directly followed by another gpu.barrier; the
/// threads are already synchronized.
LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op,
PatternRewriter &rewriter) {
if (isa_and_nonnull<BarrierOp>(op->getNextNode())) {
rewriter.eraseOp(op);
return success();
}
return failure();
}
} // end anonymous namespace
void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add(eraseRedundantGpuBarrierOps);
}
//===----------------------------------------------------------------------===//
// GPUFuncOp
//===----------------------------------------------------------------------===//
/// Adds a new block argument that corresponds to buffers located in
/// workgroup memory.
BlockArgument GPUFuncOp::addWorkgroupAttribution(Type type, Location loc) {
auto attrName = getNumWorkgroupAttributionsAttrName();
auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName);
(*this)->setAttr(attrName,
IntegerAttr::get(attr.getType(), attr.getValue() + 1));
return getBody().insertArgument(
getFunctionType().getNumInputs() + attr.getInt(), type, loc);
}
/// Adds a new block argument that corresponds to buffers located in
/// private memory.
BlockArgument GPUFuncOp::addPrivateAttribution(Type type, Location loc) {
  // Buffers in private memory always come after buffers in workgroup memory.
return getBody().addArgument(type, loc);
}
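/// Builds a gpu.func with the given signature and memory attributions. The
/// entry block receives one argument per function input, followed by the
/// workgroup and then the private attribution arguments.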
void GPUFuncOp::build(OpBuilder &builder, OperationState &result,
StringRef name, FunctionType type,
TypeRange workgroupAttributions,
TypeRange privateAttributions,
ArrayRef<NamedAttribute> attrs) {
result.addAttribute(SymbolTable::getSymbolAttrName(),
builder.getStringAttr(name));
result.addAttribute(getFunctionTypeAttrName(result.name),
TypeAttr::get(type));
result.addAttribute(getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(workgroupAttributions.size()));
result.addAttributes(attrs);
Region *body = result.addRegion();
Block *entryBlock = new Block;
// TODO: Allow passing in proper locations here.
for (Type argTy : type.getInputs())
entryBlock->addArgument(argTy, result.location);
for (Type argTy : workgroupAttributions)
entryBlock->addArgument(argTy, result.location);
for (Type argTy : privateAttributions)
entryBlock->addArgument(argTy, result.location);
body->getBlocks().push_back(entryBlock);
}
/// Parses a GPU function memory attribution.
///
/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
/// (`private` `(` ssa-id-and-type-list `)`)?
///
/// Note that this function parses only one of the two similar parts, with the
/// keyword provided as argument.
static ParseResult
parseAttributions(OpAsmParser &parser, StringRef keyword,
SmallVectorImpl<OpAsmParser::Argument> &args,
Attribute &attributionAttrs) {
// If we could not parse the keyword, just assume empty list and succeed.
if (failed(parser.parseOptionalKeyword(keyword)))
return success();
size_t existingArgs = args.size();
ParseResult result =
parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
/*allowType=*/true, /*allowAttrs=*/true);
if (failed(result))
return result;
bool hadAttrs = llvm::any_of(ArrayRef(args).drop_front(existingArgs),
[](const OpAsmParser::Argument &arg) -> bool {
return arg.attrs && !arg.attrs.empty();
});
if (!hadAttrs) {
attributionAttrs = nullptr;
return result;
}
Builder &builder = parser.getBuilder();
SmallVector<Attribute> attributionAttrsVec;
for (const auto &argument : ArrayRef(args).drop_front(existingArgs)) {
if (!argument.attrs)
attributionAttrsVec.push_back(builder.getDictionaryAttr({}));
else
attributionAttrsVec.push_back(argument.attrs);
}
attributionAttrs = builder.getArrayAttr(attributionAttrsVec);
return result;
}
/// Parses a GPU function.
///
/// <operation> ::= `gpu.func` symbol-ref-id `(` argument-list `)`
/// (`->` function-result-list)? memory-attribution `kernel`?
/// function-attributes? region
ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) {
SmallVector<OpAsmParser::Argument> entryArgs;
SmallVector<DictionaryAttr> resultAttrs;
SmallVector<Type> resultTypes;
bool isVariadic;
// Parse the function name.
StringAttr nameAttr;
if (parser.parseSymbolName(nameAttr, ::mlir::SymbolTable::getSymbolAttrName(),
result.attributes))
return failure();
auto signatureLocation = parser.getCurrentLocation();
if (failed(function_interface_impl::parseFunctionSignature(
parser, /*allowVariadic=*/false, entryArgs, isVariadic, resultTypes,
resultAttrs)))
return failure();
if (!entryArgs.empty() && entryArgs[0].ssaName.name.empty())
return parser.emitError(signatureLocation)
<< "gpu.func requires named arguments";
// Construct the function type. More types will be added to the region, but
// not to the function type.
Builder &builder = parser.getBuilder();
SmallVector<Type> argTypes;
for (auto &arg : entryArgs)
argTypes.push_back(arg.type);
auto type = builder.getFunctionType(argTypes, resultTypes);
result.addAttribute(getFunctionTypeAttrName(result.name),
TypeAttr::get(type));
function_interface_impl::addArgAndResultAttrs(
builder, result, entryArgs, resultAttrs, getArgAttrsAttrName(result.name),
getResAttrsAttrName(result.name));
Attribute workgroupAttributionAttrs;
// Parse workgroup memory attributions.
if (failed(parseAttributions(parser, GPUFuncOp::getWorkgroupKeyword(),
entryArgs, workgroupAttributionAttrs)))
return failure();
  // Store the number of entry arguments we just parsed as the number of
  // workgroup memory attributions.
unsigned numWorkgroupAttrs = entryArgs.size() - type.getNumInputs();
result.addAttribute(GPUFuncOp::getNumWorkgroupAttributionsAttrName(),
builder.getI64IntegerAttr(numWorkgroupAttrs));
if (workgroupAttributionAttrs)
result.addAttribute(GPUFuncOp::getWorkgroupAttribAttrsAttrName(result.name),
workgroupAttributionAttrs);
Attribute privateAttributionAttrs;
// Parse private memory attributions.
if (failed(parseAttributions(parser, GPUFuncOp::getPrivateKeyword(),
entryArgs, privateAttributionAttrs)))
return failure();
if (privateAttributionAttrs)
result.addAttribute(GPUFuncOp::getPrivateAttribAttrsAttrName(result.name),
privateAttributionAttrs);
// Parse the kernel attribute if present.
if (succeeded(parser.parseOptionalKeyword(GPUFuncOp::getKernelKeyword())))
result.addAttribute(GPUDialect::getKernelFuncAttrName(),
builder.getUnitAttr());
// Parse attributes.
if (failed(parser.parseOptionalAttrDictWithKeyword(result.attributes)))
return failure();
// Parse the region. If no argument names were provided, take all names
// (including those of attributions) from the entry block.
auto *body = result.addRegion();
return parser.parseRegion(*body, entryArgs);
}
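/// Prints a GPU function memory attribution list under the given keyword,
/// including per-attribution attribute dictionaries when present.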
static void printAttributions(OpAsmPrinter &p, StringRef keyword,
ArrayRef<BlockArgument> values,
ArrayAttr attributes) {
if (values.empty())
return;
p << ' ' << keyword << '(';
llvm::interleaveComma(
llvm::enumerate(values), p, [&p, attributes](auto pair) {
BlockArgument v = pair.value();
p << v << " : " << v.getType();
size_t attributionIndex = pair.index();
DictionaryAttr attrs;
if (attributes && attributionIndex < attributes.size())
attrs = llvm::cast<DictionaryAttr>(attributes[attributionIndex]);
if (attrs)
p.printOptionalAttrDict(attrs.getValue());
});
p << ')';
}
void GPUFuncOp::print(OpAsmPrinter &p) {
p << ' ';
p.printSymbolName(getName());
FunctionType type = getFunctionType();
function_interface_impl::printFunctionSignature(p, *this, type.getInputs(),
/*isVariadic=*/false,
type.getResults());
printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions(),
getWorkgroupAttribAttrs().value_or(nullptr));
printAttributions(p, getPrivateKeyword(), getPrivateAttributions(),
getPrivateAttribAttrs().value_or(nullptr));
if (isKernel())
p << ' ' << getKernelKeyword();
function_interface_impl::printFunctionAttributes(
p, *this,
{getNumWorkgroupAttributionsAttrName(),
GPUDialect::getKernelFuncAttrName(), getFunctionTypeAttrName(),
getArgAttrsAttrName(), getResAttrsAttrName(),
getWorkgroupAttribAttrsAttrName(), getPrivateAttribAttrsAttrName()});
p << ' ';
p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
}
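/// Returns the attribute dictionary of the attribution at `index` stored
/// under `attrName`, or a null attribute if none is attached.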
static DictionaryAttr getAttributionAttrs(GPUFuncOp op, unsigned index,
StringAttr attrName) {
auto allAttrs = llvm::dyn_cast_or_null<ArrayAttr>(op->getAttr(attrName));
if (!allAttrs || index >= allAttrs.size())
return DictionaryAttr();
return llvm::cast<DictionaryAttr>(allAttrs[index]);
}
DictionaryAttr GPUFuncOp::getworkgroupAttributionAttrs(unsigned index) {
return getAttributionAttrs(*this, index, getWorkgroupAttribAttrsAttrName());
}
DictionaryAttr GPUFuncOp::getPrivateAttributionAttrs(unsigned index) {
return getAttributionAttrs(*this, index, getPrivateAttribAttrsAttrName());
}
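/// Sets the attribute dictionary of the attribution at `index` under
/// `attrName`, padding the underlying array with empty dictionaries as
/// needed; a null `value` resets the entry to an empty dictionary.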
static void setAttributionAttrs(GPUFuncOp op, unsigned index,
DictionaryAttr value, StringAttr attrName) {
MLIRContext *ctx = op.getContext();
auto allAttrs = llvm::dyn_cast_or_null<ArrayAttr>(op->getAttr(attrName));
SmallVector<Attribute> elements;
if (allAttrs)
elements.append(allAttrs.begin(), allAttrs.end());
while (elements.size() <= index)
elements.push_back(DictionaryAttr::get(ctx));
if (!value)
elements[index] = DictionaryAttr::get(ctx);
else
elements[index] = value;
ArrayAttr newValue = ArrayAttr::get(ctx, elements);
op->setAttr(attrName, newValue);
}
void GPUFuncOp::setworkgroupAttributionAttrs(unsigned index,
DictionaryAttr value) {
setAttributionAttrs(*this, index, value, getWorkgroupAttribAttrsAttrName());
}
void GPUFuncOp::setPrivateAttributionAttrs(unsigned int index,
DictionaryAttr value) {
setAttributionAttrs(*this, index, value, getPrivateAttribAttrsAttrName());
}
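/// Returns the named attribute from the dictionary of the attribution at
/// `index`, or a null attribute if the dictionary or the entry is missing.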
static Attribute getAttributionAttr(GPUFuncOp op, unsigned index,
StringAttr name, StringAttr attrsName) {
DictionaryAttr dict = getAttributionAttrs(op, index, attrsName);
if (!dict)
return Attribute();
return dict.get(name);
}
Attribute GPUFuncOp::getWorkgroupAttributionAttr(unsigned index,
StringAttr name) {
assert(index < getNumWorkgroupAttributions() &&
"index must map to a workgroup attribution");
return getAttributionAttr(*this, index, name,
getWorkgroupAttribAttrsAttrName());
}
Attribute GPUFuncOp::getPrivateAttributionAttr(unsigned index,
StringAttr name) {
assert(index < getNumPrivateAttributions() &&
"index must map to a private attribution");
return getAttributionAttr(*this, index, name,
getPrivateAttribAttrsAttrName());
}
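/// Inserts, replaces, or (when `value` is null) removes the named attribute
/// in the dictionary of the attribution at `index`, keeping the dictionary
/// entries sorted.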
static void setAttributionAttr(GPUFuncOp op, unsigned index, StringAttr name,
Attribute value, StringAttr attrsName) {
MLIRContext *ctx = op.getContext();
SmallVector<NamedAttribute> elems;
DictionaryAttr oldDict = getAttributionAttrs(op, index, attrsName);
if (oldDict)
elems.append(oldDict.getValue().begin(), oldDict.getValue().end());
bool found = false;
bool mustSort = true;
for (unsigned i = 0, e = elems.size(); i < e; ++i) {
if (elems[i].getName() == name) {
found = true;
if (!value) {
std::swap(elems[i], elems[elems.size() - 1]);
elems.pop_back();
} else {
mustSort = false;
elems[i] = NamedAttribute(elems[i].getName(), value);
}
break;
}
}
if (!found) {
if (!value)
return;
elems.emplace_back(name, value);
}
if (mustSort) {
DictionaryAttr::sortInPlace(elems);
}
auto newDict = DictionaryAttr::getWithSorted(ctx, elems);
setAttributionAttrs(op, index, newDict, attrsName);
}
void GPUFuncOp::setWorkgroupAttributionAttr(unsigned index, StringAttr name,
Attribute value) {
assert(index < getNumWorkgroupAttributions() &&
"index must map to a workgroup attribution");
setAttributionAttr(*this, index, name, value,
getWorkgroupAttribAttrsAttrName());
}
void GPUFuncOp::setPrivateAttributionAttr(unsigned index, StringAttr name,
Attribute value) {
assert(index < getNumPrivateAttributions() &&
"index must map to a private attribution");
setAttributionAttr(*this, index, name, value,
getPrivateAttribAttrsAttrName());
}
LogicalResult GPUFuncOp::verifyType() {
if (isKernel() && getFunctionType().getNumResults() != 0)
return emitOpError() << "expected void return type for kernel function";
return success();
}
/// Verifies the body of the function.
LogicalResult GPUFuncOp::verifyBody() {
if (empty())
return emitOpError() << "expected body with at least one block";
unsigned numFuncArguments = getNumArguments();
unsigned numWorkgroupAttributions = getNumWorkgroupAttributions();
unsigned numBlockArguments = front().getNumArguments();
if (numBlockArguments < numFuncArguments + numWorkgroupAttributions)
return emitOpError() << "expected at least "
<< numFuncArguments + numWorkgroupAttributions
<< " arguments to body region";
ArrayRef<Type> funcArgTypes = getFunctionType().getInputs();
for (unsigned i = 0; i < numFuncArguments; ++i) {
Type blockArgType = front().getArgument(i).getType();
if (funcArgTypes[i] != blockArgType)
return emitOpError() << "expected body region argument #" << i
<< " to be of type " << funcArgTypes[i] << ", got "
<< blockArgType;
}
if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
GPUDialect::getWorkgroupAddressSpace())) ||
failed(verifyAttributions(getOperation(), getPrivateAttributions(),
GPUDialect::getPrivateAddressSpace())))
return failure();
return success();
}
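/// Verifies that the known-launch-size attribute `attrName`, if present, is a
/// dense i32 array with exactly three elements.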
static LogicalResult verifyKnownLaunchSizeAttr(gpu::GPUFuncOp op,
StringRef attrName) {
auto maybeAttr = op->getAttr(attrName);
if (!maybeAttr)
return success();
auto array = llvm::dyn_cast<DenseI32ArrayAttr>(maybeAttr);
if (!array)
return op.emitOpError(attrName + " must be a dense i32 array");
if (array.size() != 3)
return op.emitOpError(attrName + " must contain exactly 3 elements");
return success();
}
LogicalResult GPUFuncOp::verify() {
if (failed(verifyKnownLaunchSizeAttr(*this, getKnownBlockSizeAttrName())))
return failure();
if (failed(verifyKnownLaunchSizeAttr(*this, getKnownGridSizeAttrName())))
return failure();
return success();
}
//===----------------------------------------------------------------------===//
// ReturnOp
//===----------------------------------------------------------------------===//
LogicalResult gpu::ReturnOp::verify() {
GPUFuncOp function = (*this)->getParentOfType<GPUFuncOp>();
FunctionType funType = function.getFunctionType();
if (funType.getNumResults() != getOperands().size())
return emitOpError()
.append("expected ", funType.getNumResults(), " result operands")
.attachNote(function.getLoc())
.append("return type declared here");
for (const auto &pair : llvm::enumerate(
llvm::zip(function.getFunctionType().getResults(), getOperands()))) {
auto [type, operand] = pair.value();
if (type != operand.getType())
return emitOpError() << "unexpected type `" << operand.getType()
<< "' for operand #" << pair.index();
}
return success();
}
//===----------------------------------------------------------------------===//
// GPUModuleOp
//===----------------------------------------------------------------------===//
void GPUModuleOp::build(OpBuilder &builder, OperationState &result,
StringRef name, ArrayAttr targets) {
ensureTerminator(*result.addRegion(), builder, result.location);
result.attributes.push_back(builder.getNamedAttr(
::mlir::SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)));
if (targets)
result.getOrAddProperties<Properties>().targets = targets;
}
void GPUModuleOp::build(OpBuilder &builder, OperationState &result,
StringRef name, ArrayRef<Attribute> targets) {
build(builder, result, name,
targets.empty() ? ArrayAttr() : builder.getArrayAttr(targets));
}
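/// Parses a gpu.module op: a symbol name, an optional array of target
/// attributes, an optional keyword attribute dictionary, and the module body
/// region.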
ParseResult GPUModuleOp::parse(OpAsmParser &parser, OperationState &result) {
StringAttr nameAttr;
ArrayAttr targetsAttr;
if (parser.parseSymbolName(nameAttr, mlir::SymbolTable::getSymbolAttrName(),
result.attributes))
return failure();
// Parse the optional array of target attributes.
OptionalParseResult targetsAttrResult =
parser.parseOptionalAttribute(targetsAttr, Type{});
if (targetsAttrResult.has_value()) {
if (failed(*targetsAttrResult)) {
return failure();
}
result.getOrAddProperties<Properties>().targets = targetsAttr;
}
// If module attributes are present, parse them.
if (parser.parseOptionalAttrDictWithKeyword(result.attributes))
return failure();
// Parse the module body.
auto *body = result.addRegion();
if (parser.parseRegion(*body, {}))
return failure();
// Ensure that this module has a valid terminator.
GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location);
return success();
}
void GPUModuleOp::print(OpAsmPrinter &p) {
p << ' ';
p.printSymbolName(getName());
if (Attribute attr = getTargetsAttr()) {
p << ' ';
p.printAttribute(attr);
p << ' ';
}
p.printOptionalAttrDictWithKeyword(
(*this)->getAttrs(),
{mlir::SymbolTable::getSymbolAttrName(), getTargetsAttrName()});
p << ' ';
p.printRegion(getRegion(), /*printEntryBlockArgs=*/false,
/*printBlockTerminators=*/false);
}
bool GPUModuleOp::hasTarget(Attribute target) {
if (ArrayAttr targets = getTargetsAttr())
return llvm::count(targets.getValue(), target);
return false;
}
void GPUModuleOp::setTargets(ArrayRef<TargetAttrInterface> targets) {
ArrayAttr &targetsAttr = getProperties().targets;
SmallVector<Attribute> targetsVector(targets);
targetsAttr = ArrayAttr::get(getContext(), targetsVector);
}
//===----------------------------------------------------------------------===//
// GPUBinaryOp
//===----------------------------------------------------------------------===//
void BinaryOp::build(OpBuilder &builder, OperationState &result, StringRef name,
Attribute offloadingHandler, ArrayAttr objects) {
auto &properties = result.getOrAddProperties<Properties>();
result.attributes.push_back(builder.getNamedAttr(
SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)));
properties.objects = objects;
if (offloadingHandler)
properties.offloadingHandler = offloadingHandler;
else
properties.offloadingHandler = builder.getAttr<SelectObjectAttr>(nullptr);
}
void BinaryOp::build(OpBuilder &builder, OperationState &result, StringRef name,
Attribute offloadingHandler, ArrayRef<Attribute> objects) {
build(builder, result, name, offloadingHandler,
objects.empty() ? ArrayAttr() : builder.getArrayAttr(objects));
}
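/// Parses the optional `<handler>` offloading handler of a gpu.binary op,
/// defaulting to a `SelectObjectAttr` built with a null target when none is
/// provided.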
static ParseResult parseOffloadingHandler(OpAsmParser &parser,
Attribute &offloadingHandler) {
if (succeeded(parser.parseOptionalLess())) {
if (parser.parseAttribute(offloadingHandler))
return failure();
if (parser.parseGreater())
return failure();
}
if (!offloadingHandler)
offloadingHandler = parser.getBuilder().getAttr<SelectObjectAttr>(nullptr);
return success();
}
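/// Prints the offloading handler only when it differs from the default
/// `SelectObjectAttr`.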
static void printOffloadingHandler(OpAsmPrinter &printer, Operation *op,
Attribute offloadingHandler) {
if (offloadingHandler != SelectObjectAttr::get(op->getContext(), nullptr))
printer << '<' << offloadingHandler << '>';
}
//===----------------------------------------------------------------------===//
// GPUMemcpyOp
//===----------------------------------------------------------------------===//
LogicalResult MemcpyOp::verify() {
auto srcType = getSrc().getType();
auto dstType = getDst().getType();
if (getElementTypeOrSelf(srcType) != getElementTypeOrSelf(dstType))
return emitOpError("arguments have incompatible element type");
if (failed(verifyCompatibleShape(srcType, dstType)))
return emitOpError("arguments have incompatible shape");
return success();
}
namespace {
/// Erases a common case of copy ops where a destination value is used only by
/// the copy op, alloc and dealloc ops.
struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
using OpRewritePattern<MemcpyOp>::OpRewritePattern;
LogicalResult matchAndRewrite(MemcpyOp op,
PatternRewriter &rewriter) const override {
Value dest = op.getDst();
Operation *destDefOp = dest.getDefiningOp();
// `dest` must be defined by an op having Allocate memory effect in order to
// perform the folding.
if (!destDefOp ||
!hasSingleEffect<MemoryEffects::Allocate>(destDefOp, dest))
return failure();
// We can erase `op` iff `dest` has no other use apart from its
// use by `op` and dealloc ops.
if (llvm::any_of(dest.getUsers(), [op, dest](Operation *user) {
return user != op &&
!hasSingleEffect<MemoryEffects::Free>(user, dest);
}))
return failure();
// We can perform the folding if and only if op has a single async
// dependency and produces an async token as result, or if it does not have
// any async dependency and does not produce any async token result.
if (op.getAsyncDependencies().size() > 1 ||
((op.getAsyncDependencies().empty() && op.getAsyncToken()) ||
(!op.getAsyncDependencies().empty() && !op.getAsyncToken())))
return failure();
rewriter.replaceOp(op, op.getAsyncDependencies());
return success();
}
};
} // end anonymous namespace
void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add<EraseTrivialCopyOp>(context);
}
//===----------------------------------------------------------------------===//
// GPU_SubgroupMmaLoadMatrixOp
//===----------------------------------------------------------------------===//
LogicalResult SubgroupMmaLoadMatrixOp::verify() {
auto srcType = getSrcMemref().getType();
auto resType = getRes().getType();
auto resMatrixType = llvm::cast<gpu::MMAMatrixType>(resType);
auto operand = resMatrixType.getOperand();
auto srcMemrefType = llvm::cast<MemRefType>(srcType);
if (!isLastMemrefDimUnitStride(srcMemrefType))
    return emitError(
        "expected most minor dimension of source memref to have unit stride");
if (!operand.equals("AOp") && !operand.equals("BOp") &&
!operand.equals("COp"))
return emitError("only AOp, BOp and COp can be loaded");
return success();
}
//===----------------------------------------------------------------------===//
// GPU_SubgroupMmaStoreMatrixOp
//===----------------------------------------------------------------------===//
LogicalResult SubgroupMmaStoreMatrixOp::verify() {
auto srcType = getSrc().getType();
auto dstType = getDstMemref().getType();
auto srcMatrixType = llvm::cast<gpu::MMAMatrixType>(srcType);
auto dstMemrefType = llvm::cast<MemRefType>(dstType);
if (!isLastMemrefDimUnitStride(dstMemrefType))
    return emitError("expected most minor dimension of destination memref to "
                     "have unit stride");
if (!srcMatrixType.getOperand().equals("COp"))
return emitError(
"expected the operand matrix being stored to have 'COp' operand type");
return success();
}
//===----------------------------------------------------------------------===//
// GPU_SubgroupMmaComputeOp
//===----------------------------------------------------------------------===//
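/// Verifies that the A, B, and C operands carry the expected MMA operand
/// roles and have matmul-compatible shapes: A is MxK, B is KxN, and C is MxN.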
LogicalResult SubgroupMmaComputeOp::verify() {
enum OperandMap { A, B, C };
SmallVector<MMAMatrixType, 3> opTypes;
opTypes.push_back(llvm::cast<MMAMatrixType>(getOpA().getType()));
opTypes.push_back(llvm::cast<MMAMatrixType>(getOpB().getType()));
opTypes.push_back(llvm::cast<MMAMatrixType>(getOpC().getType()));
if (!opTypes[A].getOperand().equals("AOp") ||
!opTypes[B].getOperand().equals("BOp") ||
!opTypes[C].getOperand().equals("COp"))
return emitError("operands must be in the order AOp, BOp, COp");
ArrayRef<int64_t> aShape, bShape, cShape;
aShape = opTypes[A].getShape();
bShape = opTypes[B].getShape();
cShape = opTypes[C].getShape();
if (aShape[1] != bShape[0] || aShape[0] != cShape[0] ||
bShape[1] != cShape[1])
return emitError("operand shapes do not satisfy matmul constraints");
return success();
}
LogicalResult MemcpyOp::fold(FoldAdaptor adaptor,
SmallVectorImpl<::mlir::OpFoldResult> &results) {
return memref::foldMemRefCast(*this);
}
LogicalResult MemsetOp::fold(FoldAdaptor adaptor,
SmallVectorImpl<::mlir::OpFoldResult> &results) {
return memref::foldMemRefCast(*this);
}
//===----------------------------------------------------------------------===//
// GPU_WaitOp
//===----------------------------------------------------------------------===//
namespace {
/// Remove gpu.wait op use of gpu.wait op def without async dependencies.
/// %t = gpu.wait async [] // No async dependencies.
/// ... gpu.wait ... [%t, ...] // %t can be removed.
struct EraseRedundantGpuWaitOpPairs : public OpRewritePattern<WaitOp> {
public:
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(WaitOp op,
PatternRewriter &rewriter) const final {
auto predicate = [](Value value) {
auto waitOp = value.getDefiningOp<WaitOp>();
return waitOp && waitOp->getNumOperands() == 0;
};
if (llvm::none_of(op.getAsyncDependencies(), predicate))
return failure();
SmallVector<Value> validOperands;
for (Value operand : op->getOperands()) {
if (predicate(operand))
continue;
validOperands.push_back(operand);
}
rewriter.updateRootInPlace(op, [&]() { op->setOperands(validOperands); });
return success();
}
};
/// Simplify trivial gpu.wait ops for the following patterns.
/// 1. %t = gpu.wait async ... ops, where %t has no uses (regardless of async
/// dependencies).
/// 2. %t1 = gpu.wait async [%t0], in this case, we can replace uses of %t1 with
/// %t0.
/// 3. gpu.wait [] ops, i.e. gpu.wait ops that neither have any async
/// dependencies nor return any token.
struct SimplifyGpuWaitOp : public OpRewritePattern<WaitOp> {
public:
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(WaitOp op,
PatternRewriter &rewriter) const final {
// Erase gpu.wait ops that neither have any async dependencies nor return
// any async token.
if (op.getAsyncDependencies().empty() && !op.getAsyncToken()) {
rewriter.eraseOp(op);
return success();
}
// Replace uses of %t1 = gpu.wait async [%t0] ops with %t0 and erase the op.
if (llvm::hasSingleElement(op.getAsyncDependencies()) &&
op.getAsyncToken()) {
rewriter.replaceOp(op, op.getAsyncDependencies());
return success();
}
// Erase %t = gpu.wait async ... ops, where %t has no uses.
if (op.getAsyncToken() && op.getAsyncToken().use_empty()) {
rewriter.eraseOp(op);
return success();
}
return failure();
}
};
} // end anonymous namespace
void WaitOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add<EraseRedundantGpuWaitOpPairs, SimplifyGpuWaitOp>(context);
}
//===----------------------------------------------------------------------===//
// GPU_AllocOp
//===----------------------------------------------------------------------===//
LogicalResult AllocOp::verify() {
auto memRefType = llvm::cast<MemRefType>(getMemref().getType());
if (static_cast<int64_t>(getDynamicSizes().size()) !=
memRefType.getNumDynamicDims())
return emitOpError("dimension operand count does not equal memref "
"dynamic dimension count");
unsigned numSymbols = 0;
if (!memRefType.getLayout().isIdentity())
numSymbols = memRefType.getLayout().getAffineMap().getNumSymbols();
if (getSymbolOperands().size() != numSymbols) {
return emitOpError(
"symbol operand count does not equal memref symbol count");
}
return success();
}
namespace {
/// Folding of memref.dim(gpu.alloc(%size), %idx) -> %size similar to
/// `memref::AllocOp`.
struct SimplifyDimOfAllocOp : public OpRewritePattern<memref::DimOp> {
using OpRewritePattern<memref::DimOp>::OpRewritePattern;
LogicalResult matchAndRewrite(memref::DimOp dimOp,
PatternRewriter &rewriter) const override {
std::optional<int64_t> index = dimOp.getConstantIndex();
if (!index)
return failure();
auto memrefType = llvm::dyn_cast<MemRefType>(dimOp.getSource().getType());
if (!memrefType || !memrefType.isDynamicDim(index.value()))
return failure();
auto alloc = dimOp.getSource().getDefiningOp<AllocOp>();
if (!alloc)
return failure();
Value substituteOp = *(alloc.getDynamicSizes().begin() +
memrefType.getDynamicDimIndex(index.value()));
rewriter.replaceOp(dimOp, substituteOp);
return success();
}
};
} // namespace
void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add<SimplifyDimOfAllocOp>(context);
}
//===----------------------------------------------------------------------===//
// GPU object attribute
//===----------------------------------------------------------------------===//
LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
Attribute target, CompilationTarget format,
StringAttr object, DictionaryAttr properties) {
if (!target)
return emitError() << "the target attribute cannot be null";
if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
return success();
return emitError() << "the target attribute must implement or promise the "
"`gpu::TargetAttrInterface`";
}
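// Any attribute that implements or promises `TargetAttrInterface` satisfies the
// check above, e.g. (illustrative) #nvvm.target or #rocdl.target.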
namespace {
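/// Parses the object field of an ObjectAttr: an optional compilation-target
/// keyword followed by `=`, then the object string. When the keyword is
/// absent, the format defaults to `CompilationTarget::Fatbin`.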
LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
StringAttr &object) {
std::optional<CompilationTarget> formatResult;
StringRef enumKeyword;
auto loc = odsParser.getCurrentLocation();
if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
formatResult = CompilationTarget::Fatbin;
if (!formatResult &&
(formatResult =
gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
odsParser.parseEqual())
return odsParser.emitError(loc, "expected an equal sign");
if (!formatResult)
return odsParser.emitError(loc, "expected keyword for GPU object format");
FailureOr<StringAttr> objectResult =
FieldParser<StringAttr>::parse(odsParser);
if (failed(objectResult))
return odsParser.emitError(odsParser.getCurrentLocation(),
"failed to parse GPU_ObjectAttr parameter "
"'object' which is to be a `StringAttr`");
format = *formatResult;
object = *objectResult;
return success();
}
void printObject(AsmPrinter &odsParser, CompilationTarget format,
StringAttr object) {
if (format != CompilationTarget::Fatbin)
odsParser << stringifyEnum(format) << " = ";
odsParser << object;
}
} // namespace
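// Illustrative textual forms handled by the helpers above (the target
// attribute and enum keyword are placeholders; the surrounding syntax comes
// from the ODS assembly format): the default Fatbin format elides the keyword,
// as in
//   #gpu.object<#nvvm.target, "...">
// while other formats spell it explicitly, as in
//   #gpu.object<#nvvm.target, assembly = "...">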
//===----------------------------------------------------------------------===//
// GPU select object attribute
//===----------------------------------------------------------------------===//
LogicalResult
gpu::SelectObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
Attribute target) {
// Check `target`; it can be null, an integer attr, or a GPU target attribute.
if (target) {
if (auto intAttr = mlir::dyn_cast<IntegerAttr>(target)) {
if (intAttr.getInt() < 0) {
return emitError() << "the object index must be positive";
}
} else if (!target.hasPromiseOrImplementsInterface<TargetAttrInterface>()) {
return emitError()
<< "the target attribute must be a GPU Target attribute";
}
}
return success();
}
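// Attributes accepted by this verifier (illustrative): #gpu.select_object<0>
// selects an object by index and #gpu.select_object<#nvvm.target> selects by
// target; a negative index is rejected.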
//===----------------------------------------------------------------------===//
// DynamicSharedMemoryOp
//===----------------------------------------------------------------------===//
LogicalResult gpu::DynamicSharedMemoryOp::verify() {
if (!getOperation()->getParentWithTrait<OpTrait::SymbolTable>())
return emitOpError() << "must be inside an op with symbol table";
MemRefType memrefType = getResultMemref().getType();
// Check address space
if (!GPUDialect::hasWorkgroupMemoryAddressSpace(memrefType)) {
return emitOpError() << "address space must be "
<< gpu::AddressSpaceAttr::getMnemonic() << "<"
<< stringifyEnum(gpu::AddressSpace::Workgroup) << ">";
}
if (memrefType.hasStaticShape()) {
return emitOpError() << "result memref type must be memref<?xi8, "
"#gpu.address_space<workgroup>>";
}
return success();
}
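// A result type accepted by this verifier (illustrative use of the op):
//   %shmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>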
//===----------------------------------------------------------------------===//
// GPU target options
//===----------------------------------------------------------------------===//
TargetOptions::TargetOptions(
StringRef toolkitPath, ArrayRef<std::string> linkFiles,
StringRef cmdOptions, CompilationTarget compilationTarget,
function_ref<SymbolTable *()> getSymbolTableCallback)
: TargetOptions(TypeID::get<TargetOptions>(), toolkitPath, linkFiles,
cmdOptions, compilationTarget, getSymbolTableCallback) {}
TargetOptions::TargetOptions(
TypeID typeID, StringRef toolkitPath, ArrayRef<std::string> linkFiles,
StringRef cmdOptions, CompilationTarget compilationTarget,
function_ref<SymbolTable *()> getSymbolTableCallback)
: toolkitPath(toolkitPath.str()), linkFiles(linkFiles),
cmdOptions(cmdOptions.str()), compilationTarget(compilationTarget),
getSymbolTableCallback(getSymbolTableCallback), typeID(typeID) {}
TypeID TargetOptions::getTypeID() const { return typeID; }
StringRef TargetOptions::getToolkitPath() const { return toolkitPath; }
ArrayRef<std::string> TargetOptions::getLinkFiles() const { return linkFiles; }
StringRef TargetOptions::getCmdOptions() const { return cmdOptions; }
SymbolTable *TargetOptions::getSymbolTable() const {
return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
}
CompilationTarget TargetOptions::getCompilationTarget() const {
return compilationTarget;
}
CompilationTarget TargetOptions::getDefaultCompilationTarget() {
return CompilationTarget::Fatbin;
}
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
TargetOptions::tokenizeCmdOptions() const {
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
llvm::StringSaver stringSaver(options.first);
StringRef opts = cmdOptions;
// For a correct tokenization of the command line options, `opts` must be
// unquoted; otherwise the tokenization function returns a single token
// containing the entire quoted `cmdOptions`, which is not the desired
// behavior.
// Remove any quotes if they are at the beginning and end of the string:
if (!opts.empty() && opts.front() == '"' && opts.back() == '"')
opts.consume_front("\""), opts.consume_back("\"");
if (!opts.empty() && opts.front() == '\'' && opts.back() == '\'')
opts.consume_front("'"), opts.consume_back("'");
#ifdef _WIN32
llvm::cl::TokenizeWindowsCommandLine(opts, stringSaver, options.second,
/*MarkEOLs=*/false);
#else
llvm::cl::TokenizeGNUCommandLine(opts, stringSaver, options.second,
/*MarkEOLs=*/false);
#endif // _WIN32
return options;
}
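// Illustrative behavior (the flag names are placeholders): with `cmdOptions`
// set to "\"-O3 --chip=sm_90\"", the surrounding quotes are stripped and the
// returned vector contains {"-O3", "--chip=sm_90"}, with the token strings
// owned by the allocator in `first`.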
MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)
#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
#include "mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc"
#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"
#include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.cpp.inc"