mirror of
https://github.com/intel/llvm.git
synced 2026-02-05 13:21:04 +08:00
This prepares for the removal of llvm::Module and LLVMContext from the mlir::LLVMDialect. Reviewed By: rriddle Differential Revision: https://reviews.llvm.org/D85371
445 lines
19 KiB
C++
445 lines
19 KiB
C++
//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements a pass to convert gpu.launch_func ops into a sequence
// of GPU runtime calls. As most GPU runtimes do not have a stable published
// ABI, this pass uses a slim runtime layer that builds on top of the public
// API from GPU runtime headers.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
|
|
|
|
#include "../PassDetail.h"
|
|
#include "mlir/Dialect/GPU/GPUDialect.h"
|
|
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
|
#include "mlir/IR/Attributes.h"
|
|
#include "mlir/IR/Builders.h"
|
|
#include "mlir/IR/Function.h"
|
|
#include "mlir/IR/Module.h"
|
|
#include "mlir/IR/StandardTypes.h"
|
|
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/IR/DataLayout.h"
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/IR/Type.h"
|
|
#include "llvm/Support/Error.h"
|
|
#include "llvm/Support/FormatVariadic.h"
|
|
|
|
using namespace mlir;
|
|
|
|
// To avoid name mangling, these are defined in the mini-runtime file.
|
|
static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
|
|
static constexpr const char *kGpuModuleGetFunctionName =
|
|
"mgpuModuleGetFunction";
|
|
static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
|
|
static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate";
|
|
static constexpr const char *kGpuStreamSynchronizeName =
|
|
"mgpuStreamSynchronize";
|
|
static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
|
|
static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
|
|
|
|
namespace {
|
|
|
|
/// A pass to convert gpu.launch_func operations into a sequence of GPU
|
|
/// runtime calls. Currently it supports CUDA and ROCm (HIP).
|
|
///
|
|
/// In essence, a gpu.launch_func operations gets compiled into the following
|
|
/// sequence of runtime calls:
|
|
///
|
|
/// * moduleLoad -- loads the module given the cubin / hsaco data
|
|
/// * moduleGetFunction -- gets a handle to the actual kernel function
|
|
/// * getStreamHelper -- initializes a new compute stream on GPU
|
|
/// * launchKernel -- launches the kernel on a stream
|
|
/// * streamSynchronize -- waits for operations on the stream to finish
|
|
///
|
|
/// Intermediate data structures are allocated on the stack.
|
|
class GpuLaunchFuncToGpuRuntimeCallsPass
|
|
: public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
|
|
GpuLaunchFuncToGpuRuntimeCallsPass> {
|
|
private:
|
|
LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
|
|
|
|
void initializeCachedTypes() {
|
|
llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect);
|
|
llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect);
|
|
llvmPointerPointerType = llvmPointerType.getPointerTo();
|
|
llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect);
|
|
llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect);
|
|
llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect);
|
|
llvmIntPtrType = LLVM::LLVMType::getIntNTy(
|
|
llvmDialect, llvmDialect->getDataLayout().getPointerSizeInBits());
|
|
}
|
|
|
|
LLVM::LLVMType getVoidType() { return llvmVoidType; }
|
|
|
|
LLVM::LLVMType getPointerType() { return llvmPointerType; }
|
|
|
|
LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }
|
|
|
|
LLVM::LLVMType getInt8Type() { return llvmInt8Type; }
|
|
|
|
LLVM::LLVMType getInt32Type() { return llvmInt32Type; }
|
|
|
|
LLVM::LLVMType getInt64Type() { return llvmInt64Type; }
|
|
|
|
LLVM::LLVMType getIntPtrType() {
|
|
return LLVM::LLVMType::getIntNTy(
|
|
getLLVMDialect(),
|
|
getLLVMDialect()->getDataLayout().getPointerSizeInBits());
|
|
}
|
|
|
|
// Allocate a void pointer on the stack.
|
|
Value allocatePointer(OpBuilder &builder, Location loc) {
|
|
auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
|
|
builder.getI32IntegerAttr(1));
|
|
return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,
|
|
/*alignment=*/0);
|
|
}
|
|
|
|
void declareGpuRuntimeFunctions(Location loc);
|
|
void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
|
|
unsigned pos, Value one);
|
|
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
|
|
Value generateKernelNameConstant(StringRef moduleName, StringRef name,
|
|
Location loc, OpBuilder &builder);
|
|
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
|
|
|
|
public:
|
|
GpuLaunchFuncToGpuRuntimeCallsPass() = default;
|
|
GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
|
|
this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
|
|
}
|
|
|
|
// Run the dialect converter on the module.
|
|
void runOnOperation() override {
|
|
// Cache the LLVMDialect for the current module.
|
|
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
|
|
// Cache the used LLVM types.
|
|
initializeCachedTypes();
|
|
|
|
getOperation().walk(
|
|
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
|
|
|
|
// GPU kernel modules are no longer necessary since we have a global
|
|
// constant with the CUBIN, or HSACO data.
|
|
for (auto m :
|
|
llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
|
|
m.erase();
|
|
}
|
|
|
|
private:
|
|
LLVM::LLVMDialect *llvmDialect;
|
|
LLVM::LLVMType llvmVoidType;
|
|
LLVM::LLVMType llvmPointerType;
|
|
LLVM::LLVMType llvmPointerPointerType;
|
|
LLVM::LLVMType llvmInt8Type;
|
|
LLVM::LLVMType llvmInt32Type;
|
|
LLVM::LLVMType llvmInt64Type;
|
|
LLVM::LLVMType llvmIntPtrType;
|
|
};
|
|
|
|
} // anonymous namespace
|
|
|
|
// Adds declarations for the needed helper functions from the runtime wrappers.
|
|
// The types in comments give the actual types expected/returned but the API
|
|
// uses void pointers. This is fine as they have the same linkage in C.
|
|
void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
|
|
Location loc) {
|
|
ModuleOp module = getOperation();
|
|
OpBuilder builder(module.getBody()->getTerminator());
|
|
if (!module.lookupSymbol(kGpuModuleLoadName)) {
|
|
builder.create<LLVM::LLVMFuncOp>(
|
|
loc, kGpuModuleLoadName,
|
|
LLVM::LLVMType::getFunctionTy(getPointerType(),
|
|
{getPointerType()}, /* void *cubin */
|
|
/*isVarArg=*/false));
|
|
}
|
|
if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {
|
|
// The helper uses void* instead of CUDA's opaque CUmodule and
|
|
// CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.
|
|
builder.create<LLVM::LLVMFuncOp>(
|
|
loc, kGpuModuleGetFunctionName,
|
|
LLVM::LLVMType::getFunctionTy(getPointerType(),
|
|
{
|
|
getPointerType(), /* void *module */
|
|
getPointerType() /* char *name */
|
|
},
|
|
/*isVarArg=*/false));
|
|
}
|
|
if (!module.lookupSymbol(kGpuLaunchKernelName)) {
|
|
// Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to
|
|
// match the LLVM type if MLIR's index type, which the GPU dialect uses.
|
|
// Furthermore, they use void* instead of CUDA's opaque CUfunction and
|
|
// CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
|
|
builder.create<LLVM::LLVMFuncOp>(
|
|
loc, kGpuLaunchKernelName,
|
|
LLVM::LLVMType::getFunctionTy(
|
|
getVoidType(),
|
|
{
|
|
getPointerType(), /* void* f */
|
|
getIntPtrType(), /* intptr_t gridXDim */
|
|
getIntPtrType(), /* intptr_t gridyDim */
|
|
getIntPtrType(), /* intptr_t gridZDim */
|
|
getIntPtrType(), /* intptr_t blockXDim */
|
|
getIntPtrType(), /* intptr_t blockYDim */
|
|
getIntPtrType(), /* intptr_t blockZDim */
|
|
getInt32Type(), /* unsigned int sharedMemBytes */
|
|
getPointerType(), /* void *hstream */
|
|
getPointerPointerType(), /* void **kernelParams */
|
|
getPointerPointerType() /* void **extra */
|
|
},
|
|
/*isVarArg=*/false));
|
|
}
|
|
if (!module.lookupSymbol(kGpuStreamCreateName)) {
|
|
// Helper function to get the current GPU compute stream. Uses void*
|
|
// instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
|
|
builder.create<LLVM::LLVMFuncOp>(
|
|
loc, kGpuStreamCreateName,
|
|
LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
|
|
}
|
|
if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
|
|
builder.create<LLVM::LLVMFuncOp>(
|
|
loc, kGpuStreamSynchronizeName,
|
|
LLVM::LLVMType::getFunctionTy(getVoidType(),
|
|
{getPointerType()}, /* void *stream */
|
|
/*isVarArg=*/false));
|
|
}
|
|
if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
|
|
builder.create<LLVM::LLVMFuncOp>(
|
|
loc, kGpuMemHostRegisterName,
|
|
LLVM::LLVMType::getFunctionTy(getVoidType(),
|
|
{
|
|
getPointerType(), /* void *ptr */
|
|
getInt64Type() /* int64 sizeBytes*/
|
|
},
|
|
/*isVarArg=*/false));
|
|
}
|
|
}
|
|
|
|
/// Emits the IR with the following structure:
|
|
///
|
|
/// %data = llvm.alloca 1 x type-of(<param>)
|
|
/// llvm.store <param>, %data
|
|
/// %typeErased = llvm.bitcast %data to !llvm<"i8*">
|
|
/// %addr = llvm.getelementptr <list>[<pos>]
|
|
/// llvm.store %typeErased, %addr
|
|
///
|
|
/// This is necessary to construct the list of arguments passed to the kernel
|
|
/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
|
|
/// of stack-allocated type-erased pointers to the actual arguments.
|
|
void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
|
|
Location loc,
|
|
Value param, Value list,
|
|
unsigned pos,
|
|
Value one) {
|
|
auto memLocation = builder.create<LLVM::AllocaOp>(
|
|
loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
|
|
/*alignment=*/1);
|
|
builder.create<LLVM::StoreOp>(loc, param, memLocation);
|
|
auto casted =
|
|
builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);
|
|
|
|
auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
|
|
builder.getI32IntegerAttr(pos));
|
|
auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), list,
|
|
ArrayRef<Value>{index});
|
|
builder.create<LLVM::StoreOp>(loc, casted, gep);
|
|
}
|
|
|
|
// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
|
|
// launch call. The arguments are extracted from the launchOp.
|
|
// The generated code is essentially as follows:
|
|
//
|
|
// %array = alloca(numparams * sizeof(void *))
|
|
// for (i : [0, NumKernelOperands))
|
|
// %array[i] = cast<void*>(KernelOperand[i])
|
|
// return %array
|
|
Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
|
|
gpu::LaunchFuncOp launchOp, OpBuilder &builder) {
|
|
|
|
// Get the launch target.
|
|
auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
|
|
launchOp, launchOp.kernel());
|
|
if (!gpuFunc)
|
|
return {};
|
|
|
|
unsigned numArgs = gpuFunc.getNumArguments();
|
|
|
|
auto numKernelOperands = launchOp.getNumKernelOperands();
|
|
Location loc = launchOp.getLoc();
|
|
auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
|
|
builder.getI32IntegerAttr(1));
|
|
auto arraySize = builder.create<LLVM::ConstantOp>(
|
|
loc, getInt32Type(), builder.getI32IntegerAttr(numArgs));
|
|
auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(),
|
|
arraySize, /*alignment=*/0);
|
|
|
|
unsigned pos = 0;
|
|
for (unsigned idx = 0; idx < numKernelOperands; ++idx) {
|
|
auto operand = launchOp.getKernelOperand(idx);
|
|
auto llvmType = operand.getType().cast<LLVM::LLVMType>();
|
|
|
|
// Assume all struct arguments come from MemRef. If this assumption does not
|
|
// hold anymore then we `launchOp` to lower from MemRefType and not after
|
|
// LLVMConversion has taken place and the MemRef information is lost.
|
|
if (!llvmType.isStructTy()) {
|
|
addParamToList(builder, loc, operand, array, pos++, one);
|
|
continue;
|
|
}
|
|
|
|
// Put individual components of a memref descriptor into the flat argument
|
|
// list. We cannot use unpackMemref from LLVM lowering here because we have
|
|
// no access to MemRefType that had been lowered away.
|
|
for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) {
|
|
auto elemType = llvmType.getStructElementType(j);
|
|
if (elemType.isArrayTy()) {
|
|
for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) {
|
|
Value elem = builder.create<LLVM::ExtractValueOp>(
|
|
loc, elemType.getArrayElementType(), operand,
|
|
builder.getI32ArrayAttr({j, k}));
|
|
addParamToList(builder, loc, elem, array, pos++, one);
|
|
}
|
|
} else {
|
|
assert((elemType.isIntegerTy() || elemType.isFloatTy() ||
|
|
elemType.isDoubleTy() || elemType.isPointerTy()) &&
|
|
"expected scalar type");
|
|
Value strct = builder.create<LLVM::ExtractValueOp>(
|
|
loc, elemType, operand, builder.getI32ArrayAttr(j));
|
|
addParamToList(builder, loc, strct, array, pos++, one);
|
|
}
|
|
}
|
|
}
|
|
|
|
return array;
|
|
}
|
|
|
|
// Generates an LLVM IR dialect global that contains the name of the given
|
|
// kernel function as a C string, and returns a pointer to its beginning.
|
|
// The code is essentially:
|
|
//
|
|
// llvm.global constant @kernel_name("function_name\00")
|
|
// func(...) {
|
|
// %0 = llvm.addressof @kernel_name
|
|
// %1 = llvm.constant (0 : index)
|
|
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
|
|
// }
|
|
Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
|
|
StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
|
|
// Make sure the trailing zero is included in the constant.
|
|
std::vector<char> kernelName(name.begin(), name.end());
|
|
kernelName.push_back('\0');
|
|
|
|
std::string globalName =
|
|
std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
|
|
return LLVM::createGlobalString(
|
|
loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
|
|
LLVM::Linkage::Internal, llvmDialect);
|
|
}
|
|
|
|
// Emits LLVM IR to launch a kernel function. Expects the module that contains
|
|
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
|
|
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
|
|
//
|
|
// %0 = call %binarygetter
|
|
// %1 = call %moduleLoad(%0)
|
|
// %2 = <see generateKernelNameConstant>
|
|
// %3 = call %moduleGetFunction(%1, %2)
|
|
// %4 = call %streamCreate()
|
|
// %5 = <see setupParamsArray>
|
|
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
|
|
// call %streamSynchronize(%4)
|
|
void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
|
|
mlir::gpu::LaunchFuncOp launchOp) {
|
|
OpBuilder builder(launchOp);
|
|
Location loc = launchOp.getLoc();
|
|
declareGpuRuntimeFunctions(loc);
|
|
|
|
auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
|
|
builder.getI32IntegerAttr(0));
|
|
// Create an LLVM global with CUBIN extracted from the kernel annotation and
|
|
// obtain a pointer to the first byte in it.
|
|
auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>(
|
|
launchOp.getKernelModuleName());
|
|
assert(kernelModule && "expected a kernel module");
|
|
|
|
auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
|
|
if (!binaryAttr) {
|
|
kernelModule.emitOpError()
|
|
<< "missing " << gpuBinaryAnnotation << " attribute";
|
|
return signalPassFailure();
|
|
}
|
|
|
|
SmallString<128> nameBuffer(kernelModule.getName());
|
|
nameBuffer.append(kGpuBinaryStorageSuffix);
|
|
Value data = LLVM::createGlobalString(
|
|
loc, builder, nameBuffer.str(), binaryAttr.getValue(),
|
|
LLVM::Linkage::Internal, getLLVMDialect());
|
|
|
|
// Emit the load module call to load the module data. Error checking is done
|
|
// in the called helper function.
|
|
auto gpuModuleLoad =
|
|
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
|
|
auto module = builder.create<LLVM::CallOp>(
|
|
loc, ArrayRef<Type>{getPointerType()},
|
|
builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef<Value>{data});
|
|
// Get the function from the module. The name corresponds to the name of
|
|
// the kernel function.
|
|
auto kernelName = generateKernelNameConstant(
|
|
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
|
|
auto gpuModuleGetFunction =
|
|
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);
|
|
auto function = builder.create<LLVM::CallOp>(
|
|
loc, ArrayRef<Type>{getPointerType()},
|
|
builder.getSymbolRefAttr(gpuModuleGetFunction),
|
|
ArrayRef<Value>{module.getResult(0), kernelName});
|
|
// Grab the global stream needed for execution.
|
|
auto gpuStreamCreate =
|
|
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamCreateName);
|
|
auto stream = builder.create<LLVM::CallOp>(
|
|
loc, ArrayRef<Type>{getPointerType()},
|
|
builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef<Value>{});
|
|
// Invoke the function with required arguments.
|
|
auto gpuLaunchKernel =
|
|
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);
|
|
auto paramsArray = setupParamsArray(launchOp, builder);
|
|
if (!paramsArray) {
|
|
launchOp.emitOpError() << "cannot pass given parameters to the kernel";
|
|
return signalPassFailure();
|
|
}
|
|
auto nullpointer =
|
|
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
|
|
builder.create<LLVM::CallOp>(
|
|
loc, ArrayRef<Type>{getVoidType()},
|
|
builder.getSymbolRefAttr(gpuLaunchKernel),
|
|
ArrayRef<Value>{function.getResult(0), launchOp.getOperand(0),
|
|
launchOp.getOperand(1), launchOp.getOperand(2),
|
|
launchOp.getOperand(3), launchOp.getOperand(4),
|
|
launchOp.getOperand(5), zero, /* sharedMemBytes */
|
|
stream.getResult(0), /* stream */
|
|
paramsArray, /* kernel params */
|
|
nullpointer /* extra */});
|
|
// Sync on the stream to make it synchronous.
|
|
auto gpuStreamSync =
|
|
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);
|
|
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getVoidType()},
|
|
builder.getSymbolRefAttr(gpuStreamSync),
|
|
ArrayRef<Value>(stream.getResult(0)));
|
|
launchOp.erase();
|
|
}
|
|
|
|
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
|
|
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
|
|
StringRef gpuBinaryAnnotation) {
|
|
if (gpuBinaryAnnotation.empty())
|
|
return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
|
|
return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
|
|
gpuBinaryAnnotation);
|
|
}
|