[mlir][nvgpu] Add nvgpu.tma.async.load and nvgpu.tma.descriptor
This work adds the `nvgpu.tma.async.load` Op, which requests a TMA load asynchronously using an mbarrier object for completion tracking. It also adds the `nvgpu.tma.descriptor` type; values of this type are expected to be created via the `cuTensorMapEncodeTiled` CUDA driver API.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D155453
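For orientation, a minimal end-to-end sketch of how the new pieces compose (the SSA names, function name, and the 32x32 tile shape are illustrative; the syntax follows the test added in this patch):

    !tmaDesc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
    !mbar = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>

    func.func @tma_load_sketch(%desc: !tmaDesc, %dst: memref<32x32xf32, 3>, %bar: !mbar) {
      %c0 = arith.constant 0 : index
      // Request the asynchronous TMA copy; completion is signaled through the mbarrier.
      nvgpu.tma.async.load %desc[%c0, %c0], %bar to %dst : !tmaDesc, !mbar -> memref<32x32xf32, 3>
      func.return
    }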
@@ -1,2 +1,17 @@
add_mlir_dialect(NVGPU nvgpu)
add_mlir_doc(NVGPU NVGPU Dialects/ -gen-dialect-doc)

set(LLVM_TARGET_DEFINITIONS NVGPU.td)
mlir_tablegen(NVGPUEnums.h.inc -gen-enum-decls)
mlir_tablegen(NVGPUEnums.cpp.inc -gen-enum-defs)
add_public_tablegen_target(MLIRNVGPUEnumsIncGen)

set(LLVM_TARGET_DEFINITIONS NVGPU.td)
mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
add_public_tablegen_target(MLIRNVGPUAttributesIncGen)

set(LLVM_TARGET_DEFINITIONS NVGPU.td)
mlir_tablegen(NVGPUAttrTypes.h.inc -gen-typedef-decls)
mlir_tablegen(NVGPUAttrTypes.cpp.inc -gen-typedef-decls)
add_public_tablegen_target(MLIRNVGPUTypesIncGen)
@@ -23,6 +23,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
include "mlir/IR/EnumAttr.td"

def NVGPU_Dialect : Dialect {
  let name = "nvgpu";
@@ -61,6 +62,58 @@ def NVGPU_Dialect : Dialect {
  }];
}

//===----------------------------------------------------------------------===//
// NVGPU Attribute Definitions
//===----------------------------------------------------------------------===//

def TensorMapSwizzleNone : I32EnumAttrCase<"SWIZZLE_NONE", 0, "none">;
def TensorMapSwizzle32B : I32EnumAttrCase<"SWIZZLE_32B", 1, "swizzle_32b">;
def TensorMapSwizzle64B : I32EnumAttrCase<"SWIZZLE_64B", 2, "swizzle_64b">;
def TensorMapSwizzle128B : I32EnumAttrCase<"SWIZZLE_128B", 3, "swizzle_128b">;
def TensorMapSwizzleKind : I32EnumAttr<"TensorMapSwizzleKind",
    "Tensor map swizzling mode of shared memory banks",
    [ TensorMapSwizzleNone, TensorMapSwizzle32B, TensorMapSwizzle64B,
      TensorMapSwizzle128B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapL2PromoNone : I32EnumAttrCase<"L2PROMO_NONE", 0, "none">;
def TensorMapL2Promo64B : I32EnumAttrCase<"L2PROMO_64B", 1, "l2promo_64b">;
def TensorMapL2Promo128B : I32EnumAttrCase<"L2PROMO_128B", 2, "l2promo_128b">;
def TensorMapL2Promo256B : I32EnumAttrCase<"L2PROMO_256B", 3, "l2promo_256b">;
def TensorMapL2PromoKind : I32EnumAttr<"TensorMapL2PromoKind",
    "Tensor map L2 promotion type",
    [ TensorMapL2PromoNone, TensorMapL2Promo64B, TensorMapL2Promo128B,
      TensorMapL2Promo256B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapOOBZero : I32EnumAttrCase<"OOB_ZERO", 0, "zero">;
def TensorMapOOBNaN : I32EnumAttrCase<"OOB_NAN", 1, "nan">;
def TensorMapOOBKind : I32EnumAttr<"TensorMapOOBKind",
    "Tensor map out-of-bounds fill type",
    [ TensorMapOOBZero, TensorMapOOBNaN]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapInterleaveNone : I32EnumAttrCase<"INTERLEAVE_NONE", 0, "none">;
def TensorMapInterleave16B : I32EnumAttrCase<"INTERLEAVE_16B", 1, "interleave_16b">;
def TensorMapInterleave32B : I32EnumAttrCase<"INTERLEAVE_32B", 2, "interleave_32b">;
def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
    "Tensor map interleave layout type",
    [ TensorMapInterleaveNone, TensorMapInterleave16B, TensorMapInterleave32B]> {
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::nvgpu";
}

def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;

//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
//===----------------------------------------------------------------------===//
@@ -100,6 +153,21 @@ def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> {

def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }

// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
  let summary = "TensorMap descriptor";
  let parameters = (ins "MemRefType":$tensor,
                        EnumParameter<TensorMapSwizzleKind>:$swizzle,
                        EnumParameter<TensorMapL2PromoKind>:$l2promo,
                        EnumParameter<TensorMapOOBKind>:$oob,
                        EnumParameter<TensorMapInterleaveKind>:$interleave);
  let description = [{
    `nvgpu.tma.descriptor` is a type that represents a TMA descriptor. It is
    a 128-byte object that lives either in constant space or in a kernel parameter.
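    Example (illustrative; the shape and attribute values below are not from
    this patch but mirror the syntax exercised by the test it adds):

      !desc = !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>,
                                          swizzle = none, l2promo = none,
                                          oob = zero, interleave = none>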
  }];
  let assemblyFormat = "`<` struct(params) `>`";
}

//===----------------------------------------------------------------------===//
// NVGPU Op Definitions
//===----------------------------------------------------------------------===//
@@ -509,4 +577,27 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
  let assemblyFormat = "$barrier `,` $phase `,` $ticks attr-dict `:` type($barrier)";
}

def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
  let summary = "TMA asynchronous load";
  let description = [{
    The Op asynchronously loads a tile of a memory region from global memory
    into shared memory using Tensor Memory Access (TMA).

    `$tensorMapDescriptor` is the tensor map descriptor that carries information
    about the tile shape. The descriptor is created by
    `nvgpu.tma.create.descriptor`.

    The Op uses the `$barrier` mbarrier-based completion mechanism.
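    Example (illustrative; the SSA names `%descriptor`, `%mbarrier`, `%c0`, and
    `%shmemBuffer` are placeholders, and the types mirror the test added in
    this patch):

      nvgpu.tma.async.load %descriptor[%c0], %mbarrier to %shmemBuffer
          : !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>,
                                        swizzle = none, l2promo = none,
                                        oob = zero, interleave = none>,
            !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
            -> memref<128xf32, 3>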
  }];
  let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
                       NVGPU_MBarrier:$barrier,
                       NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
                       Variadic<Index>:$coordinates);
  let assemblyFormat = [{
    $tensorMapDescriptor `[` $coordinates `]` `,` $barrier `to` $dst
      attr-dict `:` type($tensorMapDescriptor) `,` type($barrier) `->` type($dst)
  }];
  let hasVerifier = 1;
}

#endif // NVGPU
@@ -19,6 +19,11 @@
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"

#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc"

#define GET_TYPEDEF_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc"
@@ -413,6 +413,9 @@ struct ConvertNVGPUToNVVMPass
    converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
      return converter.convertType(createMBarrierMemrefType(rewriter, type));
    });
    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
      return converter.getPointerType(type.getTensor().getElementType());
    });
    populateNVGPUToNVVMConversionPatterns(converter, patterns);
    LLVMConversionTarget target(getContext());
    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
@@ -770,11 +773,7 @@ struct NVGPUMBarrierInitLowering
    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                   op.getBarrier(), adaptor.getBarrier());

    Value count = adaptor.getCount();
    if (!adaptor.getCount().getType().isInteger(32)) {
      count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
                                             rewriter.getI32Type(), count);
    }
    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());

    if (isMbarrierShared(op.getBarrier().getType())) {
      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
@@ -822,11 +821,7 @@ struct NVGPUMBarrierArriveNoCompleteLowering
                                   op.getBarrier(), adaptor.getBarrier());
    Type tokenType = getTypeConverter()->convertType(
        nvgpu::MBarrierTokenType::get(op->getContext()));
    Value count = adaptor.getCount();
    if (!adaptor.getCount().getType().isInteger(32)) {
      count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
                                             rewriter.getI32Type(), count);
    }
    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
    if (isMbarrierShared(op.getBarrier().getType())) {
      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
          op, tokenType, barrier, count);
@@ -910,6 +905,27 @@ struct NVGPUMBarrierTryWaitParityLowering
  }
};

struct NVGPUTmaAsyncLoadOpLowering
    : public ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp> {
  using ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp>::ConvertOpToLLVMPattern;
  LogicalResult
  matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto dest = rewriter.create<LLVM::ExtractValueOp>(op->getLoc(),
                                                      adaptor.getDst(), 1);
    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                   op.getBarrier(), adaptor.getBarrier());

    SmallVector<Value> coords = adaptor.getCoordinates();
    for (auto [index, value] : llvm::enumerate(coords)) {
      coords[index] = truncToI32(rewriter, op->getLoc(), value);
    }

    rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
        op, dest, adaptor.getTensorMapDescriptor(), barrier, coords);
    return success();
  }
};
} // namespace

void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -922,6 +938,7 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
      NVGPUMBarrierTestWaitLowering,       // nvgpu.mbarrier.test_wait_parity
      NVGPUMBarrierTryWaitParityLowering,  // nvgpu.mbarrier.try_wait_parity
      NVGPUMBarrierArriveExpectTxLowering, // nvgpu.mbarrier.arrive.expect_tx
      NVGPUTmaAsyncLoadOpLowering,         // nvgpu.tma.async.load
      MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
      NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
      NVGPUMmaSparseSyncLowering>(converter);
@@ -6,6 +6,9 @@ add_mlir_dialect_library(MLIRNVGPUDialect

  DEPENDS
  MLIRNVGPUIncGen
  MLIRNVGPUEnumsIncGen
  MLIRNVGPUAttributesIncGen
  MLIRNVGPUTypesIncGen

  LINK_LIBS PUBLIC
  MLIRGPUDialect
@@ -14,21 +14,31 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Verifier.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;
using namespace mlir::nvgpu;

#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"

void nvgpu::NVGPUDialect::initialize() {
  addTypes<
#define GET_TYPEDEF_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
      >();
  addAttributes<
#define GET_ATTRDEF_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc"
      >();
  addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
@@ -320,11 +330,39 @@ LogicalResult LdMatrixOp::verify() {
  return success();
}

//===----------------------------------------------------------------------===//
// NVGPU_TmaAsyncLoadOp
//===----------------------------------------------------------------------===//

LogicalResult TmaAsyncLoadOp::verify() {
  // Destination memref
  auto dstMemref = llvm::cast<MemRefType>(getDst().getType());
  if (!NVGPUDialect::hasSharedMemoryAddressSpace(dstMemref)) {
    return emitError()
           << "The operation stores data to shared memory, but "
              "the destination memref does not have a memory space of "
           << NVGPUDialect::kSharedMemoryAddressSpace;
  }
  if (getCoordinates().size() > 5) {
    return emitError() << "Maximum 5 coordinates are supported.";
  }
  if (getCoordinates().size() != size_t(dstMemref.getRank())) {
    return emitError() << "Destination memref rank is "
                       << size_t(dstMemref.getRank()) << " but there are "
                       << getCoordinates().size()
                       << " coordinates. They must match.";
  }
  return success();
}

//===----------------------------------------------------------------------===//
// TableGen'd dialect, type, and op definitions
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"
#define GET_ATTRDEF_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc"

#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc"

#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
@@ -559,7 +559,6 @@ func.func @mbarrier_nocomplete() {
  func.return
}

// -----
!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
@@ -603,4 +602,36 @@ func.func @mbarrier_txcount() {
    nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType

    func.return
  }
}

// -----

// CHECK-LABEL: func @async_tma_load
!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = interleave_16b>
!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = none>
!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
                          %buffer1d: memref<128xf32,3>,
                          %buffer2d: memref<32x32xf32,3>,
                          %buffer3d: memref<2x32x32xf32,3>,
                          %buffer4d: memref<2x2x32x32xf32,3>,
                          %buffer5d: memref<2x2x2x32x32xf32,3>,
                          %mbarrier: !mbarrier) {
  %crd0 = arith.constant 0 : index
  %crd1 = arith.constant 0 : index
  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}]
  nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3>
  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}]
  nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
  nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
  nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
  nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3>
  func.return
}