[mlir][AMDGPU] Define atomic compare-and-swap for raw buffers

This commit adds the buffer cmpswap intrinsic to the ROCDL dialect and
its corresponding AMDGPU dialect wrappers.
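
For reference, the new wrapper reads as in the lowering tests further down; a minimal sketch (the function name is illustrative) that swaps %src into %buf[%idx] if the current value equals %cmp and returns the previous value:

    func.func @cas(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx : i32) -> f32 {
      %old = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
      func.return %old : f32
    }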

Reviewed By: nirvedhmeshram

Differential Revision: https://reviews.llvm.org/D148722
Author: Krzysztof Drewniak
Date: 2023-04-17 16:55:56 +00:00
Parent: 6fbf022908
Commit: 98c1104d41
8 changed files with 155 additions and 7 deletions


@@ -136,6 +136,48 @@ def AMDGPU_RawBufferStoreOp :
let hasVerifier = 1;
}
// Raw buffer atomic compare-and-swap
def AMDGPU_RawBufferAtomicCmpswapOp :
AMDGPU_Op<"raw_buffer_atomic_cmpswap", [
AttrSizedOperandSegments,
AllTypesMatch<["src", "cmp", "value"]>,
AllElementTypesMatch<["value", "memref"]>]>,
Arguments<(ins AnyTypeOf<[I32, I64, F32, F64]>:$src,
AnyType:$cmp,
Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
Variadic<I32>:$indices,
DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
OptionalAttr<I32Attr>:$indexOffset,
Optional<I32>:$sgprOffset)>,
Results<(outs AnyType:$value)> {
let summary = "Raw Buffer Atomic compare-and-swap";
let description = [{
The `amdgpu.raw_buffer_atomic_cmpswap` op is a wrapper around the
buffer-based atomic compare-and-swap available on AMD GPUs.

The index into the buffer is computed as for `memref.store`, with the
addition of `indexOffset` (which is used to aid in emitting vectorized code)
and, if present, `sgprOffset` (which is added after bounds checks and
includes any non-zero offset on the memref type).

All indexing components are given in terms of the memref's element size, not
the byte lengths required by the intrinsic.

Out-of-bounds atomic operations are ignored in hardware.

See `amdgpu.raw_buffer_load` for a description of how the underlying
instruction is constructed.
}];
let assemblyFormat = [{
attr-dict $src `,` $cmp `->` $memref `[` $indices `]`
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
let hasCanonicalizer = 1;
let hasVerifier = 1;
}
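
Concretely, an access using both optional indexing components looks like the following sketch (adapted from the ops test below): the element index is %idx0..%idx3 plus indexOffset = 1, with %offset added from an SGPR after the bounds check.

    %old = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true, indexOffset = 1 : i32} %src, %cmp -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32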
// Raw buffer atomic floating point add
def AMDGPU_RawBufferAtomicFaddOp :
AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,


@@ -252,6 +252,25 @@ def ROCDL_RawBufferStoreOp :
let hasCustomAssemblyFormat = 1;
}
def ROCDL_RawBufferAtomicCmpSwap :
ROCDL_Op<"raw.buffer.atomic.cmpswap", [AllTypesMatch<["res", "src", "cmp"]>]>,
Results<(outs LLVM_Type:$res)>,
Arguments<(ins LLVM_Type:$src,
LLVM_Type:$cmp,
LLVM_Type:$rsrc,
I32:$offset,
I32:$soffset,
I32:$aux)> {
string llvmBuilder = [{
$res = createIntrinsicCall(builder,
llvm::Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, {$src, $cmp, $rsrc,
$offset, $soffset, $aux}, {$_resultType});
}];
let assemblyFormat = [{
attr-dict `(` operands `)` `:` type($res) `,` type($rsrc)
}];
}
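
At the MLIR level the new ROCDL op is written as in this sketch (cf. the roundtrip test further down); on translation, the builder above lowers it to a call to the `llvm.amdgcn.raw.buffer.atomic.cmpswap` intrinsic overloaded on the result type:

    %old = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>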
//===---------------------------------------------------------------------===//
// MI-100 and MI-200 buffer atomic floating point add intrinsic


@@ -62,6 +62,14 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
else
wantedDataType = gpuOp.getODSResults(0)[0].getType();
Value atomicCmpData = Value();
// On a load, operand index 1 is the indices; trying to read it as compare
// data can crash, so only probe it when the op carries store data.
if (storeData) {
Value maybeCmpData = adaptor.getODSOperands(1)[0];
if (maybeCmpData != memref)
atomicCmpData = maybeCmpData;
}
Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType);
Type i32 = rewriter.getI32Type();
@@ -73,8 +81,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
// If we want to load a vector<NxT> with total size <= 32
// bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32
// and the total load size is >= 32, use a vector load of
// N / (32 / bitsize(T)) x i32 and bitcast. Also, the CAS intrinsic requires
// integer operands, so bitcast any floats to integers.
Type llvmBufferValType = llvmWantedDataType;
if (atomicCmpData) {
if (wantedDataType.isa<VectorType>())
return gpuOp.emitOpError("vector compare-and-swap does not exist");
if (auto floatType = wantedDataType.dyn_cast<FloatType>())
llvmBufferValType = this->getTypeConverter()->convertType(
rewriter.getIntegerType(floatType.getWidth()));
}
if (auto dataVector = wantedDataType.dyn_cast<VectorType>()) {
uint32_t elemBits = dataVector.getElementTypeBitWidth();
uint32_t totalBits = elemBits * dataVector.getNumElements();
@@ -109,6 +125,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
}
if (atomicCmpData) {
if (llvmBufferValType != llvmWantedDataType) {
Value castForCmp = rewriter.create<LLVM::BitcastOp>(
loc, llvmBufferValType, atomicCmpData);
args.push_back(castForCmp);
} else {
args.push_back(atomicCmpData);
}
}
// Construct buffer descriptor from memref, attributes
int64_t offset = 0;
SmallVector<int64_t, 5> strides;
@@ -529,6 +555,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
RawBufferOpLowering<RawBufferAtomicFmaxOp, ROCDL::RawBufferAtomicFMaxOp>,
RawBufferOpLowering<RawBufferAtomicSmaxOp, ROCDL::RawBufferAtomicSMaxOp>,
RawBufferOpLowering<RawBufferAtomicUminOp, ROCDL::RawBufferAtomicUMinOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawBufferAtomicCmpSwap>,
MFMAOpLowering>(converter, chipset);
}
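
Because the CAS intrinsic is integer-only, the f32 case lowers to a bitcast / atomic / bitcast sequence, as the lowering test below checks; schematically:

    %src_i32 = llvm.bitcast %src : f32 to i32
    %cmp_i32 = llvm.bitcast %cmp : f32 to i32
    %old_i32 = rocdl.raw.buffer.atomic.cmpswap(%src_i32, %cmp_i32, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
    %old = llvm.bitcast %old_i32 : i32 to f32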


@@ -90,6 +90,10 @@ LogicalResult RawBufferAtomicUminOp::verify() {
return verifyRawBufferOp(*this);
}
LogicalResult RawBufferAtomicCmpswapOp::verify() {
return verifyRawBufferOp(*this);
}
static std::optional<uint32_t> getConstantUint32(Value v) {
APInt cst;
if (!v.getType().isInteger(32))
@@ -136,12 +140,11 @@ static bool staticallyOutOfBounds(OpType op) {
}
namespace {
template <typename OpType>
struct RemoveStaticallyOobBufferLoads final : public OpRewritePattern<OpType> {
using OpRewritePattern<OpType>::OpRewritePattern;
LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
if (!staticallyOutOfBounds(op))
return failure();
Type loadType = op.getResult().getType();
@@ -167,7 +170,7 @@ struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add<RemoveStaticallyOobBufferLoads<RawBufferLoadOp>>(context);
}
void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
@@ -195,6 +198,12 @@ void RawBufferAtomicUminOp::getCanonicalizationPatterns(
results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicUminOp>>(context);
}
void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
RewritePatternSet &results, MLIRContext *context) {
results.add<RemoveStaticallyOobBufferLoads<RawBufferAtomicCmpswapOp>>(
context);
}
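
As with `amdgpu.raw_buffer_load`, a compare-and-swap whose index is statically provable to be out of bounds can be folded away. A sketch of the intended rewrite, assuming (as the shared load pattern suggests) the op is replaced by a zero constant of its result type, matching what the hardware returns for an out-of-bounds read:

    // Before: element 128 is statically out of bounds for memref<64xf32>.
    %c128 = arith.constant 128 : i32
    %old = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%c128] : f32 -> memref<64xf32>, i32

    // After canonicalization (sketch): the op folds to a zero constant.
    %old = arith.constant 0.0 : f32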
//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//


@@ -197,6 +197,35 @@ func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>,
func.return
}
// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_f32
// CHECK-SAME: (%[[src:.*]]: f32, %[[cmp:.*]]: f32, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 {
// CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
// CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
// CHECK: llvm.insertelement{{.*}}%[[numRecords]]
// CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
// CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
// CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
// CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
// CHECK: return %[[dstCast]]
%dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
func.return %dst : f32
}
// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
// CHECK: llvm.insertelement{{.*}}%[[numRecords]]
// CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
// CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
// CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i64, vector<4xi32>
// CHECK: return %[[dst]]
%dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
func.return %dst : i64
}
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"


@@ -74,6 +74,13 @@ func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x
func.return
}
// CHECK-LABEL: func @raw_buffer_atomic_cmpswap_f32
func.func @raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) {
// CHECK: amdgpu.raw_buffer_atomic_cmpswap {indexOffset = 1 : i32} %{{.*}}, %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true, indexOffset = 1 : i32} %src, %cmp -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
func.return
}
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
// CHECK: amdgpu.lds_barrier


@@ -262,9 +262,11 @@ llvm.func @rocdl.raw.buffer.i32(%rsrc : vector<4xi32>,
// CHECK-LABEL: rocdl.raw.buffer.i32
// CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
// CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
// CHECK: %{{.*}} = rocdl.raw.buffer.atomic.cmpswap(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32
%val = rocdl.raw.buffer.atomic.cmpswap(%vdata1, %vdata1, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
llvm.return
}


@@ -294,6 +294,18 @@ llvm.func @rocdl.raw.buffer.atomic.i32(%rsrc : vector<4xi32>,
llvm.return
}
llvm.func @rocdl.raw.buffer.atomic.cmpswap(%rsrc : vector<4xi32>,
%offset : i32, %soffset : i32,
%src : i32, %cmp : i32) -> i32 {
%aux = llvm.mlir.constant(0 : i32) : i32
// CHECK-LABEL: rocdl.raw.buffer.atomic.cmpswap
// CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
// CHECK: ret i32 [[val]]
%val = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
llvm.return %val : i32
}
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"