[mlir][AMDGPU] Define atomic compare-and-swap for raw buffers
This commit adds the buffer cmpswap intrinsic to the ROCDL dialect and its corresponding AMDGPU dialect wrappers.

Reviewed By: nirvedhmeshram

Differential Revision: https://reviews.llvm.org/D148722
@@ -136,6 +136,48 @@ def AMDGPU_RawBufferStoreOp :
   let hasVerifier = 1;
 }
 
+// Raw buffer atomic compare-and-swap
+def AMDGPU_RawBufferAtomicCmpswapOp :
+    AMDGPU_Op<"raw_buffer_atomic_cmpswap", [
+      AttrSizedOperandSegments,
+      AllTypesMatch<["src", "cmp", "value"]>,
+      AllElementTypesMatch<["value", "memref"]>]>,
+    Arguments<(ins AnyTypeOf<[I32, I64, F32, F64]>:$src,
+                   AnyType:$cmp,
+                   Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)>,
+    Results<(outs AnyType:$value)> {
+
+  let summary = "Raw Buffer Atomic compare-and-swap";
+  let description = [{
+    The `amdgpu.raw_buffer_atomic_cmpswap` op is a wrapper around the
+    buffer-based atomic compare-and-swap instruction available on AMD GPUs.
+
+    The index into the buffer is computed as for `memref.store`, with the addition
+    of `indexOffset` (which is used to aid in emitting vectorized code) and,
+    if present, `sgprOffset` (which is added after bounds checks and includes
+    any non-zero offset on the memref type).
+
+    All indexing components are given in terms of the memref's element size, not
+    the byte lengths required by the intrinsic.
+
+    Out-of-bounds atomic operations are ignored in hardware.
+
+    See `amdgpu.raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $src `,` $cmp `->` $memref `[` $indices `]`
+    (`sgprOffset` $sgprOffset^)? `:`
+    type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasCanonicalizer = 1;
+  let hasVerifier = 1;
+}
+
 // Raw buffer atomic floating point add
 def AMDGPU_RawBufferAtomicFaddOp :
     AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
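For reference, the assembly format declared above corresponds to IR like the following (a minimal sketch mirroring the conversion tests later in this diff; the i32 payload and 64-element buffer are illustrative):

  %old = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i32 -> memref<64xi32>, i32

Here %old receives the buffer contents from before the operation, so a caller can compare it against %cmp to tell whether the swap took effect.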
@@ -252,6 +252,25 @@ def ROCDL_RawBufferStoreOp :
   let hasCustomAssemblyFormat = 1;
 }
 
+def ROCDL_RawBufferAtomicCmpSwap :
+    ROCDL_Op<"raw.buffer.atomic.cmpswap", [AllTypesMatch<["res", "src", "cmp"]>]>,
+    Results<(outs LLVM_Type:$res)>,
+    Arguments<(ins LLVM_Type:$src,
+                   LLVM_Type:$cmp,
+                   LLVM_Type:$rsrc,
+                   I32:$offset,
+                   I32:$soffset,
+                   I32:$aux)> {
+  string llvmBuilder = [{
+    $res = createIntrinsicCall(builder,
+        llvm::Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, {$src, $cmp, $rsrc,
+        $offset, $soffset, $aux}, {$_resultType});
+  }];
+  let assemblyFormat = [{
+    attr-dict `(` operands `)` `:` type($res) `,` type($rsrc)
+  }];
+}
+
 //===---------------------------------------------------------------------===//
 // MI-100 and MI-200 buffer atomic floating point add intrinsic
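Per the llvmBuilder above, each instance of this op translates to a call to the llvm.amdgcn.raw.buffer.atomic.cmpswap intrinsic, overloaded on the result type. A minimal sketch of the round trip (operand names are illustrative; the LLVM IR form matches the translation test at the end of this diff):

  %old = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
  // becomes, after translation to LLVM IR:
  // %old = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %src, i32 %cmp, <4 x i32> %rsrc, i32 %offset, i32 %soffset, i32 %aux)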
@@ -62,6 +62,14 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
     else
       wantedDataType = gpuOp.getODSResults(0)[0].getType();
 
+    Value atomicCmpData = Value();
+    // Operand index 1 of a load is the indices; trying to read them can crash.
+    if (storeData) {
+      Value maybeCmpData = adaptor.getODSOperands(1)[0];
+      if (maybeCmpData != memref)
+        atomicCmpData = maybeCmpData;
+    }
+
     Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType);
 
     Type i32 = rewriter.getI32Type();
@@ -73,8 +81,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
     // If we want to load a vector<NxT> with total size <= 32
     // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32
     // and the total load size is >= 32, use a vector load of N / (bitsize(T) /
-    // 32) x i32 and bitcast.
+    // 32) x i32 and bitcast. Also, the CAS intrinsic requires integer operands,
+    // so bitcast any floats to integers.
     Type llvmBufferValType = llvmWantedDataType;
+    if (atomicCmpData) {
+      if (wantedDataType.isa<VectorType>())
+        return gpuOp.emitOpError("vector compare-and-swap does not exist");
+      if (auto floatType = wantedDataType.dyn_cast<FloatType>())
+        llvmBufferValType = this->getTypeConverter()->convertType(
+            rewriter.getIntegerType(floatType.getWidth()));
+    }
     if (auto dataVector = wantedDataType.dyn_cast<VectorType>()) {
       uint32_t elemBits = dataVector.getElementTypeBitWidth();
       uint32_t totalBits = elemBits * dataVector.getNumElements();
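Because the CAS intrinsic only accepts integer payloads, a floating-point compare-and-swap lowers to an integer operation wrapped in bitcasts, roughly as follows (a sketch of the lowered sequence for f32, matching the conversion test later in this diff):

  %srcInt = llvm.bitcast %src : f32 to i32
  %cmpInt = llvm.bitcast %cmp : f32 to i32
  %oldInt = rocdl.raw.buffer.atomic.cmpswap(%srcInt, %cmpInt, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
  %old = llvm.bitcast %oldInt : i32 to f32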
@@ -109,6 +125,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
       }
     }
 
+    if (atomicCmpData) {
+      if (llvmBufferValType != llvmWantedDataType) {
+        Value castForCmp = rewriter.create<LLVM::BitcastOp>(
+            loc, llvmBufferValType, atomicCmpData);
+        args.push_back(castForCmp);
+      } else {
+        args.push_back(atomicCmpData);
+      }
+    }
+
     // Construct buffer descriptor from memref, attributes
     int64_t offset = 0;
     SmallVector<int64_t, 5> strides;
@@ -529,6 +555,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
       RawBufferOpLowering<RawBufferAtomicFmaxOp, ROCDL::RawBufferAtomicFMaxOp>,
       RawBufferOpLowering<RawBufferAtomicSmaxOp, ROCDL::RawBufferAtomicSMaxOp>,
       RawBufferOpLowering<RawBufferAtomicUminOp, ROCDL::RawBufferAtomicUMinOp>,
+      RawBufferOpLowering<RawBufferAtomicCmpswapOp,
+                          ROCDL::RawBufferAtomicCmpSwap>,
       MFMAOpLowering>(converter, chipset);
 }
@@ -90,6 +90,10 @@ LogicalResult RawBufferAtomicUminOp::verify() {
   return verifyRawBufferOp(*this);
 }
 
+LogicalResult RawBufferAtomicCmpswapOp::verify() {
+  return verifyRawBufferOp(*this);
+}
+
 static std::optional<uint32_t> getConstantUint32(Value v) {
   APInt cst;
   if (!v.getType().isInteger(32))
@@ -136,12 +140,11 @@ static bool staticallyOutOfBounds(OpType op) {
 }
 
 namespace {
-struct RemoveStaticallyOobBufferLoads final
-    : public OpRewritePattern<RawBufferLoadOp> {
-  using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;
+template <typename OpType>
+struct RemoveStaticallyOobBufferLoads final : public OpRewritePattern<OpType> {
+  using OpRewritePattern<OpType>::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(RawBufferLoadOp op,
-                                PatternRewriter &rw) const override {
+  LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
     if (!staticallyOutOfBounds(op))
       return failure();
     Type loadType = op.getResult().getType();
@@ -167,7 +170,7 @@ struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
 
 void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
-  results.add<RemoveStaticallyOobBufferLoads>(context);
+  results.add<RemoveStaticallyOobBufferLoads<RawBufferLoadOp>>(context);
 }
 
 void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
@@ -195,6 +198,12 @@ void RawBufferAtomicUminOp::getCanonicalizationPatterns(
   results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicUminOp>>(context);
 }
 
+void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
+    RewritePatternSet &results, MLIRContext *context) {
+  results.add<RemoveStaticallyOobBufferLoads<RawBufferAtomicCmpswapOp>>(
+      context);
+}
+
 //===----------------------------------------------------------------------===//
 // MFMAOp
 //===----------------------------------------------------------------------===//
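Since the op returns the pre-swap value like a load, a statically out-of-bounds compare-and-swap reuses the load canonicalization rather than the store one. A sketch of the intended folding, assuming the pattern replaces a provably out-of-bounds op with a zero of its result type (on this hardware, bounds-checked out-of-bounds reads return 0 and the write is ignored):

  // Before: indexOffset pushes the constant index past the 64-element buffer.
  %old = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true, indexOffset = 64 : i32} %src, %cmp -> %buf[%c63] : i32 -> memref<64xi32>, i32
  // After canonicalization (assumed): the op is removed and its result folds to a constant.
  %old = arith.constant 0 : i32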
@@ -197,6 +197,35 @@ func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>,
   func.return
 }
 
+// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_f32
+// CHECK-SAME: (%[[src:.*]]: f32, %[[cmp:.*]]: f32, {{.*}})
+func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 {
+  // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
+  // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
+  // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+  // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+  // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+  // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
+  // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
+  // CHECK: return %[[dstCast]]
+  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
+  func.return %dst : f32
+}
+
+// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64
+// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
+func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
+  // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+  // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+  // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+  // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i64, vector<4xi32>
+  // CHECK: return %[[dst]]
+  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
+  func.return %dst : i64
+}
+
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
   // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
@@ -74,6 +74,13 @@ func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x
   func.return
 }
 
+// CHECK-LABEL: func @raw_buffer_atomic_cmpswap_f32
+func.func @raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) {
+  // CHECK: amdgpu.raw_buffer_atomic_cmpswap {indexOffset = 1 : i32} %{{.*}}, %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
+  amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true, indexOffset = 1 : i32} %src, %cmp -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
+  func.return
+}
+
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
   // CHECK: amdgpu.lds_barrier
@@ -262,9 +262,11 @@ llvm.func @rocdl.raw.buffer.i32(%rsrc : vector<4xi32>,
 // CHECK-LABEL: rocdl.raw.buffer.i32
 // CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
 // CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
+// CHECK: %{{.*}} = rocdl.raw.buffer.atomic.cmpswap(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
 
   rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
   rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32
+  %val = rocdl.raw.buffer.atomic.cmpswap(%vdata1, %vdata1, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
   llvm.return
 }
@@ -294,6 +294,18 @@ llvm.func @rocdl.raw.buffer.atomic.i32(%rsrc : vector<4xi32>,
   llvm.return
 }
 
+llvm.func @rocdl.raw.buffer.atomic.cmpswap(%rsrc : vector<4xi32>,
+                        %offset : i32, %soffset : i32,
+                        %src : i32, %cmp : i32) -> i32 {
+  %aux = llvm.mlir.constant(0 : i32) : i32
+  // CHECK-LABEL: rocdl.raw.buffer.atomic.cmpswap
+  // CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+  // CHECK: ret i32 [[val]]
+
+  %val = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
+  llvm.return %val : i32
+}
+
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"