[mlir][sparse][gpu] unify dnmat and dnvec handle and ops
Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D152465
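In short, the previously separate `gpu.create_dn_vec`/`gpu.create_dn_mat` ops (and their destroy counterparts) are folded into one rank-polymorphic `gpu.create_dn_tensor`/`gpu.destroy_dn_tensor` pair that takes a variadic list of dimension sizes. A minimal before/after sketch assembled from the hunks below; the value names (%env, %mem, %size, %rows, %cols, %dep) are placeholders, not taken from a real test:

```mlir
// Before: separate ops for dense vectors and dense matrices.
%dvec, %t1 = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref<?xf64>
%dmat, %t2 = gpu.create_dn_mat async [%t1] %env, %rows, %cols, %mem : memref<?xf64>

// After: one op; the variadic dims list selects vector vs. matrix.
%dvec, %t1 = gpu.create_dn_tensor async [%dep] %env, %mem, %size : index into memref<?xf64>
%dmat, %t2 = gpu.create_dn_tensor async [%t1] %env, %mem, %rows, %cols : index, index into memref<?xf64>
```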
@@ -116,17 +116,11 @@ def GPU_SparseEnvHandle :
     "sparse environment handle type">,
   BuildableType<"mlir::gpu::SparseEnvHandleType::get($_builder.getContext())">;
 
-def GPU_SparseDnVecHandle :
+def GPU_SparseDnTensorHandle :
   DialectType<GPU_Dialect,
-    CPred<"llvm::isa<::mlir::gpu::SparseDnVecHandleType>($_self)">,
-    "dense vector handle type">,
-  BuildableType<"mlir::gpu::SparseDnVecHandleType::get($_builder.getContext())">;
-
-def GPU_SparseDnMatHandle :
-  DialectType<GPU_Dialect,
-    CPred<"llvm::isa<::mlir::gpu::SparseDnMatHandleType>($_self)">,
-    "dense matrix handle type">,
-  BuildableType<"mlir::gpu::SparseDnMatHandleType::get($_builder.getContext())">;
+    CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">,
+    "dense tensor handle type">,
+  BuildableType<"mlir::gpu::SparseDnTensorHandleType::get($_builder.getContext())">;
 
 def GPU_SparseSpMatHandle :
   DialectType<GPU_Dialect,

@@ -165,7 +165,7 @@ public:
 void addAsyncDependency(Operation *op, Value token);
 
 // Handle types for sparse.
-enum class SparseHandleKind { Env, DnVec, DnMat, SpMat };
+enum class SparseHandleKind { Env, SpMat, DnTensor };
 
 template <SparseHandleKind K>
 class SparseHandleType

@@ -177,8 +177,7 @@ public:
 };
 
 using SparseEnvHandleType = SparseHandleType<SparseHandleKind::Env>;
-using SparseDnVecHandleType = SparseHandleType<SparseHandleKind::DnVec>;
-using SparseDnMatHandleType = SparseHandleType<SparseHandleKind::DnMat>;
+using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
 using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
 
 } // namespace gpu
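With the unified kind, a single handle type now stands in for both dense vectors and dense matrices on the IR level. A small illustrative declaration, assuming the types print with the keywords introduced further down (`sparse.dntensor_handle`, `sparse.spmat_handle`, `sparse.env_handle`); the function name is made up:

```mlir
// A hypothetical external consumer taking the three sparse handle types.
func.func private @consume(!gpu.sparse.dntensor_handle,
                           !gpu.sparse.spmat_handle,
                           !gpu.sparse.env_handle)
```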
@@ -1597,73 +1597,13 @@ def GPU_DestroySparseEnvOp : GPU_Op<
   }];
 }
 
-def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
-  let summary = "Create dense vector operation";
+def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
+  let summary = "Create dense tensor operation";
   let description = [{
-    The `gpu.create_dn_vec` operation initializes a dense vector from
-    the given values buffer and size. The buffer must already be copied
-    from the host to the device prior to using this operation. The
-    operation returns a handle to the dense vector descriptor.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %dvec, %token = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref<?xf64>
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   GPU_SparseEnvHandle:$env,
-                   AnyMemRef:$memref,
-                   Index:$size);
-  let results = (outs Res<GPU_SparseDnVecHandle>:$dvec,
-                      Optional<GPU_AsyncToken>:$asyncToken);
-
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $memref `,` $size attr-dict `:` type($memref)
-  }];
-}
-
-def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
-  let summary = "Destroy dense vector operation";
-  let description = [{
-    The `gpu.destroy_dn_vec` operation releases all resources of a dense
-    vector represented by a handle that was previously created by a
-    `gpu.create_dn_vec` operation.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %token = gpu.destroy_dn_vec async [%dep] %dvec
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   Arg<GPU_SparseDnVecHandle>:$dvec);
-  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
-
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $dvec attr-dict
-  }];
-}
-
-def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> {
-  let summary = "Create dense matrix operation";
-  let description = [{
-    The `gpu.create_dn_mat` operation initializes a dense matrix from
+    The `gpu.create_dn_tensor` operation initializes a dense tensor from
     the given values buffer and sizes. The buffer must already be copied
     from the host to the device prior to using this operation. The
-    operation returns a handle to the dense matrix descriptor.
+    operation returns a handle to the dense tensor descriptor.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In

@@ -1672,29 +1612,28 @@ def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> {
     Example:
 
     ```mlir
-    %dmat, %token = gpu.create_dn_mat async [%dep] %env, %rows, %cols, %mem : memref<?xf64>
+    %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref<?xf64>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    GPU_SparseEnvHandle:$env,
-                   Index:$rows,
-                   Index:$cols,
-                   AnyMemRef:$memref);
-  let results = (outs Res<GPU_SparseDnMatHandle>:$dmat, Optional<GPU_AsyncToken>:$asyncToken);
+                   AnyMemRef:$memref,
+                   Variadic<Index>:$dims);
+  let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+    $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
   }];
 }
 
-def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
-  let summary = "Destroy dense matrix operation";
+def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy dense tensor operation";
   let description = [{
-    The `gpu.destroy_dn_mat` operation releases all resources of a dense
-    matrix represented by a handle that was previously created by a
-    `gpu.create_dn_mat` operation.
+    The `gpu.destroy_dn_tensor` operation releases all resources of a dense
+    tensor represented by a handle that was previously created by a
+    `gpu.create_dn_tensor` operation.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In

@@ -1703,17 +1642,17 @@ def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
     Example:
 
     ```mlir
-    %token = gpu.destroy_dn_vec async [%dep] %dmat
+    %token = gpu.destroy_dn_tensor async [%dep] %dnTensor
    ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   Arg<GPU_SparseDnMatHandle>:$dmat);
+                   Arg<GPU_SparseDnTensorHandle>:$dnTensor);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $dmat attr-dict
+    $dnTensor attr-dict
   }];
 }
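Putting the new op pair together: a hedged sketch of the full lifecycle of a 1-D and a 2-D dense descriptor with the async token threaded through, following the definitions above (buffers, sizes, and token names are placeholders):

```mlir
// 1-D descriptor (what used to be gpu.create_dn_vec / gpu.destroy_dn_vec).
%dnX, %t1 = gpu.create_dn_tensor async [%t0] %env, %vec, %n : index into memref<?xf64>
// 2-D descriptor (what used to be gpu.create_dn_mat / gpu.destroy_dn_mat).
%dnB, %t2 = gpu.create_dn_tensor async [%t1] %env, %mat, %rows, %cols : index, index into memref<?xf64>
// ... use the handles ...
%t3 = gpu.destroy_dn_tensor async [%t2] %dnX
%t4 = gpu.destroy_dn_tensor async [%t3] %dnB
```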
@@ -1945,8 +1884,8 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnVecHandle:$dnX,
-                   GPU_SparseDnVecHandle:$dnY,
+                   GPU_SparseDnTensorHandle:$dnX,
+                   GPU_SparseDnTensorHandle:$dnY,
                    TypeAttr:$computeType);
   let results = (outs Res<Index>:$bufferSz,
                       Optional<GPU_AsyncToken>:$asyncToken);

@@ -1998,8 +1937,8 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnVecHandle:$dnX,
-                   GPU_SparseDnVecHandle:$dnY,
+                   GPU_SparseDnTensorHandle:$dnX,
+                   GPU_SparseDnTensorHandle:$dnY,
                    TypeAttr:$computeType,
                    AnyMemRef:$buffer);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

@@ -2052,8 +1991,8 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
-                   GPU_SparseDnMatHandle:$dnmatC,
+                   GPU_SparseDnTensorHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatC,
                    TypeAttr:$computeType);
   let results = (outs Res<AnyTypeOf<[Index, TupleOf<[Index, Index,
                                                      Index]>]>>:$bufferSzs,

@@ -2108,8 +2047,8 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
-                   GPU_SparseDnMatHandle:$dnmatC,
+                   GPU_SparseDnTensorHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatC,
                    TypeAttr:$computeType,
                    Variadic<AnyMemRef>:$buffers);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

@@ -2162,8 +2101,8 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
-                   GPU_SparseDnMatHandle:$dnmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatA,
+                   GPU_SparseDnTensorHandle:$dnmatB,
                    GPU_SparseSpMatHandle:$spmatC,
                    TypeAttr:$computeType);
   let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);

@@ -2216,8 +2155,8 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
-                   GPU_SparseDnMatHandle:$dnmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatA,
+                   GPU_SparseDnTensorHandle:$dnmatB,
                    GPU_SparseSpMatHandle:$spmatC,
                    TypeAttr:$computeType,
                    AnyMemRef:$buffer);
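Since spmv, spmm, and sddmm now accept the unified handle type for their dense operands, a complete SpMV sequence reads as follows. This is a hedged sketch condensed from the test updates further below; value names are placeholders:

```mlir
%dnX, %t1 = gpu.create_dn_tensor async [%t0] %env, %x, %n : index into memref<?xf64>
%dnY, %t2 = gpu.create_dn_tensor async [%t1] %env, %y, %n : index into memref<?xf64>
%bufSz, %t3 = gpu.spmv_buffer_size async [%t2] %env, %spmat, %dnX, %dnY into f64
%t4 = gpu.spmv async [%t3] %env, %spmat, %dnX, %dnY, %buf : memref<?xf64> into f64
%t5 = gpu.destroy_dn_tensor async [%t4] %dnX
%t6 = gpu.destroy_dn_tensor async [%t5] %dnY
```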
@@ -548,51 +548,31 @@ private:
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-class ConvertCreateDnVecOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
+class ConvertCreateDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp> {
 public:
-  ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp>(typeConverter) {}
+  ConvertCreateDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp>(typeConverter) {
+  }
 
 private:
   LogicalResult
-  matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor,
+  matchAndRewrite(gpu::CreateDnTensorOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-class ConvertDestroyDnVecOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp> {
+class ConvertDestroyDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp> {
 public:
-  ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp>(typeConverter) {}
+  ConvertDestroyDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp>(
+            typeConverter) {}
 
 private:
   LogicalResult
-  matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override;
-};
-
-class ConvertCreateDnMatOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp> {
-public:
-  ConvertCreateDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp>(typeConverter) {}
-
-private:
-  LogicalResult
-  matchAndRewrite(gpu::CreateDnMatOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override;
-};
-
-class ConvertDestroyDnMatOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp> {
-public:
-  ConvertDestroyDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp>(typeConverter) {}
-
-private:
-  LogicalResult
-  matchAndRewrite(gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+  matchAndRewrite(gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
@@ -1474,102 +1454,90 @@ LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
-LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::CreateDnVecOp op, OpAdaptor adaptor,
+LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateDnTensorOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
   auto stream = adaptor.getAsyncDependencies().front();
-  Value pVec =
+  Value pTensor =
       MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
-    pVec = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVec);
+    pTensor = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pTensor);
   Type dType = op.getMemref().getType().getElementType();
   auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
-  auto handle =
-      createDnVecCallBuilder
-          .create(loc, rewriter, {adaptor.getSize(), pVec, dtp, stream})
-          .getResult();
-  rewriter.replaceOp(op, {handle, stream});
-  return success();
-}
-
-LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::DestroyDnVecOp op, OpAdaptor adaptor,
-    ConversionPatternRewriter &rewriter) const {
-  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, op)))
-    return failure();
-  Location loc = op.getLoc();
-  auto stream = adaptor.getAsyncDependencies().front();
-  destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream});
-  rewriter.replaceOp(op, {stream});
-  return success();
-}
-
-LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::CreateDnMatOp op, OpAdaptor adaptor,
-    ConversionPatternRewriter &rewriter) const {
-  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, op)))
-    return failure();
-  Location loc = op.getLoc();
-  auto stream = adaptor.getAsyncDependencies().front();
-  Value pMat =
-      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
-  if (!getTypeConverter()->useOpaquePointers())
-    pMat = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pMat);
-  Type dType = op.getMemref().getType().getElementType();
-  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+  SmallVector<Value, 4> dims;
+  for (Value dim : adaptor.getDims()) {
+    dims.push_back(dim);
+  }
   Value handle;
   // TODO: For now, we track the use of the handle and lower it to cusparse /
   // cusparseLt accordingly. If in a block, both cusparse and cusparseLt are
   // used, we require two separate Creation ops to be the correct logic. In
   // future, we may add support to using one handle in sparse tensor / GPU
   // dialect in both cusparse and cusparseLt. use the cusparseLt create call if
   // the dnmat is used with spmat with 2:4 sparsity
-  if (isSpMMCusparseLtOp(op.getDmat())) {
-    auto envHandle = adaptor.getEnv();
-    AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
-    auto handleSz = rewriter.create<LLVM::ConstantOp>(
-        loc, getIndexType(), rewriter.getIndexAttr(11032));
-    handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
-                                             llvmInt8Type, handleSz);
-    handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
+  if (dims.size() == 2) {
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      auto envHandle = adaptor.getEnv();
+      AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
+      auto handleSz = rewriter.create<LLVM::ConstantOp>(
+          loc, getIndexType(), rewriter.getIndexAttr(11032));
+      handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
+                                               llvmInt8Type, handleSz);
+      handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
 
-    createLtDnMatCallBuilder
-        .create(loc, rewriter,
-                {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat,
-                 dtp, stream})
-        .getResult();
-  } else {
-    handle =
-        createDnMatCallBuilder
-            .create(loc, rewriter,
-                    {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream})
-            .getResult();
+      createLtDnMatCallBuilder
+          .create(loc, rewriter,
+                  {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream})
+          .getResult();
+    } else {
+      handle =
+          createDnMatCallBuilder
+              .create(loc, rewriter, {dims[0], dims[1], pTensor, dtp, stream})
+              .getResult();
+    }
+  } else {
+    assert(dims.size() == 1 && "Only 1D and 2D tensors are supported");
+    handle = createDnVecCallBuilder
+                 .create(loc, rewriter, {dims[0], pTensor, dtp, stream})
+                 .getResult();
   }
   rewriter.replaceOp(op, {handle, stream});
   return success();
 }
 
-LogicalResult ConvertDestroyDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+LogicalResult ConvertDestroyDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
   auto stream = adaptor.getAsyncDependencies().front();
-  // Use the cusparseLt destroy call if the dnmat is used with spmat with
-  // 2:4 sparsity
-  if (isSpMMCusparseLtOp(op.getDmat())) {
-    destroyCuSparseLtDnMatBuilder.create(loc, rewriter,
-                                         {adaptor.getDmat(), stream});
+  auto definingOp = op.getDnTensor().getDefiningOp<gpu::CreateDnTensorOp>();
+  SmallVector<Value, 4> dims;
+  for (Value dim : definingOp.getDims()) {
+    dims.push_back(dim);
+  }
+  if (dims.size() == 2) {
+    // Use the cusparseLt destroy call if the dnmat is used with spmat with
+    // 2:4 sparsity
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      destroyCuSparseLtDnMatBuilder.create(loc, rewriter,
+                                           {adaptor.getDnTensor(), stream});
+    } else {
+      destroyDnMatCallBuilder.create(loc, rewriter,
+                                     {adaptor.getDnTensor(), stream});
+    }
   } else {
-    destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream});
+    assert(dims.size() == 1 && "Only 1D and 2D tensors are supported");
+    destroyDnVecCallBuilder.create(loc, rewriter,
+                                   {adaptor.getDnTensor(), stream});
   }
   rewriter.replaceOp(op, {stream});
   return success();
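The lowering above dispatches on the rank recorded in the `gpu.create_dn_tensor` op: one dimension routes to the dense-vector runtime call builders, two dimensions to the dense-matrix ones, and the cusparseLt variants are picked when the 2-D handle is consumed by an SpMM with a 2:4 sparse matrix. A hedged MLIR sketch of that 2:4 case, condensed from the f16 test further below (sizes and buffers are placeholders):

```mlir
%spmat, %t1 = gpu.create_2to4_spmat async [%t0] %env, %m, %k, %a : memref<?xf16>
%dnmat, %t2 = gpu.create_dn_tensor async [%t1] %env, %b, %k, %n : index, index into memref<?xf16>
%szs, %t3 = gpu.spmm_buffer_size async [%t2] %env, %spmat, %dnmat, %dnmat : tuple<index, index, index> into f16
%t4 = gpu.spmm async [%t3] %env, %spmat, %dnmat, %dnmat, %b0, %b1, %b2 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
```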
@@ -1914,8 +1882,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                StringRef gpuBinaryAnnotation,
                                                bool kernelBarePtrCallConv) {
   addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
-  addOpaquePointerConversion<gpu::SparseDnVecHandleType>(converter);
-  addOpaquePointerConversion<gpu::SparseDnMatHandleType>(converter);
+  addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseEnvHandleType>(converter);

@@ -1931,10 +1898,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertAsyncYieldToGpuRuntimeCallPattern,
                ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
                ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
-               ConvertCreateDnVecOpToGpuRuntimeCallPattern,
-               ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
-               ConvertCreateDnMatOpToGpuRuntimeCallPattern,
-               ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
+               ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
+               ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,

@@ -147,8 +147,7 @@ void GPUDialect::initialize() {
   addTypes<AsyncTokenType>();
   addTypes<MMAMatrixType>();
   addTypes<SparseEnvHandleType>();
-  addTypes<SparseDnVecHandleType>();
-  addTypes<SparseDnMatHandleType>();
+  addTypes<SparseDnTensorHandleType>();
   addTypes<SparseSpMatHandleType>();
   addOperations<
 #define GET_OP_LIST

@@ -165,10 +164,8 @@ static std::string getSparseHandleKeyword(SparseHandleKind kind) {
   switch (kind) {
   case SparseHandleKind::Env:
     return "sparse.env_handle";
-  case SparseHandleKind::DnVec:
-    return "sparse.dnvec_handle";
-  case SparseHandleKind::DnMat:
-    return "sparse.dnmat_handle";
+  case SparseHandleKind::DnTensor:
+    return "sparse.dntensor_handle";
   case SparseHandleKind::SpMat:
     return "sparse.spmat_handle";
   }

@@ -221,10 +218,8 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const {
 
   if (keyword == getSparseHandleKeyword(SparseHandleKind::Env))
     return SparseEnvHandleType::get(context);
-  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnVec))
-    return SparseDnVecHandleType::get(context);
-  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnMat))
-    return SparseDnMatHandleType::get(context);
+  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor))
+    return SparseDnTensorHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
     return SparseSpMatHandleType::get(context);

@@ -238,10 +233,9 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
       .Case<AsyncTokenType>([&](Type) { os << "async.token"; })
       .Case<SparseEnvHandleType>(
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::Env); })
-      .Case<SparseDnVecHandleType>(
-          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnVec); })
-      .Case<SparseDnMatHandleType>(
-          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnMat); })
+      .Case<SparseDnTensorHandleType>([&](Type) {
+        os << getSparseHandleKeyword(SparseHandleKind::DnTensor);
+      })
      .Case<SparseSpMatHandleType>(
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
       .Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
@@ -450,7 +450,7 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
   // Create sparse environment and sparse matrix/dense vector handles.
   Type indexTp = rewriter.getIndexType();
   Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
-  Type dnVecHandleTp = rewriter.getType<gpu::SparseDnVecHandleType>();
+  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);

@@ -463,12 +463,12 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                  rowA, colA, valA, isCOO, enableRT);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
-  auto dvecX = rewriter.create<gpu::CreateDnVecOp>(loc, dnVecHandleTp, tokenTp,
-                                                   token, handle, vecX, szX);
+  auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX);
   Value dnX = dvecX.getResult(0);
   token = dvecX.getAsyncToken();
-  auto dvecY = rewriter.create<gpu::CreateDnVecOp>(loc, dnVecHandleTp, tokenTp,
-                                                   token, handle, vecY, szY);
+  auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY);
   Value dnY = dvecY.getResult(0);
   token = dvecY.getAsyncToken();

@@ -493,9 +493,9 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
   // Copy data back to host and free all the resoures.
   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnX)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnY)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
               .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();

@@ -557,7 +557,7 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
   // Create sparse environment and sparse matrix/dense matrix handles.
   Type indexTp = rewriter.getIndexType();
   Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
-  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
+  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);

@@ -570,12 +570,14 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                  rowA, colA, valA, isCOO, enableRT);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
-  auto dmatB = rewriter.create<gpu::CreateDnMatOp>(
-      loc, dnMatHandleTp, tokenTp, token, handle, szk, szn, matB);
+  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, matB,
+      SmallVector<Value>{szk, szn});
   Value dnB = dmatB.getResult(0);
   token = dmatB.getAsyncToken();
-  auto dmatC = rewriter.create<gpu::CreateDnMatOp>(
-      loc, dnMatHandleTp, tokenTp, token, handle, szm, szn, matC);
+  auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, matC,
+      SmallVector<Value>{szm, szn});
   Value dnC = dmatC.getResult(0);
   token = dmatC.getAsyncToken();

@@ -602,9 +604,9 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
   // Copy data back to host and free all the resoures.
   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnC)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
               .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();
@@ -22,11 +22,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf16>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref<?xf16>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf16>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf16>
     %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple<index,index,index> into f16
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -22,11 +22,11 @@ module attributes {gpu.container_module} {
    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref<?xf64>
+    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
     %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -52,11 +52,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -82,11 +82,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
     %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return
@@ -332,14 +332,14 @@ module attributes {gpu.container_module} {
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
     // CHECK: gpu.create_csr async
     %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    // CHECK: gpu.create_dn_vec async
-    %dnvec, %token6 = gpu.create_dn_vec async [%token5] %env, %mem2, %arg0 : memref<?xf64>
+    // CHECK: gpu.create_dn_tensor async
+    %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref<?xf64>
     // CHECK: gpu.spmv_buffer_size async
     %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64
     // CHECK: gpu.spmv async
     %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
-    // CHECK: gpu.create_dn_mat async
-    %dnmat, %token9 = gpu.create_dn_mat async [%token8] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    // CHECK: gpu.create_dn_tensor async
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     // CHECK: gpu.spmm_buffer_size async
     %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64
     // CHECK: gpu.spmm async

@@ -348,12 +348,12 @@ module attributes {gpu.container_module} {
     %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64
     // CHECK: gpu.sddmm async
     %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
-    // CHECK: gpu.destroy_dn_mat async
-    %token14 = gpu.destroy_dn_mat async [%token13] %dnmat
+    // CHECK: gpu.destroy_dn_tensor async
+    %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
     // CHECK: gpu.destroy_sp_mat async
     %token15 = gpu.destroy_sp_mat async [%token14] %spmat
-    // CHECK: gpu.destroy_dn_vec async
-    %token16 = gpu.destroy_dn_vec async [%token15] %dnvec
+    // CHECK: gpu.destroy_dn_tensor async
+    %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
     // CHECK: gpu.destroy_sparse_env async
     %token17 = gpu.destroy_sparse_env async [%token16] %env
     // CHECK: gpu.wait
@@ -8,11 +8,11 @@ module attributes {gpu.container_module} {
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
   // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_dn_vec async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return

@@ -22,11 +22,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref<?xf64>
+    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
     %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -38,11 +38,11 @@ module attributes {gpu.container_module} {
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
   // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_dn_mat async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return

@@ -52,11 +52,11 @@ module attributes {gpu.container_module} {
    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -68,11 +68,11 @@ module attributes {gpu.container_module} {
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
   // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_dn_mat async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return

@@ -82,11 +82,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
     %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return
@@ -47,14 +47,14 @@
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
 // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
 // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref<?x?xf64>
 // CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
 // CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
-// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
-// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
+// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
+// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
 // CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
 // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
 // CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>

@@ -45,14 +45,14 @@ module {
 // CHECK: %[[VAL_38:.*]] = gpu.wait async
 // CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : memref<?xf64>
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : memref<?xf64>
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref<?xf64>
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref<?xf64>
 // CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>
 // CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
-// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]
-// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]
+// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
+// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
 // CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
 // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>