[mlir][sparse][gpu] unify dnmat and dnvec handle and ops
Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D152465
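In short, the previously separate `gpu.create_dn_vec`/`gpu.create_dn_mat` ops (and their destroy counterparts) are folded into one rank-polymorphic `gpu.create_dn_tensor`/`gpu.destroy_dn_tensor` pair that takes a variadic list of dimension sizes. A minimal before/after sketch assembled from the hunks below; the value names (%env, %mem, %size, %rows, %cols, %dep) are placeholders, not taken from a real test:

```mlir
// Before: separate ops for dense vectors and dense matrices.
%dvec, %t1 = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref<?xf64>
%dmat, %t2 = gpu.create_dn_mat async [%t1] %env, %rows, %cols, %mem : memref<?xf64>

// After: one op; the variadic dims list selects vector vs. matrix.
%dvec, %t1 = gpu.create_dn_tensor async [%dep] %env, %mem, %size : index into memref<?xf64>
%dmat, %t2 = gpu.create_dn_tensor async [%t1] %env, %mem, %rows, %cols : index, index into memref<?xf64>
```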
@@ -116,17 +116,11 @@ def GPU_SparseEnvHandle :
     "sparse environment handle type">,
   BuildableType<"mlir::gpu::SparseEnvHandleType::get($_builder.getContext())">;
 
-def GPU_SparseDnVecHandle :
+def GPU_SparseDnTensorHandle :
   DialectType<GPU_Dialect,
-    CPred<"llvm::isa<::mlir::gpu::SparseDnVecHandleType>($_self)">,
-    "dense vector handle type">,
-  BuildableType<"mlir::gpu::SparseDnVecHandleType::get($_builder.getContext())">;
-
-def GPU_SparseDnMatHandle :
-  DialectType<GPU_Dialect,
-    CPred<"llvm::isa<::mlir::gpu::SparseDnMatHandleType>($_self)">,
-    "dense matrix handle type">,
-  BuildableType<"mlir::gpu::SparseDnMatHandleType::get($_builder.getContext())">;
+    CPred<"llvm::isa<::mlir::gpu::SparseDnTensorHandleType>($_self)">,
+    "dense tensor handle type">,
+  BuildableType<"mlir::gpu::SparseDnTensorHandleType::get($_builder.getContext())">;
 
 def GPU_SparseSpMatHandle :
   DialectType<GPU_Dialect,

@@ -165,7 +165,7 @@ public:
 void addAsyncDependency(Operation *op, Value token);
 
 // Handle types for sparse.
-enum class SparseHandleKind { Env, DnVec, DnMat, SpMat };
+enum class SparseHandleKind { Env, SpMat, DnTensor };
 
 template <SparseHandleKind K>
 class SparseHandleType

@@ -177,8 +177,7 @@ public:
 };
 
 using SparseEnvHandleType = SparseHandleType<SparseHandleKind::Env>;
-using SparseDnVecHandleType = SparseHandleType<SparseHandleKind::DnVec>;
-using SparseDnMatHandleType = SparseHandleType<SparseHandleKind::DnMat>;
+using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
 using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
 
 } // namespace gpu
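With the unified kind, a single handle type now stands in for both dense vectors and dense matrices on the IR level. A small illustrative declaration, assuming the types print with the keywords introduced further down (`sparse.dntensor_handle`, `sparse.spmat_handle`, `sparse.env_handle`); the function name is made up:

```mlir
// A hypothetical external consumer taking the three sparse handle types.
func.func private @consume(!gpu.sparse.dntensor_handle,
                           !gpu.sparse.spmat_handle,
                           !gpu.sparse.env_handle)
```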
@@ -1597,73 +1597,13 @@ def GPU_DestroySparseEnvOp : GPU_Op<
   }];
 }
 
-def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
-  let summary = "Create dense vector operation";
+def GPU_CreateDnTensorOp : GPU_Op<"create_dn_tensor", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
+  let summary = "Create dense tensor operation";
   let description = [{
-    The `gpu.create_dn_vec` operation initializes a dense vector from
-    the given values buffer and size. The buffer must already be copied
-    from the host to the device prior to using this operation. The
-    operation returns a handle to the dense vector descriptor.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %dvec, %token = gpu.create_dn_vec async [%dep] %env, %mem, %size : memref<?xf64>
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   GPU_SparseEnvHandle:$env,
-                   AnyMemRef:$memref,
-                   Index:$size);
-  let results = (outs Res<GPU_SparseDnVecHandle>:$dvec,
-                      Optional<GPU_AsyncToken>:$asyncToken);
-
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $memref `,` $size attr-dict `:` type($memref)
-  }];
-}
-
-def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
-  let summary = "Destroy dense vector operation";
-  let description = [{
-    The `gpu.destroy_dn_vec` operation releases all resources of a dense
-    vector represented by a handle that was previously created by a
-    `gpu.create_dn_vec` operation.
-
-    If the `async` keyword is present, the op is executed asynchronously (i.e.
-    it does not block until the execution has finished on the device). In
-    that case, it returns a !gpu.async.token in addition to the environment.
-
-    Example:
-
-    ```mlir
-    %token = gpu.destroy_dn_vec async [%dep] %dvec
-    ```
-  }];
-
-  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   Arg<GPU_SparseDnVecHandle>:$dvec);
-  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
-
-  let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $dvec attr-dict
-  }];
-}
-
-def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> {
-  let summary = "Create dense matrix operation";
-  let description = [{
-    The `gpu.create_dn_mat` operation initializes a dense matrix from
+    The `gpu.create_dn_tensor` operation initializes a dense tensor from
     the given values buffer and sizes. The buffer must already be copied
     from the host to the device prior to using this operation. The
-    operation returns a handle to the dense matrix descriptor.
+    operation returns a handle to the dense tensor descriptor.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In

@@ -1672,29 +1612,28 @@ def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> {
     Example:
 
     ```mlir
-    %dmat, %token = gpu.create_dn_mat async [%dep] %env, %rows, %cols, %mem : memref<?xf64>
+    %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref<?xf64>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                    GPU_SparseEnvHandle:$env,
-                   Index:$rows,
-                   Index:$cols,
-                   AnyMemRef:$memref);
-  let results = (outs Res<GPU_SparseDnMatHandle>:$dmat, Optional<GPU_AsyncToken>:$asyncToken);
+                   AnyMemRef:$memref,
+                   Variadic<Index>:$dims);
+  let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+    $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref)
   }];
 }
 
-def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
-  let summary = "Destroy dense matrix operation";
+def GPU_DestroyDnTensorOp : GPU_Op<"destroy_dn_tensor", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy dense tensor operation";
   let description = [{
-    The `gpu.destroy_dn_mat` operation releases all resources of a dense
-    matrix represented by a handle that was previously created by a
-    `gpu.create_dn_mat` operation.
+    The `gpu.destroy_dn_tensor` operation releases all resources of a dense
+    tensor represented by a handle that was previously created by a
+    `gpu.create_dn_tensor` operation.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
     it does not block until the execution has finished on the device). In

@@ -1703,17 +1642,17 @@ def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
     Example:
 
     ```mlir
-    %token = gpu.destroy_dn_vec async [%dep] %dmat
+    %token = gpu.destroy_dn_tensor async [%dep] %dnTensor
    ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   Arg<GPU_SparseDnMatHandle>:$dmat);
+                   Arg<GPU_SparseDnTensorHandle>:$dnTensor);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $dmat attr-dict
+    $dnTensor attr-dict
   }];
 }
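Putting the new op pair together: a hedged sketch of the full lifecycle of a 1-D and a 2-D dense descriptor with the async token threaded through, following the definitions above (buffers, sizes, and token names are placeholders):

```mlir
// 1-D descriptor (what used to be gpu.create_dn_vec / gpu.destroy_dn_vec).
%dnX, %t1 = gpu.create_dn_tensor async [%t0] %env, %vec, %n : index into memref<?xf64>
// 2-D descriptor (what used to be gpu.create_dn_mat / gpu.destroy_dn_mat).
%dnB, %t2 = gpu.create_dn_tensor async [%t1] %env, %mat, %rows, %cols : index, index into memref<?xf64>
// ... use the handles ...
%t3 = gpu.destroy_dn_tensor async [%t2] %dnX
%t4 = gpu.destroy_dn_tensor async [%t3] %dnB
```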
@@ -1945,8 +1884,8 @@ def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnVecHandle:$dnX,
-                   GPU_SparseDnVecHandle:$dnY,
+                   GPU_SparseDnTensorHandle:$dnX,
+                   GPU_SparseDnTensorHandle:$dnY,
                    TypeAttr:$computeType);
   let results = (outs Res<Index>:$bufferSz,
                       Optional<GPU_AsyncToken>:$asyncToken);

@@ -1998,8 +1937,8 @@ def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnVecHandle:$dnX,
-                   GPU_SparseDnVecHandle:$dnY,
+                   GPU_SparseDnTensorHandle:$dnX,
+                   GPU_SparseDnTensorHandle:$dnY,
                    TypeAttr:$computeType,
                    AnyMemRef:$buffer);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

@@ -2052,8 +1991,8 @@ def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
-                   GPU_SparseDnMatHandle:$dnmatC,
+                   GPU_SparseDnTensorHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatC,
                    TypeAttr:$computeType);
   let results = (outs Res<AnyTypeOf<[Index, TupleOf<[Index, Index,
                                                      Index]>]>>:$bufferSzs,

@@ -2108,8 +2047,8 @@ def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
                    GPU_SparseSpMatHandle:$spmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
-                   GPU_SparseDnMatHandle:$dnmatC,
+                   GPU_SparseDnTensorHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatC,
                    TypeAttr:$computeType,
                    Variadic<AnyMemRef>:$buffers);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

@@ -2162,8 +2101,8 @@ def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]>
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
-                   GPU_SparseDnMatHandle:$dnmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatA,
+                   GPU_SparseDnTensorHandle:$dnmatB,
                    GPU_SparseSpMatHandle:$spmatC,
                    TypeAttr:$computeType);
   let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);

@@ -2216,8 +2155,8 @@ def GPU_SDDMMOp : GPU_Op<"sddmm", [GPU_AsyncOpInterface]> {
                    GPU_SparseEnvHandle:$env,
                    GPU_TransposeModeAttr:$modeA,
                    GPU_TransposeModeAttr:$modeB,
-                   GPU_SparseDnMatHandle:$dnmatA,
-                   GPU_SparseDnMatHandle:$dnmatB,
+                   GPU_SparseDnTensorHandle:$dnmatA,
+                   GPU_SparseDnTensorHandle:$dnmatB,
                    GPU_SparseSpMatHandle:$spmatC,
                    TypeAttr:$computeType,
                    AnyMemRef:$buffer);
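Since spmv, spmm, and sddmm now accept the unified handle type for their dense operands, a complete SpMV sequence reads as follows. This is a hedged sketch condensed from the test updates further below; value names are placeholders:

```mlir
%dnX, %t1 = gpu.create_dn_tensor async [%t0] %env, %x, %n : index into memref<?xf64>
%dnY, %t2 = gpu.create_dn_tensor async [%t1] %env, %y, %n : index into memref<?xf64>
%bufSz, %t3 = gpu.spmv_buffer_size async [%t2] %env, %spmat, %dnX, %dnY into f64
%t4 = gpu.spmv async [%t3] %env, %spmat, %dnX, %dnY, %buf : memref<?xf64> into f64
%t5 = gpu.destroy_dn_tensor async [%t4] %dnX
%t6 = gpu.destroy_dn_tensor async [%t5] %dnY
```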
@@ -548,51 +548,31 @@ private:
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-class ConvertCreateDnVecOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
+class ConvertCreateDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp> {
 public:
-  ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp>(typeConverter) {}
+  ConvertCreateDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnTensorOp>(typeConverter) {
+  }
 
 private:
   LogicalResult
-  matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor,
+  matchAndRewrite(gpu::CreateDnTensorOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-class ConvertDestroyDnVecOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp> {
+class ConvertDestroyDnTensorOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp> {
 public:
-  ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp>(typeConverter) {}
+  ConvertDestroyDnTensorOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnTensorOp>(
+            typeConverter) {}
 
 private:
   LogicalResult
-  matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override;
-};
-
-class ConvertCreateDnMatOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp> {
-public:
-  ConvertCreateDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp>(typeConverter) {}
-
-private:
-  LogicalResult
-  matchAndRewrite(gpu::CreateDnMatOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override;
-};
-
-class ConvertDestroyDnMatOpToGpuRuntimeCallPattern
-    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp> {
-public:
-  ConvertDestroyDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp>(typeConverter) {}
-
-private:
-  LogicalResult
-  matchAndRewrite(gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+  matchAndRewrite(gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
@@ -1474,102 +1454,90 @@ LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
-LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::CreateDnVecOp op, OpAdaptor adaptor,
+LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateDnTensorOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
   auto stream = adaptor.getAsyncDependencies().front();
-  Value pVec =
+  Value pTensor =
       MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
-    pVec = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVec);
+    pTensor = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pTensor);
   Type dType = op.getMemref().getType().getElementType();
   auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
-  auto handle =
-      createDnVecCallBuilder
-          .create(loc, rewriter, {adaptor.getSize(), pVec, dtp, stream})
-          .getResult();
-  rewriter.replaceOp(op, {handle, stream});
-  return success();
-}
-
-LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::DestroyDnVecOp op, OpAdaptor adaptor,
-    ConversionPatternRewriter &rewriter) const {
-  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, op)))
-    return failure();
-  Location loc = op.getLoc();
-  auto stream = adaptor.getAsyncDependencies().front();
-  destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream});
-  rewriter.replaceOp(op, {stream});
-  return success();
-}
-
-LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::CreateDnMatOp op, OpAdaptor adaptor,
-    ConversionPatternRewriter &rewriter) const {
-  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, op)))
-    return failure();
-  Location loc = op.getLoc();
-  auto stream = adaptor.getAsyncDependencies().front();
-  Value pMat =
-      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
-  if (!getTypeConverter()->useOpaquePointers())
-    pMat = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pMat);
-  Type dType = op.getMemref().getType().getElementType();
-  auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType));
+  SmallVector<Value, 4> dims;
+  for (Value dim : adaptor.getDims()) {
+    dims.push_back(dim);
+  }
   Value handle;
   // TODO: For now, we track the use of the handle and lower it to cusparse /
   // cusparseLt accordingly. If in a block, both cusparse and cusparseLt are
   // used, we require two separate Creation ops to be the correct logic. In
   // future, we may add support to using one handle in sparse tensor / GPU
   // dialect in both cusparse and cusparseLt. use the cusparseLt create call if
   // the dnmat is used with spmat with 2:4 sparsity
-  if (isSpMMCusparseLtOp(op.getDmat())) {
-    auto envHandle = adaptor.getEnv();
-    AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
-    auto handleSz = rewriter.create<LLVM::ConstantOp>(
-        loc, getIndexType(), rewriter.getIndexAttr(11032));
-    handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
-                                             llvmInt8Type, handleSz);
-    handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
+  if (dims.size() == 2) {
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      auto envHandle = adaptor.getEnv();
+      AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
+      auto handleSz = rewriter.create<LLVM::ConstantOp>(
+          loc, getIndexType(), rewriter.getIndexAttr(11032));
+      handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
+                                               llvmInt8Type, handleSz);
+      handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle);
 
-    createLtDnMatCallBuilder
-        .create(loc, rewriter,
-                {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat,
-                 dtp, stream})
-        .getResult();
-  } else {
-    handle =
-        createDnMatCallBuilder
-            .create(loc, rewriter,
-                    {adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream})
-            .getResult();
+      createLtDnMatCallBuilder
+          .create(loc, rewriter,
+                  {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream})
+          .getResult();
+    } else {
+      handle =
+          createDnMatCallBuilder
+              .create(loc, rewriter, {dims[0], dims[1], pTensor, dtp, stream})
+              .getResult();
+    }
+  } else {
+    assert(dims.size() == 1 && "Only 1D and 2D tensors are supported");
+    handle = createDnVecCallBuilder
+                 .create(loc, rewriter, {dims[0], pTensor, dtp, stream})
+                 .getResult();
   }
   rewriter.replaceOp(op, {handle, stream});
   return success();
 }
 
-LogicalResult ConvertDestroyDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+LogicalResult ConvertDestroyDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::DestroyDnTensorOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
   auto stream = adaptor.getAsyncDependencies().front();
-  // Use the cusparseLt destroy call if the dnmat is used with spmat with
-  // 2:4 sparsity
-  if (isSpMMCusparseLtOp(op.getDmat())) {
-    destroyCuSparseLtDnMatBuilder.create(loc, rewriter,
-                                         {adaptor.getDmat(), stream});
+  auto definingOp = op.getDnTensor().getDefiningOp<gpu::CreateDnTensorOp>();
+  SmallVector<Value, 4> dims;
+  for (Value dim : definingOp.getDims()) {
+    dims.push_back(dim);
+  }
+  if (dims.size() == 2) {
+    // Use the cusparseLt destroy call if the dnmat is used with spmat with
+    // 2:4 sparsity
+    if (isSpMMCusparseLtOp(op.getDnTensor())) {
+      destroyCuSparseLtDnMatBuilder.create(loc, rewriter,
+                                           {adaptor.getDnTensor(), stream});
+    } else {
+      destroyDnMatCallBuilder.create(loc, rewriter,
+                                     {adaptor.getDnTensor(), stream});
+    }
   } else {
-    destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream});
+    assert(dims.size() == 1 && "Only 1D and 2D tensors are supported");
+    destroyDnVecCallBuilder.create(loc, rewriter,
+                                   {adaptor.getDnTensor(), stream});
   }
   rewriter.replaceOp(op, {stream});
   return success();
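The lowering above dispatches on the rank recorded in the `gpu.create_dn_tensor` op: one dimension routes to the dense-vector runtime call builders, two dimensions to the dense-matrix ones, and the cusparseLt variants are picked when the 2-D handle is consumed by an SpMM with a 2:4 sparse matrix. A hedged MLIR sketch of that 2:4 case, condensed from the f16 test further below (sizes and buffers are placeholders):

```mlir
%spmat, %t1 = gpu.create_2to4_spmat async [%t0] %env, %m, %k, %a : memref<?xf16>
%dnmat, %t2 = gpu.create_dn_tensor async [%t1] %env, %b, %k, %n : index, index into memref<?xf16>
%szs, %t3 = gpu.spmm_buffer_size async [%t2] %env, %spmat, %dnmat, %dnmat : tuple<index, index, index> into f16
%t4 = gpu.spmm async [%t3] %env, %spmat, %dnmat, %dnmat, %b0, %b1, %b2 : memref<?xf16>, memref<?xf16>, memref<?xf16> into f16
```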
@@ -1914,8 +1882,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                StringRef gpuBinaryAnnotation,
                                                bool kernelBarePtrCallConv) {
   addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
-  addOpaquePointerConversion<gpu::SparseDnVecHandleType>(converter);
-  addOpaquePointerConversion<gpu::SparseDnMatHandleType>(converter);
+  addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseEnvHandleType>(converter);

@@ -1931,10 +1898,8 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertAsyncYieldToGpuRuntimeCallPattern,
                ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
                ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
-               ConvertCreateDnVecOpToGpuRuntimeCallPattern,
-               ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
-               ConvertCreateDnMatOpToGpuRuntimeCallPattern,
-               ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
+               ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
+               ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCooAoSOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,

@@ -147,8 +147,7 @@ void GPUDialect::initialize() {
   addTypes<AsyncTokenType>();
   addTypes<MMAMatrixType>();
   addTypes<SparseEnvHandleType>();
-  addTypes<SparseDnVecHandleType>();
-  addTypes<SparseDnMatHandleType>();
+  addTypes<SparseDnTensorHandleType>();
   addTypes<SparseSpMatHandleType>();
   addOperations<
 #define GET_OP_LIST

@@ -165,10 +164,8 @@ static std::string getSparseHandleKeyword(SparseHandleKind kind) {
   switch (kind) {
   case SparseHandleKind::Env:
     return "sparse.env_handle";
-  case SparseHandleKind::DnVec:
-    return "sparse.dnvec_handle";
-  case SparseHandleKind::DnMat:
-    return "sparse.dnmat_handle";
+  case SparseHandleKind::DnTensor:
+    return "sparse.dntensor_handle";
   case SparseHandleKind::SpMat:
     return "sparse.spmat_handle";
   }

@@ -221,10 +218,8 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const {
 
   if (keyword == getSparseHandleKeyword(SparseHandleKind::Env))
     return SparseEnvHandleType::get(context);
-  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnVec))
-    return SparseDnVecHandleType::get(context);
-  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnMat))
-    return SparseDnMatHandleType::get(context);
+  if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor))
+    return SparseDnTensorHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
     return SparseSpMatHandleType::get(context);

@@ -238,10 +233,9 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
       .Case<AsyncTokenType>([&](Type) { os << "async.token"; })
       .Case<SparseEnvHandleType>(
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::Env); })
-      .Case<SparseDnVecHandleType>(
-          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnVec); })
-      .Case<SparseDnMatHandleType>(
-          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::DnMat); })
+      .Case<SparseDnTensorHandleType>([&](Type) {
+        os << getSparseHandleKeyword(SparseHandleKind::DnTensor);
+      })
      .Case<SparseSpMatHandleType>(
           [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
       .Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
@@ -450,7 +450,7 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
   // Create sparse environment and sparse matrix/dense vector handles.
   Type indexTp = rewriter.getIndexType();
   Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
-  Type dnVecHandleTp = rewriter.getType<gpu::SparseDnVecHandleType>();
+  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spmatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);

@@ -463,12 +463,12 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                  rowA, colA, valA, isCOO, enableRT);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
-  auto dvecX = rewriter.create<gpu::CreateDnVecOp>(loc, dnVecHandleTp, tokenTp,
-                                                   token, handle, vecX, szX);
+  auto dvecX = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX);
   Value dnX = dvecX.getResult(0);
   token = dvecX.getAsyncToken();
-  auto dvecY = rewriter.create<gpu::CreateDnVecOp>(loc, dnVecHandleTp, tokenTp,
-                                                   token, handle, vecY, szY);
+  auto dvecY = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY);
   Value dnY = dvecY.getResult(0);
   token = dvecY.getAsyncToken();

@@ -493,9 +493,9 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
   // Copy data back to host and free all the resoures.
   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnX)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnX)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnY)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnY)
               .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();

@@ -557,7 +557,7 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
   // Create sparse environment and sparse matrix/dense matrix handles.
   Type indexTp = rewriter.getIndexType();
   Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
-  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
+  Type dnTensorHandleTp = rewriter.getType<gpu::SparseDnTensorHandleType>();
   Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
   Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
   Value token = genFirstWait(rewriter, loc);

@@ -570,12 +570,14 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                  rowA, colA, valA, isCOO, enableRT);
   Value spMatA = spGenA->getResult(0);
   token = spGenA->getResult(1);
-  auto dmatB = rewriter.create<gpu::CreateDnMatOp>(
-      loc, dnMatHandleTp, tokenTp, token, handle, szk, szn, matB);
+  auto dmatB = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, matB,
+      SmallVector<Value>{szk, szn});
   Value dnB = dmatB.getResult(0);
   token = dmatB.getAsyncToken();
-  auto dmatC = rewriter.create<gpu::CreateDnMatOp>(
-      loc, dnMatHandleTp, tokenTp, token, handle, szm, szn, matC);
+  auto dmatC = rewriter.create<gpu::CreateDnTensorOp>(
+      loc, dnTensorHandleTp, tokenTp, token, handle, matC,
+      SmallVector<Value>{szm, szn});
   Value dnC = dmatC.getResult(0);
   token = dmatC.getAsyncToken();

@@ -602,9 +604,9 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
   // Copy data back to host and free all the resoures.
   token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnB)
               .getAsyncToken();
-  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnC)
+  token = rewriter.create<gpu::DestroyDnTensorOp>(loc, tokenTp, token, dnC)
               .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();
@@ -22,11 +22,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf16>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref<?xf16>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf16>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf16>
     %bufferSzs, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : tuple<index,index,index> into f16
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref<?xf16>,memref<?xf16>,memref<?xf16> into f16
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -22,11 +22,11 @@ module attributes {gpu.container_module} {
    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref<?xf64>
+    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
     %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -52,11 +52,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -82,11 +82,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
     %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return
@@ -332,14 +332,14 @@ module attributes {gpu.container_module} {
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
     // CHECK: gpu.create_csr async
     %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    // CHECK: gpu.create_dn_vec async
-    %dnvec, %token6 = gpu.create_dn_vec async [%token5] %env, %mem2, %arg0 : memref<?xf64>
+    // CHECK: gpu.create_dn_tensor async
+    %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref<?xf64>
     // CHECK: gpu.spmv_buffer_size async
     %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64
     // CHECK: gpu.spmv async
     %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
-    // CHECK: gpu.create_dn_mat async
-    %dnmat, %token9 = gpu.create_dn_mat async [%token8] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    // CHECK: gpu.create_dn_tensor async
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     // CHECK: gpu.spmm_buffer_size async
     %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64
     // CHECK: gpu.spmm async

@@ -348,12 +348,12 @@ module attributes {gpu.container_module} {
     %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64
     // CHECK: gpu.sddmm async
     %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
-    // CHECK: gpu.destroy_dn_mat async
-    %token14 = gpu.destroy_dn_mat async [%token13] %dnmat
+    // CHECK: gpu.destroy_dn_tensor async
+    %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
     // CHECK: gpu.destroy_sp_mat async
     %token15 = gpu.destroy_sp_mat async [%token14] %spmat
-    // CHECK: gpu.destroy_dn_vec async
-    %token16 = gpu.destroy_dn_vec async [%token15] %dnvec
+    // CHECK: gpu.destroy_dn_tensor async
+    %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
     // CHECK: gpu.destroy_sparse_env async
     %token17 = gpu.destroy_sparse_env async [%token16] %env
     // CHECK: gpu.wait
@@ -8,11 +8,11 @@ module attributes {gpu.container_module} {
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_vec async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
   // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_dn_vec async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return

@@ -22,11 +22,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnvec, %token5 = gpu.create_dn_vec async [%token4] %env, %mem2, %arg0 : memref<?xf64>
+    %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64
     %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -38,11 +38,11 @@ module attributes {gpu.container_module} {
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
   // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_dn_mat async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return

@@ -52,11 +52,11 @@ module attributes {gpu.container_module} {
    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return

@@ -68,11 +68,11 @@ module attributes {gpu.container_module} {
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf64>
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_mat async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64>
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref<?xf64>
   // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
   // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<?xf64> into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_dn_mat async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return

@@ -82,11 +82,11 @@ module attributes {gpu.container_module} {
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
-    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %env, %arg0, %arg0, %mem2 : memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref<?xf64>
     %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
     %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
-    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
     gpu.wait [%token10]
     return
@@ -47,14 +47,14 @@
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
 // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
 // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref<?x?xf64>
 // CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
 // CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
-// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
-// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
+// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
+// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
 // CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
 // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
 // CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>

@@ -45,14 +45,14 @@ module {
 // CHECK: %[[VAL_38:.*]] = gpu.wait async
 // CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : memref<?xf64>
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : memref<?xf64>
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref<?xf64>
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref<?xf64>
 // CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>
 // CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
-// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]
-// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]
+// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
+// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
 // CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
 // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>