Mirror of https://github.com/intel/llvm.git, synced 2026-01-27 06:06:34 +08:00
[mlir] Update VectorToGPU to new memory space
GPU memory spaces have changed to new attributes. Update the VectorToGPU pass to use them.

Differential Revision: https://reviews.llvm.org/D142105
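For illustration, here is a minimal sketch of the pattern the patch applies, assuming the MLIR C++ API of this revision; the helper name `isWorkgroupMemRef` is ours, while the patch itself adds an equivalent `isSharedMemory` helper (see the first hunk below). The old raw integer memory space `3` is replaced by a query on the `#gpu.address_space<workgroup>` attribute.

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

// Hypothetical helper illustrating the change in how shared (workgroup)
// memory is detected on a memref type.
static bool isWorkgroupMemRef(MemRefType type) {
  // Old style: the memory space was a raw integer, with 3 meaning shared memory.
  //   return type.getMemorySpaceAsInt() == 3;
  // New style: the memory space is the GPU dialect attribute
  // #gpu.address_space<workgroup>.
  auto addressSpace =
      type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
  return addressSpace &&
         addressSpace.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace();
}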
@@ -669,6 +669,16 @@ createNonLdMatrixLoads(vector::TransferReadOp op, OpBuilder &builder,
   return success();
 }
 
+/// Return true if this is a shared memory memref type.
+static bool isSharedMemory(MemRefType type) {
+  auto addressSpace =
+      type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
+  if (addressSpace &&
+      addressSpace.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace())
+    return true;
+  return false;
+}
+
 /// Converts a `vector.transfer_read` operation directly to either a
 /// `vector.load` or a `nvgpu.ldmatrix` operation. This function should only be
 /// used when converting to `nvgpu.mma.sync` operations.
@@ -683,7 +693,7 @@ convertTransferReadToLoads(vector::TransferReadOp op,
     return failure();
 
   bool isLdMatrixCompatible =
-      op.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt() == 3 &&
+      isSharedMemory(op.getSource().getType().cast<MemRefType>()) &&
       nvgpu::inferTileWidthInBits(*warpMatrixInfo) == 128;
 
   VectorType vecTy = op.getVectorType();
@@ -28,7 +28,7 @@
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 
 // CHECK-LABEL: func @m16n8k32_int8_row_row_row
-func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, 3>, %arg1: memref<128x128xi8, 3>, %arg2: memref<128x128xi32>) {
+func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
   %cst_0 = arith.constant dense<0> : vector<32x8xi8>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -44,7 +44,7 @@ func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, 3>, %arg1: memref
 
   // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_map]]()[{{%.+}}]
   // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]()[{{%.+}}]
-  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, 3> -> vector<4x4xi8>
+  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>
 
   // Verify that the operandB load is lowered to scalar load to be able
   // to transpose at 8-bit granularity. ldmatrix can only transpose at
@@ -52,28 +52,28 @@ func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, 3>, %arg1: memref
 
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB1_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB2_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB3_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB4_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB5_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB6_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB7_map]]()[{{%.+}}]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
-  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, 3>
+  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
   // CHECK-NOT: memref.load %arg1
 
   // Verify that the operand C is distributed to loads correctly.
@@ -85,8 +85,8 @@ func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, 3>, %arg1: memref
   // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
   // CHECK-NOT: vector.load %arg2{{.*}}
 
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, 3>, vector<16x32xi8>
-  %B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xi8, 3>, vector<8x32xi8>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<16x32xi8>
+  %B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<8x32xi8>
   %C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
   // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>
@@ -178,7 +178,7 @@ func.func @m8n8k4_f64_row_row_row(%arg0: memref<128x128xf64>, %arg1: memref<128x
 // CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
 
 // CHECK-LABEL: func @m16n8k16_fp16_row_row_row
-func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<20x20xf16, 3>, %arg2: memref<20x20xf16, 3>) {
+func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf16, #gpu.address_space<workgroup>>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f16
@@ -189,11 +189,11 @@ func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
   // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK: nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true}
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<16x16xf16>
-  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<8x16xf16>
-  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<16x8xf16>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
-  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, 3>
+  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, #gpu.address_space<workgroup>>
   return
 }
 
@@ -212,7 +212,7 @@ func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 
 // CHECK-LABEL: func @m16n16k16_mmasync16816_fp16_f16_row_row_row
-func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16, 3>, %arg1: memref<32x64xf16, 3>, %arg2: memref<42x64xf16, 3>) {
+func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<32x64xf16, #gpu.address_space<workgroup>>, %arg2: memref<42x64xf16, #gpu.address_space<workgroup>>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
   %c0 = arith.constant 0 : index
   %c8 = arith.constant 8 : index
@@ -221,17 +221,17 @@ func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16,
   // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
   // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x32xf16, 3>, vector<16x16xf16>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
 
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
   // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
-  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<32x64xf16, 3>, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<32x64xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
 
   // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
   // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
-  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x64xf16, 3>, vector<16x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x64xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
 
   // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
   // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
@@ -239,7 +239,7 @@ func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16,
   %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
   %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
   %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
-  vector.transfer_write %D0, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, 3>
+  vector.transfer_write %D0, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, #gpu.address_space<workgroup>>
 
   // CHECK-DAG: [[fragmentB1:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
   // CHECK-DAG: [[fragmentC1:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
@@ -247,7 +247,7 @@ func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16,
   %B1 = vector.extract_strided_slice %B {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
   %C1 = vector.extract_strided_slice %C {offsets = [0, 8], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
   %D1 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B1, %C1 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
-  vector.transfer_write %D1, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, 3>
+  vector.transfer_write %D1, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, #gpu.address_space<workgroup>>
 
   return
 }
@@ -262,7 +262,7 @@ func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16,
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 
 // CHECK-LABEL: func @batch_m16n8k16_fp16_row_row_row
-func.func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16, 3>, %arg1: memref<2x20x20xf16, 3>, %arg2: memref<2x20x20xf16, 3>) {
+func.func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<2x20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<2x20x20xf16, #gpu.address_space<workgroup>>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<20x20xf16>
   // CHECK: [[C0:%.+]] = arith.constant 0 : index
   %c0 = arith.constant 0 : index
@@ -270,20 +270,20 @@ func.func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16, 3>, %arg1:
 
   // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
-  // CHECK: nvgpu.ldmatrix %arg0[[[C0]], [[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16, 3> -> vector<4x2xf16>
-  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, 3>, vector<16x16xf16>
+  // CHECK: nvgpu.ldmatrix %arg0[[[C0]], [[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<4x2xf16>
+  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
 
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
   // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
-  // CHECK: nvgpu.ldmatrix %arg1[[[C0]], [[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16, 3> -> vector<2x2xf16>
-  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16, 3>, vector<8x16xf16>
+  // CHECK: nvgpu.ldmatrix %arg1[[[C0]], [[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
 
   // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
-  // CHECK: nvgpu.ldmatrix %arg2[[[C0]], [[m_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16, 3> -> vector<2x2xf16>
-  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, 3>, vector<16x8xf16>
+  // CHECK: nvgpu.ldmatrix %arg2[[[C0]], [[m_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
-  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16, 3>
+  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>>
   return
 }
 
@@ -305,7 +305,7 @@ func.func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16, 3>, %arg1:
 // CHECK: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 8)>
 
 // CHECK-LABEL: func @m16n8k16_fp16_row_col_row
-func.func @m16n8k16_fp16_row_col_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<20x20xf16, 3>, %arg2: memref<20x20xf16, 3>) {
+func.func @m16n8k16_fp16_row_col_row(%arg0: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf16, #gpu.address_space<workgroup>>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
   %c0 = arith.constant 0 : index
 
@@ -324,11 +324,11 @@ func.func @m16n8k16_fp16_row_col_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<
   // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]
   // CHECK: nvgpu.ldmatrix %arg2[[[m_coord]], [[n_coord]]] {numTiles = 2 : i32
   // CHECK-SAME: transpose = false
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<16x16xf16>
-  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<8x16xf16>
-  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, 3>, vector<16x8xf16>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
-  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, 3>
+  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, #gpu.address_space<workgroup>>
   return
 }
 
@@ -354,7 +354,7 @@ func.func @m16n8k16_fp16_row_col_row(%arg0: memref<20x20xf16, 3>, %arg1: memref<
 // CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>
 
 // CHECK-LABEL: func @m16n8k4_tf32_f32_row_row_row
-func.func @m16n8k4_tf32_f32_row_row_row(%arg0: memref<20x20xf32, 3>, %arg1: memref<20x20xf32, 3>, %arg2: memref<20x20xf32>) {
+func.func @m16n8k4_tf32_f32_row_row_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -372,14 +372,14 @@ func.func @m16n8k4_tf32_f32_row_row_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
 
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
-  // CHECK: [[b_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[b_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[b_frag:%.+]] = vector.insert [[b_el]], {{.*}} : f32 into vector<1x1xf32>
 
   // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag]], [[c_frag]])
   // CHECK-SAME: mmaShape = [16, 8, 4]
   // CHECK-SAME: -> vector<2x2xf32>
-  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, 3>, vector<16x4xf32>
-  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, 3>, vector<8x4xf32>
+  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
+  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x4xf32>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x4xf32>, vector<8x4xf32> into vector<16x8xf32>
 
   // CHECK: vector.extract [[d_frag]][0] : vector<2x2xf32>
@@ -415,7 +415,7 @@ func.func @m16n8k4_tf32_f32_row_row_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
 // CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>
 
 // CHECK-LABEL: func @m16n8k8_tf32_f32_row_row_row
-func.func @m16n8k8_tf32_f32_row_row_row(%arg0: memref<20x20xf32, 3>, %arg1: memref<20x20xf32, 3>, %arg2: memref<20x20xf32>) {
+func.func @m16n8k8_tf32_f32_row_row_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -433,16 +433,16 @@ func.func @m16n8k8_tf32_f32_row_row_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
 
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
-  // CHECK: [[b_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[b_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[b_frag0:%.+]] = vector.insert [[b_el0]], {{.*}} : f32 into vector<2x1xf32>
-  // CHECK: [[b_el1:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[b_el1:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[b_frag1:%.+]] = vector.insert [[b_el1]], {{.*}} : f32 into vector<2x1xf32>
 
   // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag1]], [[c_frag]])
   // CHECK-SAME: mmaShape = [16, 8, 8]
   // CHECK-SAME: -> vector<2x2xf32>
-  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, 3>, vector<16x8xf32>
-  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, 3>, vector<8x8xf32>
+  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
+  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x8xf32>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x8xf32>, vector<8x8xf32> into vector<16x8xf32>
 
   // CHECK: vector.extract [[d_frag]][0] : vector<2x2xf32>
@@ -480,7 +480,7 @@ func.func @m16n8k8_tf32_f32_row_row_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
 // CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 8)>
 
 // CHECK-LABEL: func @m16n8k8_tf32_f32_col_col_row
-func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, 3>, %arg1: memref<20x20xf32, 3>, %arg2: memref<20x20xf32>) {
+func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
   %c0 = arith.constant 0 : index
   %c16 = arith.constant 16 : index
@@ -491,17 +491,17 @@ func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
 
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA0_map]]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
-  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[a_frag0:%.+]] = vector.insert [[a_el0]], {{.*}} [0, 0] : f32 into vector<4x1xf32>
 
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA8_map]]
   // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
-  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[a_frag0:%.+]] = vector.insert [[a_el0]], {{.*}} [1, 0] : f32 into vector<4x1xf32>
 
-  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[a_frag:%.+]] = vector.insert [[a_el]], {{.*}} [2, 0] : f32 into vector<4x1xf32>
-  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, 3>
+  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
   // CHECK: [[a_frag:%.+]] = vector.insert [[a_el]], {{.*}} [3, 0] : f32 into vector<4x1xf32>
 
   // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]
@@ -511,8 +511,8 @@ func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
   // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag]], [[c_frag]])
   // CHECK-SAME: mmaShape = [16, 8, 8]
   // CHECK-SAME: -> vector<2x2xf32>
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<20x20xf32, 3>, vector<16x8xf32>
-  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf32, 3>, vector<8x8xf32>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x8xf32>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
                         kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x8xf32>, vector<8x8xf32> into vector<16x8xf32>
 
@@ -551,7 +551,7 @@ func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, 3>, %arg1: memr
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 
 // CHECK-LABEL: func @m16n8k64_int4_row_col_row
-func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, 3>, %arg1: memref<128x128xi4, 3>, %arg2: memref<128x128xi32>) {
+func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi4, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
   %cst = arith.constant 0 : i4
   %cst0 = arith.constant 0 : i32
   %cst_0 = arith.constant dense<0> : vector<32x8xi4>
@@ -560,12 +560,12 @@ func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, 3>, %arg1: memref
   // CHECK: [[lane:%.+]] = gpu.lane_id
   // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]()[[[lane]]]
   // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]()[[[lane]]]
-  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi4, 3> -> vector<4x8xi4>
+  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi4, #gpu.address_space<workgroup>> -> vector<4x8xi4>
 
   // CHECK: [[lane:%.+]] = gpu.lane_id
   // CHECK: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]()[[[lane]]]
   // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]()[[[lane]]]
-  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi4, 3> -> vector<2x8xi4>
+  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi4, #gpu.address_space<workgroup>> -> vector<2x8xi4>
 
   // CHECK: [[lane:%.+]] = gpu.lane_id
   // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
@@ -577,8 +577,8 @@ func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, 3>, %arg1: memref
   // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
   // CHECK-NOT: vector.load
 
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, 3>, vector<16x64xi4>
-  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, 3>, vector<8x64xi4>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, #gpu.address_space<workgroup>>, vector<16x64xi4>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, #gpu.address_space<workgroup>>, vector<8x64xi4>
   %C = vector.transfer_read %arg2[%c0, %c0], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
   // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 64]} : (vector<4x8xi4>, vector<2x8xi4>, vector<2x2xi32>) -> vector<2x2xi32>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x64xi4>, vector<8x64xi4> into vector<16x8xi32>
@@ -622,7 +622,7 @@ func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, 3>, %arg1: memref
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 
 // CHECK-LABEL: func @m16n8k32_int8_row_col_row
-func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, 3>, %arg1: memref<128x128xi8, 3>, %arg2: memref<128x128xi32>) {
+func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
   %cst_0 = arith.constant dense<0> : vector<32x8xi8>
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0 : i8
@@ -631,12 +631,12 @@ func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, 3>, %arg1: memref
   // CHECK: [[lane:%.+]] = gpu.lane_id
   // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]()[[[lane]]]
   // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]()[[[lane]]]
-  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, 3> -> vector<4x4xi8>
+  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>
 
   // CHECK: [[lane:%.+]] = gpu.lane_id
   // CHECK: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]()[[[lane]]]
   // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]()[[[lane]]]
-  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi8, 3> -> vector<2x4xi8>
+  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<2x4xi8>
 
   // CHECK: [[lane:%.+]] = gpu.lane_id
   // CHECK: [[m_coord:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
@@ -647,8 +647,8 @@ func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, 3>, %arg1: memref
   // CHECK: vector.load %arg2[[[m_coord]], [[n_coord]]] : memref<128x128xi32>, vector<2xi32>
   // CHECK-NOT: vector.load %arg2{{.*}}
 
-  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, 3>, vector<16x32xi8>
-  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, 3>, vector<8x32xi8>
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<16x32xi8>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<8x32xi8>
   %C = vector.transfer_read %arg2[%c0, %c0], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
   // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
   %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>