mirror of
https://github.com/intel/llvm.git
synced 2026-01-24 08:30:34 +08:00
[MLIR][XeGPU] Refactor xegpu-wg-to-sg tests (#149204)
This PR refactors the xegpu-wg-to-sg.mlir tests to use larger shapes that more closely resemble workgroup-level programming.
This commit is contained in:
@@ -2,122 +2,117 @@
|
||||
|
||||
gpu.module @test_round_robin_assignment {
|
||||
// CHECK-LABEL: create_nd_tdesc
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
|
||||
// CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
|
||||
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-NOT: xegpu.create_nd_tdesc
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: load_nd_tdesc
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-12: xegpu.load_nd %{{.*}}
|
||||
// CHECK-SAME-COUNT-12: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME-COUNT-12: -> vector<2x2xf32>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-4: xegpu.load_nd %{{.*}}
|
||||
// CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-SAME-COUNT-4: -> vector<16x16xf32>
|
||||
// CHECK-NOT: xegpu.load_nd
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
-> vector<24x32xf32>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<256x128xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: store_nd
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @store_nd(%src: memref<24x32xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}}
|
||||
// CHECK-SAME-COUNT-12: : vector<2x2xf32>, !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @store_nd(%src: memref<256x128xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}}
|
||||
// CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-NOT: xegpu.store_nd
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
-> vector<24x32xf32>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<256x128xf32>
|
||||
xegpu.store_nd %load, %tdesc
|
||||
: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: update_nd
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @update_nd(%src: memref<24x32xf32>){
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16]
|
||||
// CHECK-SAME-COUNT-12: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @update_nd(%src: memref<256x128xf32>){
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16]
|
||||
// CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-NOT: xegpu.update_nd_offset
|
||||
%update = xegpu.update_nd_offset %tdesc, [0, 16]
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: dpas
|
||||
// CHECK-SAME: (%[[ARG_0:.*]]: memref<8x8xf32>, %[[ARG_1:.*]]: memref<8x8xf32>, %[[ARG_2:.*]]: memref<8x8xf32>)
|
||||
gpu.func @dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
|
||||
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
|
||||
// CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>)
|
||||
gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
|
||||
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16>
|
||||
// CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-NOT: xegpu.create_nd_tdesc
|
||||
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
|
||||
// CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-NOT: xegpu.create_nd_tdesc
|
||||
// CHECK-COUNT-4: xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}] : memref<8x8xf32>
|
||||
// CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16>
|
||||
// CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
|
||||
// CHECK-NOT: xegpu.create_nd_tdesc
|
||||
// CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}}
|
||||
// CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
// CHECK-SAME-COUNT-16: : vector<2x2xf32>, vector<2x2xf32> -> vector<2x2xf32>
|
||||
// CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
|
||||
// CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
|
||||
// CHECK-NOT: xegpu.dpas
|
||||
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<8x8xf32>
|
||||
-> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16>
|
||||
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%load_a = xegpu.load_nd %tdesc_a
|
||||
: !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
-> vector<8x8xf32>
|
||||
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<8x8xf32>
|
||||
-> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<256x128xf16>
|
||||
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16>
|
||||
-> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
|
||||
%load_b = xegpu.load_nd %tdesc_b
|
||||
: !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
-> vector<8x8xf32>
|
||||
%tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<8x8xf32>
|
||||
-> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
|
||||
-> vector<128x256xf16>
|
||||
%dpas = xegpu.dpas %load_a, %load_b
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
: vector<8x8xf32>, vector<8x8xf32> -> vector<8x8xf32>
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
|
||||
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: prefetch_nd_tdesc
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
|
||||
// CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}}
|
||||
// CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) {
|
||||
// CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}}
|
||||
// CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-NOT: xegpu.prefetch_nd
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
xegpu.prefetch_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: broadcast
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x1xf32>
|
||||
gpu.func @broadcast(%src: memref<24x1xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x1xf32>
|
||||
-> !xegpu.tensor_desc<24x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [2, 1], lane_layout = [2, 1], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<128x1xf32>
|
||||
gpu.func @broadcast(%src: memref<128x1xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32>
|
||||
-> !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [2, 1], lane_layout = [2, 1], lane_data = [1, 1]>>
|
||||
-> vector<24x1xf32>
|
||||
// CHECK-COUNT-3: vector.broadcast {{.*}}
|
||||
// CHECK-SAME-COUNT-3: {layout_result_0 = #xegpu.layout<lane_layout = [2, 1], lane_data = [1, 1]>}
|
||||
// CHECK-SAME-COUNT-3: : vector<2x1xf32> to vector<2x4xf32>
|
||||
: !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
|
||||
-> vector<128x1xf32>
|
||||
// CHECK-COUNT-2: vector.broadcast {{.*}}
|
||||
// CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
|
||||
// CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32>
|
||||
// CHECK-NOT: vector.broadcast
|
||||
%broadcast = vector.broadcast %load
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [2, 4], lane_layout = [2, 1], lane_data = [1, 1]>}
|
||||
: vector<24x1xf32> to vector<24x8xf32>
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
|
||||
: vector<128x1xf32> to vector<128x64xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
|
||||
@@ -4,201 +4,181 @@
|
||||
//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
|
||||
gpu.module @test_1_1_assignment {
|
||||
// CHECK-LABEL: create_nd_tdesc
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
|
||||
// CHECK: %[[SGID:.*]] = gpu.subgroup_id
|
||||
// CHECK: %[[C12:.*]] = arith.constant 12 : index
|
||||
// CHECK: %[[C4:.*]] = arith.constant 4 : index
|
||||
// CHECK: %[[C8:.*]] = arith.constant 8 : index
|
||||
// CHECK: %[[C32:.*]] = arith.constant 32 : index
|
||||
// CHECK: %[[C4:.*]] = arith.constant 4 : index
|
||||
// CHECK: %[[C32_0:.*]] = arith.constant 32 : index
|
||||
// CHECK: %[[C4_1:.*]] = arith.constant 4 : index
|
||||
// CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]]
|
||||
// CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
|
||||
// CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
|
||||
// CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
|
||||
// CHECK: %[[C24:.*]] = arith.constant 24 : index
|
||||
// CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
|
||||
// CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C32]]
|
||||
// CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C32_0]]
|
||||
// CHECK: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
|
||||
// CHECK: %[[C32:.*]] = arith.constant 32 : index
|
||||
// CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
|
||||
// CHECK: %[[C0_1:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK: %[[C256:.*]] = arith.constant 256 : index
|
||||
// CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C256]]
|
||||
// CHECK: %[[C0_2:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0_2]]
|
||||
// CHECK: %[[C0_3:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[C128:.*]] = arith.constant 128 : index
|
||||
// CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C128]]
|
||||
// CHECK: %[[C0_4:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_4]]
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<256x128xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK: gpu.return
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: load_nd_tdesc
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<12x8xf32>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<32x32xf32>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
-> vector<24x32xf32>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<256x128xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: store_nd
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @store_nd(%src: memref<24x32xf32>) {
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @store_nd(%src: memref<256x128xf32>) {
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<12x8xf32>
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<32x32xf32>
|
||||
// CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]]
|
||||
// CHECK-SAME: : vector<12x8xf32>, !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
-> vector<24x32xf32>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<256x128xf32>
|
||||
xegpu.store_nd %load, %tdesc
|
||||
: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: update_nd
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @update_nd(%src: memref<24x32xf32>){
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @update_nd(%src: memref<256x128xf32>){
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%update = xegpu.update_nd_offset %tdesc, [0, 16]
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: dpas
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
// CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
|
||||
gpu.func @dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
|
||||
// CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<12x8xf32>
|
||||
// CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG_1]][{{%.*}}, {{%.*}}] : memref<32x24xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
// CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<8x12xf32>
|
||||
// CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]]
|
||||
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
// CHECK-SAME: : vector<12x8xf32>, vector<8x12xf32> -> vector<12x12xf32>
|
||||
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
|
||||
// CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
|
||||
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
|
||||
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%load_a = xegpu.load_nd %tdesc_a
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
-> vector<24x32xf32>
|
||||
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32>
|
||||
-> !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 12], lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<128x128xf16>
|
||||
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
|
||||
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
|
||||
%load_b = xegpu.load_nd %tdesc_b
|
||||
: !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 12], lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
-> vector<32x24xf32>
|
||||
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
|
||||
-> vector<128x128xf16>
|
||||
%dpas = xegpu.dpas %load_a, %load_b
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
: vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
|
||||
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
|
||||
// CHECK-LABEL: dpas_no_sg_data
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
// CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
|
||||
gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
|
||||
// CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<12x8xf32>
|
||||
// CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG_1]][{{%.*}}, {{%.*}}] : memref<32x24xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
// CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: -> vector<8x12xf32>
|
||||
// CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]]
|
||||
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
// CHECK-SAME: : vector<12x8xf32>, vector<8x12xf32> -> vector<12x12xf32>
|
||||
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
|
||||
// CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
|
||||
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
|
||||
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
|
||||
order = [1, 0]>>
|
||||
%load_a = xegpu.load_nd %tdesc_a
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
-> vector<24x32xf32>
|
||||
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32>
|
||||
-> !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
|
||||
order = [1, 0]>>
|
||||
-> vector<128x128xf16>
|
||||
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
|
||||
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
|
||||
order = [1, 0]>>
|
||||
%load_b = xegpu.load_nd %tdesc_b
|
||||
: !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
|
||||
-> vector<32x24xf32>
|
||||
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
|
||||
order = [1, 0]>>
|
||||
-> vector<128x128xf16>
|
||||
%dpas = xegpu.dpas %load_a, %load_b
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
: vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
|
||||
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: prefetch_nd_tdesc
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
|
||||
gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
|
||||
gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) {
|
||||
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
|
||||
// CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
// CHECK: xegpu.prefetch_nd %[[TDESC]]
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
|
||||
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
|
||||
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
xegpu.prefetch_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
|
||||
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: dpas_with_no_create_nd_desc
|
||||
gpu.func @dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
|
||||
// CHECK-NOT: vector<12x12xf32>
|
||||
gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
|
||||
// CHECK-NOT: vector<32x32xf32>
|
||||
%dpas = xegpu.dpas %a, %b
|
||||
{layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
|
||||
: vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
|
||||
: vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: broadcast_dim1
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<24x1xf32>
|
||||
gpu.func @broadcast_dim1(%src: memref<24x1xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x1xf32>
|
||||
-> !xegpu.tensor_desc<24x1xf32, #xegpu.layout<sg_layout = [2, 1], sg_data = [12, 1], lane_layout = [2, 1], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32>
|
||||
gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x1xf32>
|
||||
-> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<24x1xf32, #xegpu.layout<sg_layout = [2, 1], sg_data = [12, 1], lane_layout = [2, 1], lane_data = [1, 1]>>
|
||||
-> vector<24x1xf32>
|
||||
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 1], lane_data = [1, 1]>}
|
||||
// CHECK-SAME: : vector<12x1xf32> to vector<12x8xf32>
|
||||
%broadcast = vector.broadcast %load
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [2, 1], sg_data = [12, 8], lane_layout = [2, 1], lane_data = [1, 1]>}
|
||||
: vector<24x1xf32> to vector<24x8xf32>
|
||||
: !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
|
||||
-> vector<256x1xf32>
|
||||
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
|
||||
// CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32>
|
||||
%broadcast = vector.broadcast %load
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
|
||||
: vector<256x1xf32> to vector<256x32xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: broadcast_dim0
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<1x32xf32>
|
||||
gpu.func @broadcast_dim0(%src: memref<1x32xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<1x32xf32>
|
||||
-> !xegpu.tensor_desc<1x32xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 8], lane_layout = [1, 8], lane_data = [1, 1]>>
|
||||
// CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32>
|
||||
gpu.func @broadcast_dim0(%src: memref<1x128xf32>) {
|
||||
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<1x128xf32>
|
||||
-> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
%load = xegpu.load_nd %tdesc
|
||||
: !xegpu.tensor_desc<1x32xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 8], lane_layout = [1, 8], lane_data = [1, 1]>>
|
||||
-> vector<1x32xf32>
|
||||
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 1]>}
|
||||
// CHECK-SAME: : vector<1x8xf32> to vector<12x8xf32>
|
||||
: !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
|
||||
-> vector<1x128xf32>
|
||||
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
|
||||
// CHECK-SAME: : vector<1x32xf32> to vector<32x32xf32>
|
||||
%broadcast = vector.broadcast %load
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [12, 8], lane_layout = [1, 8], lane_data = [1, 1]>}
|
||||
: vector<1x32xf32> to vector<12x32xf32>
|
||||
{layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
|
||||
: vector<1x128xf32> to vector<32x128xf32>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user