[NVPTX] Add im2colw/w128 modes support to TMA intrinsics (#148863)

This patch adds support for the im2col-w/w128 and scatter/gather modes for the
TMA Copy and Prefetch intrinsics, completing support for all the available
modes. These are lowered through tablegen, building on top of earlier patches.

* lit tests are added for all the combinations and verified with a 12.8
  ptxas executable.
* Documentation is updated in the NVPTXUsage.rst file.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
@@ -1072,6 +1072,8 @@ Syntax:
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)

Overview:
"""""""""

@@ -1082,7 +1084,13 @@ global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
in ``tile`` mode. In tile mode, the multi-dimensional layout of the
source tensor is preserved at the destination. The dimension of the
tensor data ranges from 1d to 5d with the coordinates specified
-by the ``i32 %d0 ... i32 %d4`` arguments.
+by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode,
+four rows in a 2D tensor are combined to form a single 2D destination
+tensor. The first coordinate ``i32 %x0`` denotes the column index
+followed by four coordinates indicating the four row-indices.
+So, this mode takes a total of 5 coordinates as input arguments.
+For more information on ``gather4`` mode, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.

* The last three arguments to these intrinsics are flags
  indicating support for multicast, cache_hint and cta_group::1/2
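
Illustrative sketch (not part of the diff): a minimal call to the new
``tile.gather4`` intrinsic, assuming a hypothetical wrapper function. The
multicast and cache-hint flags are 0, so the ``i16``/``i64`` hint operands are
ignored; the trailing ``i32 0`` is assumed to request the plain (no
``cta_group``) variant.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7), ptr addrspace(3), ptr, i32, i32, i32, i32, i32, i16, i64, i1, i1, i32)

  define void @gather4_sketch(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
                              i32 %col, i32 %row0, i32 %row1, i32 %row2, i32 %row3) {
    ; one column index followed by four row indices (5 coordinates in total)
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(
        ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
        i32 %col, i32 %row0, i32 %row1, i32 %row2, i32 %row3,
        i16 0, i64 0, i1 0, i1 0, i32 0)
    ret void
  }
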
@@ -1116,10 +1124,18 @@ Syntax:

.. code-block:: llvm

-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

Overview:
"""""""""

@@ -1131,10 +1147,105 @@ in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
are unrolled into a single dimensional column at the destination. In this
mode, the tensor has to be at least three-dimensional. Along with the tensor
coordinates, im2col offsets are also specified (denoted by
-``i16 %im2col0...i16 %im2col2``). The number of im2col offsets is two less
-than the number of dimensions of the tensor operation. The last three arguments
-to these intrinsics are flags, with the same functionality as described
-in the ``tile`` mode intrinsics above.
+``i16 %im2col0...i16 %im2col2``). For the ``im2col`` mode, the number of offsets
+is two less than the number of dimensions of the tensor operation. For the
+``im2col.w`` and ``im2col.w.128`` modes, the number of offsets is always 2,
+denoted by ``i16 %wHalo`` and ``i16 %wOffset`` arguments. For more information
+on ``im2col.w`` and ``im2col.w.128`` modes, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+The last three arguments to these intrinsics are flags, with the same functionality
+as described in the ``tile`` mode intrinsics above.

For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
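
Illustrative sketch (not part of the diff) for the 3D ``im2col.w`` variant, with
a hypothetical wrapper and all optional hints disabled; note the fixed pair of
``i16`` offsets regardless of dimensionality.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7), ptr addrspace(3), ptr, i32, i32, i32, i16, i16, i16, i64, i1, i1, i32)

  define void @im2col_w_sketch(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
                               i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset) {
    ; exactly two im2col offsets (%wHalo, %wOffset); flags all 0 -> no multicast,
    ; no cache hint, plain (no cta_group) variant
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(
        ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
        i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset,
        i16 0, i64 0, i1 0, i1 0, i32 0)
    ret void
  }
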

'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*``
set of PTX instructions. These instructions initiate an asynchronous
copy of tensor data from global memory to shared::cta memory in
``tile`` mode. In tile mode, the multi-dimensional layout of the
source tensor is preserved at the destination. The dimension of the
tensor data ranges from 1d to 5d with the coordinates specified
by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode,
four rows in a 2D tensor are combined to form a single 2D destination
tensor. The first coordinate ``i32 %x0`` denotes the column index
followed by four coordinates indicating the four row-indices.
So, this mode takes a total of 5 coordinates as input arguments.
For more information on ``gather4`` mode, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.

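Illustrative sketch (not part of the diff) of the cache-hint flag on the
simplest 1D variant, with a hypothetical wrapper: the same intrinsic lowers to
the ``.L2::cache_hint`` form when the flag is 1 and to the plain form when it
is 0.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3), ptr addrspace(3), ptr, i32, i64, i1)

  define void @cta_tile_1d_sketch(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch) {
    ; i1 1 -> %ch is a valid cache hint (.L2::cache_hint variant)
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 1)
    ; i1 0 -> %ch is ignored (plain variant)
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 0, i1 0)
    ret void
  }
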
'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*``
set of PTX instructions. These instructions initiate an asynchronous copy
of tensor data from global memory to shared::cta memory in ``im2col`` mode.
In im2col mode, some dimensions of the source tensor are unrolled into a
single dimensional column at the destination. In this mode, the tensor has
to be at least three-dimensional. Along with the tensor coordinates, im2col
offsets are also specified (denoted by ``i16 %im2col0...i16 %im2col2``).
For the ``im2col`` mode, the number of offsets is two less than the number
of dimensions of the tensor operation. For the ``im2col.w`` and ``im2col.w.128``
modes, the number of offsets is always 2, denoted by ``i16 %wHalo`` and
``i16 %wOffset`` arguments. For more information on ``im2col.w`` and
``im2col.w.128`` modes, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
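
Illustrative sketch (not part of the diff), mirroring the pattern exercised by
the lit tests added below; the wrapper name is hypothetical.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3), ptr addrspace(3), ptr, i32, i32, i32, i16, i16, i64, i1)

  define void @cta_im2col_w_sketch(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap,
                                   i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) {
    ; flag set -> emits the .L2::cache_hint form of the instruction
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(
        ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap,
        i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
    ret void
  }
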
@@ -1153,6 +1264,8 @@ Syntax:
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)

Overview:
"""""""""

@@ -1162,6 +1275,12 @@ These instructions initiate an asynchronous copy of tensor data from
shared::cta to global memory (indicated by the ``s2g`` prefix)
in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d
with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
In ``tile.scatter4`` mode, a single 2D source tensor is divided into
four rows in the 2D destination tensor. The first coordinate ``i32 %x0``
denotes the column index followed by four coordinates indicating the
four row-indices. So, this mode takes a total of 5 coordinates as input arguments.
For more information on ``scatter4`` mode, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
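
Illustrative sketch (not part of the diff) of a ``tile.scatter4`` store back to
global memory, with a hypothetical wrapper and the cache hint disabled.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3), ptr, i32, i32, i32, i32, i32, i64, i1)

  define void @scatter4_sketch(ptr addrspace(3) %src, ptr %tmap,
                               i32 %col, i32 %row0, i32 %row1, i32 %row2, i32 %row3) {
    ; flag is 0, so the i64 cache-hint operand is ignored
    call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(
        ptr addrspace(3) %src, ptr %tmap,
        i32 %col, i32 %row0, i32 %row1, i32 %row2, i32 %row3, i64 0, i1 0)
    ret void
  }
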
@@ -1214,6 +1333,8 @@ Syntax:
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)

Overview:
"""""""""

@@ -1225,6 +1346,13 @@ multi-dimensional layout of the source tensor is preserved at the destination.
The dimension of the tensor data ranges from 1d to 5d with the coordinates
specified by the ``i32 %d0 ... i32 %d4`` arguments.

In ``tile.gather4`` mode, four rows in the 2-dimensional source tensor are
fetched to the L2 cache. The first coordinate ``i32 %x0`` denotes the column index
followed by four coordinates indicating the four row-indices. So, this mode takes
a total of 5 coordinates as input arguments.
For more information on ``gather4`` mode, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
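
Illustrative sketch (not part of the diff) of a ``tile.gather4`` prefetch into
the L2 cache, with a hypothetical wrapper and a valid cache hint.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr, i32, i32, i32, i32, i32, i64, i1)

  define void @prefetch_gather4_sketch(ptr %tmap, i32 %col, i32 %row0, i32 %row1, i32 %row2, i32 %row3, i64 %ch) {
    ; flag = 1 -> %ch is a valid cache hint (.L2::cache_hint variant)
    call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(
        ptr %tmap, i32 %col, i32 %row0, i32 %row1, i32 %row2, i32 %row3, i64 %ch, i1 1)
    ret void
  }
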
@@ -1246,6 +1374,14 @@ Syntax:
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

Overview:
"""""""""

@@ -1256,9 +1392,16 @@ of tensor data from global memory to the L2 cache. In im2col mode, some
dimensions of the source tensor are unrolled into a single dimensional
column at the destination. In this mode, the tensor has to be at least
three-dimensional. Along with the tensor coordinates, im2col offsets are
-also specified (denoted by ``i16 %im2col0...i16 %im2col2``). The number
-of im2col offsets is two less than the number of dimensions of the tensor
-operation. The last argument to these intrinsics is a boolean flag, with
+also specified (denoted by ``i16 %im2col0...i16 %im2col2``). For ``im2col``
+mode, the number of offsets is two less than the number of dimensions of
+the tensor operation. For the ``im2col.w`` and ``im2col.w.128`` modes,
+the number of offsets is always 2, denoted by ``i16 %wHalo`` and
+``i16 %wOffset`` arguments. For more information on ``im2col.w`` and
+``im2col.w.128`` modes, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.

The last argument to these intrinsics is a boolean flag, with
the same functionality as described in the ``tile`` mode intrinsics above.

For more information, refer PTX ISA

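Illustrative sketch (not part of the diff) for the 3D ``im2col.w`` prefetch,
with a hypothetical wrapper and no cache hint; as above, exactly two ``i16``
offsets are passed regardless of dimensionality.

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr, i32, i32, i32, i16, i16, i64, i1)

  define void @prefetch_im2col_w_sketch(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset) {
    ; flag is 0, so the i64 cache-hint operand is ignored
    call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(
        ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 0, i1 0)
    ret void
  }
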
@@ -2024,9 +2024,7 @@ foreach dim = 1...5 in {
                    tensor_dim_args,     // actual tensor dims
                    [llvm_i64_ty]),      // cache_hint
                   [llvm_i1_ty],         // Flag for cache_hint
-                  [IntrConvergent,
-                   ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-                   NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+                  [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;

// Intrinsics for TMA Copy with reduction
foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in
@@ -2037,18 +2035,31 @@ foreach dim = 1...5 in {
                    tensor_dim_args,     // actual tensor dims
                    [llvm_i64_ty]),      // cache_hint
                   [llvm_i1_ty],         // Flag for cache_hint
-                  [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-                   NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+                  [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
  }
}

// TMA S2G tile::scatter4
def int_nvvm_cp_async_bulk_tensor_s2g_tile_scatter4_2d
  : DefaultAttrsIntrinsicFlags<[],
      !listconcat([llvm_shared_ptr_ty,        // src_smem_ptr
                   llvm_ptr_ty],              // tensormap_ptr
                  !listsplat(llvm_i32_ty, 5), // dims
                  [llvm_i64_ty]),             // cache_hint
      [llvm_i1_ty],                           // Flag for cache_hint
      [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;

// TMA Tensor Copy Intrinsics: G2S -> From Global to Shared memory variants
foreach dim = 1...5 in {
  defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim);

-  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col", "im2col_w", "im2col_w_128"], ["tile"]) in {
    defvar is_im2col = !eq(mode, "im2col");
-    defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0);
+    defvar is_im2colw = !or(!eq(mode, "im2col_w"), !eq(mode, "im2col_w_128"));
+
+    // For im2col_w/w128 modes, the num_offsets is always 2.
+    // For im2col mode, the num_offsets is (dim - 2).
+    defvar num_im2col_offsets = !if(is_im2colw, 2, !if(is_im2col, !add(dim, -2), 0));
    defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets);

    defvar g2s_params = !listconcat(
@@ -2079,11 +2090,60 @@ foreach dim = 1...5 in {
                    im2col_offsets_args, // im2col offsets
                    [llvm_i64_ty]),      // cache_hint
                   [llvm_i1_ty],         // Flag for cache_hint
-                  [IntrConvergent,
-                   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+                  [IntrConvergent, ReadOnly<ArgIndex<0>>]>;

    def int_nvvm_cp_async_bulk_tensor_g2s_cta_ # mode # _ # dim # d :
      DefaultAttrsIntrinsicFlags<[],
        !listconcat([llvm_shared_ptr_ty,  // dst_ptr
                     llvm_shared_ptr_ty,  // mbarrier_ptr
                     llvm_ptr_ty],        // tensormap_ptr
                    tensor_dim_args,      // actual tensor dims
                    im2col_offsets_args,  // im2col offsets
                    [llvm_i64_ty]),       // cache_hint
        [llvm_i1_ty],                     // Flag for cache_hint
        [IntrConvergent, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
  }
}

// TMA copy for tile::gather4
def int_nvvm_cp_async_bulk_tensor_g2s_tile_gather4_2d
  : DefaultAttrsIntrinsicFlags<[],
      !listconcat(
        [llvm_shared_cluster_ptr_ty,  // dst_shared_cluster_ptr
         llvm_shared_ptr_ty,          // mbarrier_ptr
         llvm_ptr_ty],                // tensormap_ptr
        !listsplat(llvm_i32_ty, 5),   // co-ordinates
        [llvm_i16_ty,                 // cta_mask
         llvm_i64_ty]),               // cache_hint
      [llvm_i1_ty,                    // Flag for cta_mask
       llvm_i1_ty,                    // Flag for cache_hint
       llvm_i32_ty],                  // Flag for cta_group
      [IntrConvergent,
       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
       // Allowed values for cta_group are {0,1,2} i.e [0, 3).
       Range<ArgIndex<12>, 0, 3>]>;

def int_nvvm_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d
  : DefaultAttrsIntrinsicFlags<[],
      !listconcat(
        [llvm_shared_ptr_ty,          // dst_shared_ptr
         llvm_shared_ptr_ty,          // mbarrier_ptr
         llvm_ptr_ty],                // tensormap_ptr
        !listsplat(llvm_i32_ty, 5),   // co-ordinates
        [llvm_i64_ty]),               // cache_hint
      [llvm_i1_ty],                   // Flag for cache_hint
      [IntrConvergent,
       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;

// TMA prefetch for tile::gather4
def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d
  : DefaultAttrsIntrinsicFlags<[],
      !listconcat([llvm_ptr_ty],              // tensormap_ptr
                  !listsplat(llvm_i32_ty, 5), // co-ordinates
                  [llvm_i64_ty]),             // cache_hint
      [llvm_i1_ty],                           // Flag for cache_hint
      [IntrConvergent, ReadOnly<ArgIndex<0>>]>;

// Intrinsics for Prefetch and Prefetchu
let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in {
  foreach level = ["L1", "L2"] in {

@@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;

class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;

@@ -600,12 +600,23 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>;
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------

-class TMA_DIMS_UTIL<int dim> {
+class TMA_DIMS_UTIL<int dim, string mode = ""> {
  // For example, when 'dim' is 3, this generates:
  // an ins_dag: B32:$d0, B32:$d1, B32:$d2
  // with base_str: $d0, $d1, $d2
  dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
  string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");

+  // Tile::Gather4/scatter4 actually operate on a 2D tensor,
+  // though they take 5 co-ordinates.
+  //
+  // The scatter-gather happens over 4 rows with a fixed
+  // column-index. The first co-ordinate represents the
+  // col-index followed by four row-indices.
+  int num_dims = !cond(
+    !eq(mode, "tile_scatter4") : 2,
+    !eq(mode, "tile_gather4")  : 2,
+    true : dim); // for all other modes
}

class TMA_IM2COL_UTIL<int dim, string mode> {
@@ -692,14 +703,138 @@ foreach dim = [1, 2, 3, 4, 5] in {
  }
}

multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
  defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";

  defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
  defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
  defvar asm_str = !if(!empty(im2col_str),
                       asm_str_base,
                       asm_str_base # ", {{" # im2col_str # "}}");

  defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
  defvar inst_name = "cp.async.bulk.tensor"
                     # "." # dim_val # "d"
                     # "." # "shared::cluster.global"
                     # "." # !subst("_", "::", mode)
                     # "." # "mbarrier::complete_tx::bytes";
  defvar intr = !cast<Intrinsic>(
                  "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim_val # "d");

  defvar ins_dag = !con(
    (ins ADDR:$dst, ADDR:$mbar, B64:$tmap),
    dims_dag, im2col_dag,
    (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg));

  defvar intr_dag_base = !con(
    (intr addr:$dst, addr:$mbar, B64:$tmap),
    !setdagop(dims_dag, intr),
    !setdagop(im2col_dag, intr),
    (intr B16:$mc, B64:$ch));
  defvar intr_dag_no_hints   = !con(intr_dag_base, (intr 0, 0, timm:$cg));
  defvar intr_dag_with_mc    = !con(intr_dag_base, (intr -1, 0, timm:$cg));
  defvar intr_dag_with_ch    = !con(intr_dag_base, (intr 0, -1, timm:$cg));
  defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));

  def "" : NVPTXInst<(outs), ins_dag,
                     inst_name # asm_str # ";",
                     [intr_dag_no_hints]>,
           Requires<pred>;
  def _MC : NVPTXInst<(outs), ins_dag,
                      inst_name # ".multicast::cluster" # asm_str # ", $mc;",
                      [intr_dag_with_mc]>,
            Requires<pred>;
  def _CH : NVPTXInst<(outs), ins_dag,
                      inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
                      [intr_dag_with_ch]>,
            Requires<pred>;
  def _MC_CH : NVPTXInst<(outs), ins_dag,
                         inst_name # ".multicast::cluster.L2::cache_hint" # asm_str # ", $mc, $ch;",
                         [intr_dag_with_mc_ch]>,
               Requires<pred>;
}
foreach dim = 3...5 in {
  foreach mode = ["im2col_w", "im2col_w_128"] in {
    defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
      : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
  }
}
defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
                                                   [hasTMACTAGroupSupport]>;

multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
  defvar asm_str_base = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";

  defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
  defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
  defvar asm_str = !if(!empty(im2col_str),
                       asm_str_base,
                       asm_str_base # ", {{" # im2col_str # "}}");

  defvar ins_dag = !con(
    (ins ADDR:$dst, ADDR:$mbar, B64:$tmap),
    dims_dag, im2col_dag,
    (ins B64:$ch));

  defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
  defvar intr = !cast<Intrinsic>(
                  "int_nvvm_cp_async_bulk_tensor_g2s_cta_" # mode # "_" # dim_val # "d");
  defvar intr_dag = !con(
    (intr addr:$dst, addr:$mbar, B64:$tmap),
    !setdagop(dims_dag, intr),
    !setdagop(im2col_dag, intr),
    (intr B64:$ch, 0));
  defvar intr_dag_with_ch = !con(
    (intr addr:$dst, addr:$mbar, B64:$tmap),
    !setdagop(dims_dag, intr),
    !setdagop(im2col_dag, intr),
    (intr B64:$ch, -1));
  defvar inst_name = "cp.async.bulk.tensor"
                     # "." # dim_val # "d"
                     # "." # "shared::cta.global"
                     # "." # !subst("_", "::", mode)
                     # "." # "mbarrier::complete_tx::bytes";

  def "" : NVPTXInst<(outs), ins_dag,
                     inst_name # asm_str # ";",
                     [intr_dag]>,
           Requires<pred>;
  def _CH : NVPTXInst<(outs), ins_dag,
                      inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
                      [intr_dag_with_ch]>,
            Requires<pred>;
}
foreach dim = 1...5 in {
  defm TMA_G2S_CTA_TILE_ # dim # "D"
    : TMA_TENSOR_G2S_CTA_INTR<dim, "tile", [hasPTX<86>, hasSM<90>]>;
}
foreach dim = 3...5 in {
  defm TMA_G2S_CTA_IM2COL_ # dim # "D"
    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col", [hasPTX<86>, hasSM<90>]>;

  defm TMA_G2S_CTA_IM2COL_W_ # dim # "D"
    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;

  defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
}
defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
                                                           [hasPTX<86>, hasSM<100>]>;

multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
                               list<Predicate> pred = [hasPTX<80>, hasSM<90>]> {
  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
  defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";

  defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
  defvar intr = !cast<Intrinsic>(
-                  "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d);
+                  "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim_val # "d");

  defvar intr_dag = !con((intr addr:$src, B64:$tmap),
                         !setdagop(dims_dag, intr),
                         (intr B64:$ch, 0));
@@ -707,11 +842,13 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
                         !setdagop(dims_dag, intr),
                         (intr B64:$ch, -1));

-  // For im2col mode, the actual asm_str is "im2col_no_offs"
-  defvar mode_asm_str = !if(!eq(mode, "im2col"),
-                            "im2col_no_offs", mode);
+  // Fix-up the asm_str when it is im2col/scatter4.
+  defvar mode_asm_str = !cond(
+    !eq(mode, "im2col")        : "im2col_no_offs",
+    !eq(mode, "tile_scatter4") : "tile::scatter4",
+    true                       : mode);
  defvar prefix = "cp.async.bulk.tensor"
-                  # "." # dim # "d"
+                  # "." # dim_val # "d"
                  # ".global.shared::cta"
                  # "." # mode_asm_str
                  # ".bulk_group";
@@ -729,10 +866,12 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
}
foreach dim = 1...5 in {
  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
-    defvar suffix = !toupper(mode) # "_" # dim # D;
+    defvar suffix = !toupper(mode) # "_" # dim # "D";
    defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR<dim, mode>;
  }
}
defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
                                                    [hasTMACTAGroupSupport]>;

def TMAReductionFlags : Operand<i32> {
  let PrintMethod = "printTmaReductionMode";
@@ -786,13 +925,14 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
                       asm_str_base,
                       asm_str_base # ", {{" # im2col_str # "}}");

  defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
  defvar inst_name = "cp.async.bulk.prefetch.tensor"
-                     # "." # dim # "d"
+                     # "." # dim_val # "d"
                     # "." # "L2.global"
-                     # "." # mode;
+                     # "." # !subst("_", "::", mode);

  defvar intr = !cast<Intrinsic>(
-                  "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d);
+                  "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim_val # "d");

  defvar ins_dag = !con((ins B64:$tmap),
                        dims_dag,
@@ -818,10 +958,19 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
}
foreach dim = 1...5 in {
  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
-    defvar suffix = !toupper(mode) # "_" # dim # D;
+    defvar suffix = !toupper(mode) # "_" # dim # "D";
    defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode>;
  }
}
foreach dim = 3...5 in {
  foreach mode = ["im2col_w", "im2col_w_128"] in {
    defvar suffix = !toupper(mode) # "_" # dim # "D";
    defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
                                                            [hasTMACTAGroupSupport]>;
  }
}
defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
                                                              [hasTMACTAGroupSupport]>;

//Prefetch and Prefetchu

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll (new file, 195 lines)
@@ -0,0 +1,195 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100 %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2];
; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3];
; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4];
; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8];
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT:    ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(
; CHECK-PTX-SHARED32:       {
; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8];
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT:    ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2];
; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3];
; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4];
; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5];
; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9];
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT:    ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(
; CHECK-PTX-SHARED32:       {
; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9];
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT:    ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2];
; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3];
; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4];
; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5];
; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6];
; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10];
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT:    ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(
; CHECK-PTX-SHARED32:       {
; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10];
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT:    ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2];
; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3];
; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4];
; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5];
; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6];
; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8];
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT:    ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(
; CHECK-PTX-SHARED32:       {
; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8];
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT:    ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0)
  ret void
}

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll (new file, 152 lines)
@@ -0,0 +1,152 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2];
; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3];
; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4];
; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8];
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT:    ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(
; CHECK-PTX-SHARED32:       {
; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8];
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT:    ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2];
; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3];
; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4];
; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5];
; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7];
; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8];
; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9];
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT:    ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(
; CHECK-PTX-SHARED32:       {
; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8];
; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9];
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT:    ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
|
||||
ret void
|
||||
}

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll
@@ -0,0 +1,353 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1);

define void @cp_async_bulk_tensor_g2s_cta_tile_1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 0)
ret void
}

define void @cp_async_bulk_tensor_g2s_cta_tile_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<3>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0)
ret void
}

define void @cp_async_bulk_tensor_g2s_cta_tile_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 0)
ret void
}

define void @cp_async_bulk_tensor_g2s_cta_tile_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 0)
ret void
}

define void @cp_async_bulk_tensor_g2s_cta_tile_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0)
ret void
}

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 %f1);

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 0)
ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 0)
ret void
}

define void @test_cp_async_bulk_tensor_g2s_cta_im2col_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 0)
ret void
}

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
@@ -0,0 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);

; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d
define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)

ret void
}

; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1
define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)

ret void
}

; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2
define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)

ret void
}

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
@@ -0,0 +1,524 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
|
||||
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
|
||||
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
|
||||
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
|
||||
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
|
||||
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
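; Operand order for the im2col.w intrinsics: shared::cluster destination, mbarrier,
; tensor-map pointer, tensor coordinates, the {wHalo, wOffset} i16 pair, the i16
; multicast mask, the i64 cache hint, and three immediate flags
; (i1 multicast, i1 cache-hint, i32 cta_group).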

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d
define void @cp_async_bulk_tensor_g2s_im2colw_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
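; Flag tail per call: {i1 multicast, i1 cache-hint} over all four combinations,
; with the cta_group flag left at 0 (no cta_group suffix in the PTX above).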
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1
define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2
define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d
define void @cp_async_bulk_tensor_g2s_im2colw_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1
define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2
define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d
define void @cp_async_bulk_tensor_g2s_im2colw_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1
define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)

ret void
}

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2
define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)

ret void
}
llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll (new file, 524 lines)
@@ -0,0 +1,524 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
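; Same operand layout as the im2col.w intrinsics in the previous file; the
; .128 variants lower to the im2col::w::128 PTX mode checked below.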

; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d
define void @cp_async_bulk_tensor_g2s_im2colw_128_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2
|
||||
define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) {
|
||||
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2(
|
||||
; CHECK-PTX64: {
|
||||
; CHECK-PTX64-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2(
|
||||
; CHECK-PTX-SHARED32: {
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
|
||||
|
||||
ret void
|
||||
}
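; Summary note (not part of the autogenerated checks): as exercised across the
; calls above, the three trailing operands select the emitted PTX variant. The
; first i1 toggles .multicast::cluster (paired with the i16 %mc operand), the
; second i1 toggles .L2::cache_hint (paired with the i64 %ch operand), and the
; i32 picks the cta_group suffix: 0 emits none, 1 emits .cta_group::1, and
; 2 emits .cta_group::2.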
llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll (new file, 171 lines)
@@ -0,0 +1,171 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1);
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1);
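; Note (summary, not part of the autogenerated checks): in the prefetch
; intrinsics above, the single trailing i1 selects the .L2::cache_hint variant
; (paired with the i64 %ch operand); the tests below exercise both settings.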

define void @test_cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_3d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT: .reg .b32 %r<4>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2;
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_3d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)

  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  ret void
}
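; Note (summary, not part of the autogenerated checks): as the checks above
; show, the ".w" intrinsics lower to the im2col::w PTX mode and the ".w.128"
; intrinsics to im2col::w::128; the operand layout is otherwise identical.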
|
||||
|
||||
define void @test_cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_4d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT: .reg .b32 %r<5>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8];
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_4d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)

  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  ret void
}

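; Note how the autogenerated register declarations track the coordinate
; count: the 5d test below reserves .reg .b32 %r<6> for five i32 coordinates,
; versus %r<5> for 4d and %r<4> for 3d.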
define void @test_cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) {
; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_5d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2;
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_5d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)

  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1)
  ret void
}

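; The tile::gather4 prefetch intrinsic below takes five i32 coordinates and
; an i64 cache-hint operand; as in the tests above, the trailing i1 selects
; the plain versus .L2::cache_hint lowering. By analogy with the
; tile::scatter4 mnemonic in the s2g test that follows, this presumably
; lowers to the tile::gather4 form of cp.async.bulk.prefetch.tensor.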
define void @test_cp_async_bulk_tensor_prefetch_tile_gather4_2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
  ret void
}

52 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll Normal file
@@ -0,0 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag);

; CHECK-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d
define void @cp_async_bulk_tensor_s2g_tile_scatter4_2d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<6>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0)
  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)

  ret void
}
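; A minimal usage sketch, illustrative only and not part of the checked-in
; test: a hypothetical ptx_kernel that scatters four rows of a 2D tile from
; shared::cta memory to global memory through the intrinsic declared above.
; The kernel name, the constant coordinates (column 0, rows 0-3), and the
; zero cache-hint value are assumptions for illustration; the i64 hint is
; ignored here because the trailing i1 flag is 0.
define ptx_kernel void @scatter4_usage_sketch(ptr addrspace(3) %buf, ptr %tmap) {
  tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %buf, ptr %tmap, i32 0, i32 0, i32 1, i32 2, i32 3, i64 0, i1 0)
  ret void
}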