[AMDGPU] Support for gfx940 fp8 smfmac

Differential Revision: https://reviews.llvm.org/D129908
2026-01-26 21:53:12 +08:00 · 2022-07-15 15:16:04 -07:00
parent 2695f0a688
commit 523a99c0eb
13 changed files with 572 additions and 1 deletions
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -353,6 +353,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, "V4fV4sV8sV4fiIiIi", "
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi", "nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")

 TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -348,4 +348,60 @@ void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx
 {
  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
 }
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
+// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
+// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
+void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
+}
 #endif // MFMA_GFX940_TESTS
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl
@@ -130,3 +130,51 @@ void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx
  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_32x32x32_i8' must be a constant integer}}
  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_32x32x32_i8' must be a constant integer}}
 }
+
+void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8' must be a constant integer}}
+}
+
+void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8' must be a constant integer}}
+}
+
+void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8' must be a constant integer}}
+}
+
+void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8' must be a constant integer}}
+}
+
+void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8' must be a constant integer}}
+}
+
+void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8' must be a constant integer}}
+}
+
+void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8' must be a constant integer}}
+}
+
+void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
+{
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8' must be a constant integer}}
+  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8' must be a constant integer}}
+}
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2331,6 +2331,17 @@ def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty,
 def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
 def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

+class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
+  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;
+
+multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
+  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
+    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
+}
+
+defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
+defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
+
 // llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
 // byte_sel selects byte from srcA.
 def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1006,6 +1006,14 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
@@ -3354,6 +3362,30 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
    break;
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
+    break;
  default:
    llvm_unreachable("unhandled smfmac intrinsic");
  }
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4459,7 +4459,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
-    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: {
+    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -354,6 +354,14 @@ def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
 def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
 def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
 def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;

 // The dummy boolean output is divergent from the IR's perspective,
 // but the mask results are uniform. These produce a divergent and
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2680,6 +2680,8 @@ def VOP_V4F32_V4I16_V8I16_I32     : VOPProfile <[v4f32,  v4i16, v8i16, i32]>;
 def VOP_V16F32_V4I16_V8I16_I32    : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
 def VOP_V4I32_V2I32_V4I32_I32     : VOPProfile <[v4i32,  v2i32, v4i32, i32]>;
 def VOP_V16I32_V2I32_V4I32_I32    : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
+def VOP_V4F32_V2I32_V4I32_I32     : VOPProfile <[v4f32,  v2i32, v4i32, i32]>;
+def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;

 class Commutable_REV <string revOp, bit isOrig> {
  string RevOp = revOp;
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -526,6 +526,8 @@ def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I
 def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
 def VOPProfileSMFMAC_I32_16X16X64_I8  : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32,  AVDst_128, AVSrc_64, AVSrc_128>;
 def VOPProfileSMFMAC_I32_32X32X32_I8  : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_16X16X64_F8  : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32,  AVDst_128, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_32X32X32_F8  : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;

 class MFMATable <bit is_mac, string Name> {
  bit IsMac = is_mac;
@@ -666,6 +668,14 @@ defm V_SMFMAC_F32_16X16X32_BF16    : SMFMACInst<"v_smfmac_f32_16x16x32_bf16",
 defm V_SMFMAC_F32_32X32X16_BF16    : SMFMACInst<"v_smfmac_f32_32x32x16_bf16",    "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
 defm V_SMFMAC_I32_16X16X64_I8      : SMFMACInst<"v_smfmac_i32_16x16x64_i8",      "I32_16X16X64_I8",  int_amdgcn_smfmac_i32_16x16x64_i8>;
 defm V_SMFMAC_I32_32X32X32_I8      : SMFMACInst<"v_smfmac_i32_32x32x32_i8",      "I32_32X32X32_I8",  int_amdgcn_smfmac_i32_32x32x32_i8>;
+defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
+defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
+defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
+defm V_SMFMAC_F32_16X16X64_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_fp8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
+defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
+defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
+defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
+defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
 }

 def MAIInstInfoTable : GenericTable {
@@ -1157,6 +1167,14 @@ defm V_SMFMAC_F32_16X16X32_BF16    : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1
 defm V_SMFMAC_F32_32X32X16_BF16    : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
 defm V_SMFMAC_I32_16X16X64_I8      : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
 defm V_SMFMAC_I32_32X32X32_I8      : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
+defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
+defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
+defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
+defm V_SMFMAC_F32_16X16X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x7b, "v_smfmac_f32_16x16x64fp8fp8">;
+defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x32x32bf8bf8">;
+defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
+defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
+defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;

 let SubtargetPredicate = HasPackedFP32Ops in {
  defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
@@ -21,6 +21,14 @@ declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>,
 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)

 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
 ; GFX940-DAG:  v_mov_b32_e32 v[[ONE:[0-9]+]], 1
@@ -343,4 +351,144 @@ bb:
  ret void
 }

+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
+; GCN:        s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_16x16x64_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN:        global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
+; GCN:        s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_16x16x64_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN:        global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
+; GCN:        s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_16x16x64_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN:        global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
+; GCN:        s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_16x16x64_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN:        global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
+; GCN:        s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_32x32x32_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
+; GCN:        s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_32x32x32_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
+; GCN:        s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_32x32x32_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
+; GCN:        s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
+; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
+; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
+; GCN:        v_smfmac_f32_32x32x32_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
+; GCN-DAG:    global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
--- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
@@ -19,6 +19,14 @@ declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>,
 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)

 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
 ; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
@@ -199,3 +207,83 @@ bb:
  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
  ret void
 }
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
+; GCN: v_smfmac_f32_16x16x64_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
+; GCN: v_smfmac_f32_16x16x64_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
+; GCN: v_smfmac_f32_16x16x64_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
+; GCN: v_smfmac_f32_16x16x64_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
+; GCN: v_smfmac_f32_32x32x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
+; GCN: v_smfmac_f32_32x32x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
+; GCN: v_smfmac_f32_32x32x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
+; GCN: v_smfmac_f32_32x32x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
--- a/llvm/test/MC/AMDGPU/mai-gfx940.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx940.s
@@ -616,6 +616,70 @@ v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11
 // GFX940: v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 ; encoding: [0x0a,0x80,0xec,0xd3,0x02,0x09,0x2e,0x14]
 // GFX90A: error: instruction not supported on this GPU

+v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf8,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf9,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfa,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfb,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfc,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfd,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfe,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1
+// GFX940: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xff,0xd3,0x02,0x09,0x06,0x14]
+// GFX90A: error: instruction not supported on this GPU
+
 //===----------------------------------------------------------------------===//
 // SMFMAC aliases.
 //===----------------------------------------------------------------------===//
@@ -643,3 +707,35 @@ v_smfmac_i32_16x16x64i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1
 v_smfmac_i32_32x32x32i8 a[10:25], v[2:3], a[4:7], v11
 // GFX940: v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 ; encoding: [0x0a,0x80,0xec,0xd3,0x02,0x09,0x2e,0x14]
 // GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64bf8bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64bf8fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64fp8bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64fp8fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32bf8bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32bf8fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32fp8bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32fp8fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
+// GFX940: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c]
+// GFX90A: error: instruction not supported on this GPU
--- a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt
@@ -392,3 +392,51 @@

 # GFX940: v_smfmac_i32_32x32x32_i8 v[10:25], v[2:3], v[4:7], v3 abid:15 ; encoding: [0x0a,0x78,0xec,0xd3,0x02,0x09,0x0e,0x04]
 0x0a,0x78,0xec,0xd3,0x02,0x09,0x0e,0x04
+
+# GFX940: v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf8,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xf8,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf9,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xf9,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfa,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xfa,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfb,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xfb,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfc,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xfc,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfd,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xfd,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfe,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xfe,0xd3,0x02,0x09,0x06,0x14
+
+# GFX940: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c]
+0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c
+
+# GFX940: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xff,0xd3,0x02,0x09,0x06,0x14]
+0x00,0x80,0xff,0xd3,0x02,0x09,0x06,0x14