mirror of
https://github.com/intel/llvm.git
synced 2026-01-26 21:53:12 +08:00
[AMDGPU] Support for gfx940 fp8 smfmac
Differential Revision: https://reviews.llvm.org/D129908
This commit is contained in:
@@ -353,6 +353,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x32_bf16, "V4fV4sV8sV4fiIiIi", "
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi", "nc", "mai-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8, "V4fV2iV4iV4fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, "V16fV2iV4iV16fiIiIi", "nc", "fp8-insts")
|
||||
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
|
||||
|
||||
@@ -348,4 +348,60 @@ void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
|
||||
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
|
||||
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
|
||||
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
|
||||
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
|
||||
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
|
||||
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
|
||||
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
|
||||
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
|
||||
void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
|
||||
}
|
||||
#endif // MFMA_GFX940_TESTS
|
||||
|
||||
@@ -130,3 +130,51 @@ void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx
|
||||
*out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_32x32x32_i8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_i32_32x32x32_i8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx, int d)
|
||||
{
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8' must be a constant integer}}
|
||||
*out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8' must be a constant integer}}
|
||||
}
|
||||
|
||||
@@ -2331,6 +2331,17 @@ def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty,
|
||||
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
|
||||
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
|
||||
|
||||
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
|
||||
AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;
|
||||
|
||||
multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
|
||||
foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
|
||||
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
|
||||
}
|
||||
|
||||
defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
|
||||
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
|
||||
|
||||
// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
|
||||
// byte_sel selects byte from srcA.
|
||||
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
|
||||
|
||||
@@ -1006,6 +1006,14 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
|
||||
case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
|
||||
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
|
||||
return selectSMFMACIntrin(I);
|
||||
default:
|
||||
return selectImpl(I, *CoverageInfo);
|
||||
@@ -3354,6 +3362,30 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
|
||||
Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
|
||||
break;
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
|
||||
Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("unhandled smfmac intrinsic");
|
||||
}
|
||||
|
||||
@@ -4459,7 +4459,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
|
||||
case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
|
||||
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: {
|
||||
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
|
||||
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
|
||||
// vdst, srcA, srcB, srcC, idx
|
||||
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
|
||||
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
|
||||
|
||||
@@ -354,6 +354,14 @@ def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
|
||||
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
|
||||
|
||||
// The dummy boolean output is divergent from the IR's perspective,
|
||||
// but the mask results are uniform. These produce a divergent and
|
||||
|
||||
@@ -2680,6 +2680,8 @@ def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
|
||||
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
|
||||
def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
|
||||
def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
|
||||
def VOP_V4F32_V2I32_V4I32_I32 : VOPProfile <[v4f32, v2i32, v4i32, i32]>;
|
||||
def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
|
||||
|
||||
class Commutable_REV <string revOp, bit isOrig> {
|
||||
string RevOp = revOp;
|
||||
|
||||
@@ -526,6 +526,8 @@ def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I
|
||||
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
|
||||
def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
|
||||
def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
|
||||
def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
|
||||
def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
|
||||
|
||||
class MFMATable <bit is_mac, string Name> {
|
||||
bit IsMac = is_mac;
|
||||
@@ -666,6 +668,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16",
|
||||
defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
|
||||
defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>;
|
||||
defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>;
|
||||
defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
|
||||
defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
|
||||
defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
|
||||
defm V_SMFMAC_F32_16X16X64_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
|
||||
defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
|
||||
defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
|
||||
defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
|
||||
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
|
||||
}
|
||||
|
||||
def MAIInstInfoTable : GenericTable {
|
||||
@@ -1157,6 +1167,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1
|
||||
defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
|
||||
defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
|
||||
defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
|
||||
defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
|
||||
defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
|
||||
defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
|
||||
defm V_SMFMAC_F32_16X16X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x7b, "v_smfmac_f32_16x16x64fp8fp8">;
|
||||
defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x32x32bf8bf8">;
|
||||
defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
|
||||
defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
|
||||
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
|
||||
|
||||
let SubtargetPredicate = HasPackedFP32Ops in {
|
||||
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
|
||||
|
||||
@@ -21,6 +21,14 @@ declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>,
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32)
|
||||
declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
|
||||
; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
|
||||
; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
|
||||
@@ -343,4 +351,144 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
|
||||
; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_16x16x64_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
|
||||
; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_16x16x64_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
|
||||
; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_16x16x64_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
|
||||
; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_16x16x64_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
|
||||
; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_32x32x32_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
|
||||
; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_32x32x32_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
|
||||
; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_32x32x32_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
|
||||
; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
|
||||
; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
|
||||
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
|
||||
; GCN: v_smfmac_f32_32x32x32_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
|
||||
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
|
||||
|
||||
@@ -19,6 +19,14 @@ declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>,
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32)
|
||||
declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
|
||||
|
||||
; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
|
||||
; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
@@ -199,3 +207,83 @@ bb:
|
||||
store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
|
||||
; GCN: v_smfmac_f32_16x16x64_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
|
||||
; GCN: v_smfmac_f32_16x16x64_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
|
||||
; GCN: v_smfmac_f32_16x16x64_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
|
||||
; GCN: v_smfmac_f32_16x16x64_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
|
||||
; GCN: v_smfmac_f32_32x32x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
|
||||
; GCN: v_smfmac_f32_32x32x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
|
||||
; GCN: v_smfmac_f32_32x32x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
|
||||
; GCN: v_smfmac_f32_32x32x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
|
||||
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -616,6 +616,70 @@ v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11
|
||||
// GFX940: v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 ; encoding: [0x0a,0x80,0xec,0xd3,0x02,0x09,0x2e,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf8,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf9,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfa,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfb,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfc,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfd,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfe,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1
|
||||
// GFX940: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xff,0xd3,0x02,0x09,0x06,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SMFMAC aliases.
|
||||
//===----------------------------------------------------------------------===//
|
||||
@@ -643,3 +707,35 @@ v_smfmac_i32_16x16x64i8 v[10:13], a[2:3], v[4:7], v8 cbsz:3 abid:1
|
||||
v_smfmac_i32_32x32x32i8 a[10:25], v[2:3], a[4:7], v11
|
||||
// GFX940: v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11 ; encoding: [0x0a,0x80,0xec,0xd3,0x02,0x09,0x2e,0x14]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64bf8bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64bf8fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64fp8bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_16x16x64fp8fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32bf8bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32bf8fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32fp8bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
v_smfmac_f32_32x32x32fp8fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1
|
||||
// GFX940: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c]
|
||||
// GFX90A: error: instruction not supported on this GPU
|
||||
|
||||
@@ -392,3 +392,51 @@
|
||||
|
||||
# GFX940: v_smfmac_i32_32x32x32_i8 v[10:25], v[2:3], v[4:7], v3 abid:15 ; encoding: [0x0a,0x78,0xec,0xd3,0x02,0x09,0x0e,0x04]
|
||||
0x0a,0x78,0xec,0xd3,0x02,0x09,0x0e,0x04
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_bf8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xf8,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf8,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xf8,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_bf8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xf9,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xf9,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xf9,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_fp8_bf8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xfa,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfa,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xfa,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_fp8_fp8 v[0:3], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xfb,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfb,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xfb,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xfc,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfc,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xfc,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xfd,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfd,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xfd,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xfe,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xfe,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xfe,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], a[2:3], v[4:7], v1 cbsz:3 abid:1 ; encoding: [0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c]
|
||||
0x00,0x0b,0xff,0xd3,0x02,0x09,0x06,0x0c
|
||||
|
||||
# GFX940: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1 ; encoding: [0x00,0x80,0xff,0xd3,0x02,0x09,0x06,0x14]
|
||||
0x00,0x80,0xff,0xd3,0x02,0x09,0x06,0x14
|
||||
|
||||
Reference in New Issue
Block a user