[AMDGPU] Replace gfx940 and gfx941 with gfx942 in llvm (#126763)

gfx940 and gfx941 are no longer supported. This is one of a series of
PRs to remove them from the code base.

This PR removes all non-documentation occurrences of gfx940/gfx941 from
the llvm directory, and the remaining occurrences in clang.

Documentation changes will follow.

For SWDEV-512631
Fabian Ritter
2025-02-19 10:20:48 +01:00
committed by GitHub
parent cc539138ac
commit 8615f9aaff
23 changed files with 60 additions and 113 deletions


@@ -45,8 +45,6 @@
// CHECK-SAME: {{^}}, gfx909
// CHECK-SAME: {{^}}, gfx90a
// CHECK-SAME: {{^}}, gfx90c
// CHECK-SAME: {{^}}, gfx940
// CHECK-SAME: {{^}}, gfx941
// CHECK-SAME: {{^}}, gfx942
// CHECK-SAME: {{^}}, gfx950
// CHECK-SAME: {{^}}, gfx1010


@@ -2232,7 +2232,7 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_MACH_AMDGCN_GFX1035`` 0x03d ``gfx1035``
``EF_AMDGPU_MACH_AMDGCN_GFX1034`` 0x03e ``gfx1034``
``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a``
``EF_AMDGPU_MACH_AMDGCN_GFX940`` 0x040 ``gfx940``
*reserved* 0x040 Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX1100`` 0x041 ``gfx1100``
``EF_AMDGPU_MACH_AMDGCN_GFX1013`` 0x042 ``gfx1013``
``EF_AMDGPU_MACH_AMDGCN_GFX1150`` 0x043 ``gfx1150``
@@ -2243,7 +2243,7 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_MACH_AMDGCN_GFX1200`` 0x048 ``gfx1200``
*reserved* 0x049 Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX1151`` 0x04a ``gfx1151``
``EF_AMDGPU_MACH_AMDGCN_GFX941`` 0x04b ``gfx941``
*reserved* 0x04b Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942``
*reserved* 0x04d Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201``


@@ -814,7 +814,7 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040,
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
@@ -825,7 +825,7 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049,
EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a,
EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4B = 0x04b,
EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d,
EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
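As a reading aid, here is a minimal C++ sketch (not part of this patch) of how a consumer of these e_flags values might map them after the change. It assumes only the EF_AMDGPU_MACH mask and the enumerators shown above; the machName helper itself is invented for this example.

```cpp
// Sketch only: map an AMDGCN ELF e_flags value to a mach name, treating the
// slots freed by gfx940/gfx941 as reserved.
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"

static llvm::StringRef machName(unsigned EFlags) {
  using namespace llvm::ELF;
  switch (EFlags & EF_AMDGPU_MACH) { // EF_AMDGPU_MACH selects the mach bits
  case EF_AMDGPU_MACH_AMDGCN_GFX90A:
    return "gfx90a";
  case EF_AMDGPU_MACH_AMDGCN_GFX942:
    return "gfx942";
  case EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40: // was gfx940
  case EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4B: // was gfx941
    return "reserved";
  default:
    return "unknown";
  }
}
```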


@@ -1074,7 +1074,7 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
// bit 0 = glc, bit 1 = slc,
// bit 2 = dlc (gfx10/gfx11),
// bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope
!listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn],
!if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
@@ -1321,7 +1321,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// Note: volatile bit is **not** permitted here.
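The comment above fully specifies the gfx942 bit layout of the aux/cachepolicy operand. Below is a minimal sketch of composing that immediate from the documented bits; the constant and helper names are invented for illustration and are not part of the intrinsic definitions.

```cpp
// Illustrative only: compose the gfx942 aux/cachepolicy immediate described in
// the comment above (bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1).
constexpr unsigned SC0 = 1u << 0;
constexpr unsigned NT  = 1u << 1;
constexpr unsigned SWZ = 1u << 3;
constexpr unsigned SC1 = 1u << 4;

constexpr unsigned gfx942CachePolicy(bool Sc0, bool Nt, bool Swz, bool Sc1) {
  return (Sc0 ? SC0 : 0) | (Nt ? NT : 0) | (Swz ? SWZ : 0) | (Sc1 ? SC1 : 0);
}

// e.g. sc0 + nt + sc1, no swizzle: 0x1 | 0x2 | 0x10 == 0x13
static_assert(gfx942CachePolicy(true, true, false, true) == 0x13, "");
```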
@@ -1351,7 +1351,7 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1381,7 +1381,7 @@ class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1413,7 +1413,7 @@ class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1431,7 +1431,7 @@ class AMDGPUStructAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1448,7 +1448,7 @@ class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIn
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1467,7 +1467,7 @@ class AMDGPUStructPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsi
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1485,7 +1485,7 @@ class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrins
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1503,7 +1503,7 @@ class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntr
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1523,7 +1523,7 @@ class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntr
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1542,7 +1542,7 @@ class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsI
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1628,7 +1628,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx950, gfx12+.
// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx942, gfx950, gfx12+.
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
@@ -1727,7 +1727,7 @@ def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
[IntrReadMem,
@@ -1743,7 +1743,7 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1761,7 +1761,7 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1779,7 +1779,7 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1797,7 +1797,7 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1815,7 +1815,7 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1834,7 +1834,7 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1853,7 +1853,7 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1872,7 +1872,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1891,7 +1891,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1914,7 +1914,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -1934,7 +1934,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
@@ -3007,7 +3007,7 @@ def int_amdgcn_fdot2_f32_bf16 :
// f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + c
// TODO: This actually is similar to llvm.amdgcn.fdot2 intrinsics which produces
// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these.
// v_dot2c_f32_f16 on gfx942. Maybe we can consolidate these.
def int_amdgcn_fdot2c_f32_bf16 :
ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">,
@@ -3250,7 +3250,7 @@ def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
// Note: in gfx942 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
// Three bits corresponding to the neg modifier applied to the respective
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
@@ -3258,7 +3258,7 @@ def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, ll
}
//===----------------------------------------------------------------------===//
// gfx940 intrinsics
// gfx942 intrinsics
// ===----------------------------------------------------------------------===//
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :


@@ -83,8 +83,6 @@ enum GPUKind : uint32_t {
GK_GFX909 = 65,
GK_GFX90A = 66,
GK_GFX90C = 67,
GK_GFX940 = 68,
GK_GFX941 = 69,
GK_GFX942 = 70,
GK_GFX950 = 71,


@@ -545,10 +545,6 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
return "gfx90a";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C:
return "gfx90c";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940:
return "gfx940";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941:
return "gfx941";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942:
return "gfx942";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950:


@@ -609,8 +609,6 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);


@@ -1619,28 +1619,6 @@ def FeatureISAVersion9_5_Common : FeatureSet<
FeatureAtomicBufferPkAddBF16Inst
])>;
def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[
FeatureAddressableLocalMemorySize65536,
FeatureForceStoreSC0SC1,
FeatureFP8Insts,
FeatureFP8ConversionInsts,
FeatureCvtFP8VOP1Bug,
FeatureXF32Insts
])>;
def FeatureISAVersion9_4_1 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[
FeatureAddressableLocalMemorySize65536,
FeatureForceStoreSC0SC1,
FeatureFP8Insts,
FeatureFP8ConversionInsts,
FeatureCvtFP8VOP1Bug,
FeatureXF32Insts
])>;
def FeatureISAVersion9_4_2 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[


@@ -4295,7 +4295,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
// TODO: Handle G_FSUB 0 as fneg
// TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
(void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
(void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;


@@ -1773,7 +1773,7 @@ def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
// GFX940+.
// GFX942+.
def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;


@@ -814,7 +814,7 @@ defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
} // End SubtargetPredicate = isGFX7GFX10GFX11
// GFX940-, GFX11-only flat instructions.
// GFX942-, GFX11-only flat instructions.
let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
} // End SubtargetPredicate = HasFlatAtomicFaddF32Inst
@@ -2076,7 +2076,7 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x1e>;
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>;
let SubtargetPredicate = isGFX8GFX9NotGFX940 in {
// These instructions are encoded differently on gfx90* and gfx940.
// These instructions are encoded differently on gfx90* and gfx94*.
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
}


@@ -2292,7 +2292,7 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
bool IsGFX950) {
// xdl def cycles | gfx940 | gfx950
// xdl def cycles | gfx942 | gfx950
// 2 pass | 5 5
// 4 pass | 7 8
// 8 pass | 11 12
@@ -2600,7 +2600,7 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
bool IsGFX950) {
// xdl def cycles | gfx940 | gfx950
// xdl def cycles | gfx942 | gfx950
// 2 pass | 5 5
// 4 pass | 7 8
// 8 pass | 11 12
@@ -2610,7 +2610,7 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
bool IsGFX950) {
// xdl def cycles | gfx940 | gfx950
// xdl def cycles | gfx942 | gfx950
// 2 pass | 5 5
// 4 pass | 7 8
// 8 pass | 11 12


@@ -192,15 +192,7 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel,
FeatureISAVersion9_0_C.Features
>;
def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_0.Features
>;
def : ProcessorModel<"gfx941", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_1.Features
>;
def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
def : ProcessorModel<"gfx942", SIDPGFX942FullSpeedModel,
FeatureISAVersion9_4_2.Features
>;
@@ -213,8 +205,8 @@ def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
FeatureISAVersion9_Generic.Features
>;
// [gfx940, gfx941, gfx942]
def : ProcessorModel<"gfx9-4-generic", SIDPGFX940FullSpeedModel,
// [gfx942]
def : ProcessorModel<"gfx9-4-generic", SIDPGFX942FullSpeedModel,
FeatureISAVersion9_4_Generic.Features
>;


@@ -1297,11 +1297,11 @@ public:
bool hasPackedTID() const { return HasPackedTID; }
// GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
// GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that
// hasGFX90AInsts is also true.
bool hasGFX940Insts() const { return GFX940Insts; }
// GFX950 is a derivation to GFX940. hasGFX950Insts() implies that
// GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that
// hasGFX940Insts and hasGFX90AInsts are also true.
bool hasGFX950Insts() const { return GFX950Insts; }


@@ -93,8 +93,6 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
@@ -180,8 +178,6 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940;
case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941;
case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942;
case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;


@@ -542,7 +542,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_EXCP_FLAG_USER = 18,
ID_TRAP_CTRL = 19,
// GFX940 specific registers
// GFX94* specific registers
ID_XCC_ID = 20,
ID_SQ_PERF_SNAPSHOT_DATA = 21,
ID_SQ_PERF_SNAPSHOT_DATA1 = 22,


@@ -16823,39 +16823,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// safe. The message phrasing also should be better.
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
if (AS == AMDGPUAS::FLAT_ADDRESS) {
// gfx940, gfx12
// gfx942, gfx12
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
// gfx90a, gfx940, gfx12
// gfx90a, gfx942, gfx12
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// gfx940, gfx12
// gfx942, gfx12
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
// gfx90a, gfx940, gfx12
// gfx90a, gfx942, gfx12
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
// While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
// buffer. gfx12 does have the buffer version.
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
// global and flat atomic fadd f64: gfx90a, gfx940.
// global and flat atomic fadd f64: gfx90a, gfx942.
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
if (AS != AMDGPUAS::FLAT_ADDRESS) {
if (Ty->isFloatTy()) {
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
// gfx11+.
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
// global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
} else {
@@ -16867,7 +16867,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
}
// flat atomic fadd f32: gfx940, gfx11+.
// flat atomic fadd f32: gfx942, gfx11+.
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
if (Subtarget->hasFlatAtomicFaddF32Inst())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
@@ -16906,7 +16906,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// float, double restored in gfx10.
// double removed again in gfx11, so only f32 for gfx11/gfx12.
//
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
// For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
// no f32.
if (AS == AMDGPUAS::FLAT_ADDRESS) {
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())


@@ -492,7 +492,6 @@ protected:
}
public:
SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,


@@ -94,7 +94,7 @@ class SISchedMachineModel : SchedMachineModel {
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def SIDPGFX942FullSpeedModel : SISchedMachineModel;
def SIDPGFX950FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
@@ -276,7 +276,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
} // End SchedModel = SIDPFullSpeedModel
let SchedModel = SIDPGFX940FullSpeedModel in {
let SchedModel = SIDPGFX942FullSpeedModel in {
defm : SICommonWriteRes;
@@ -308,7 +308,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;
} // End SchedModel = SIDPGFX940FullSpeedModel
} // End SchedModel = SIDPGFX942FullSpeedModel
let SchedModel = SIDPGFX950FullSpeedModel in {


@@ -216,7 +216,7 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus},
{{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus},
// GFX940 specific registers
// GFX942 specific registers
{{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940},
{{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940},
{{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940},


@@ -104,8 +104,6 @@ constexpr GPUInfo AMDGCNGPUs[] = {
{{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
{{"gfx90a"}, {"gfx90a"}, GK_GFX90A, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx90c"}, {"gfx90c"}, GK_GFX90C, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
{{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx941"}, {"gfx941"}, GK_GFX941, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
@@ -260,8 +258,6 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
case GK_GFX909: return {9, 0, 9};
case GK_GFX90A: return {9, 0, 10};
case GK_GFX90C: return {9, 0, 12};
case GK_GFX940: return {9, 4, 0};
case GK_GFX941: return {9, 4, 1};
case GK_GFX942: return {9, 4, 2};
case GK_GFX950: return {9, 5, 0};
case GK_GFX1010: return {10, 1, 0};
@@ -506,8 +502,6 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
case GK_GFX941:
case GK_GFX940:
Features["fp8-insts"] = true;
Features["fp8-conversion-insts"] = true;
if (Kind != GK_GFX950)


@@ -1,4 +1,4 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \
; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s
; Confirms we do not leave behind a declaration which references the same


@@ -1624,8 +1624,6 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \