Merge pull request #7894 from obilaniu/cudaupdates
Update unstable CUDA Module to support newest toolkits and drivers.
This commit is contained in:
commit
913374834c
|
@ -35,14 +35,14 @@ It offers several useful functions that are enumerated below.
|
|||
_Since: 0.50.0_
|
||||
|
||||
``` meson
|
||||
cuda.nvcc_arch_flags(nvcc_or_version, ...,
|
||||
cuda.nvcc_arch_flags(cuda_version_string, ...,
|
||||
detected: string_or_array)
|
||||
```
|
||||
|
||||
Returns a list of `-gencode` flags that should be passed to `cuda_args:` in
|
||||
order to compile a "fat binary" for the architectures/compute capabilities
|
||||
enumerated in the positional argument(s). The flags shall be acceptable to
|
||||
the NVCC compiler object `nvcc_or_version`, or its version string.
|
||||
an NVCC with CUDA Toolkit version string `cuda_version_string`.
|
||||
|
||||
A set of architectures and/or compute capabilities may be specified by:
|
||||
|
||||
|
@ -71,14 +71,6 @@ mixed with architecture names or compute capabilities. Their interpretation is:
|
|||
| `'Common'` | Relatively common CCs supported by given NVCC compiler. Generally excludes Tegra and Tesla devices. |
|
||||
| `'Auto'` | The CCs provided by the `detected:` keyword, filtered for support by given NVCC compiler. |
|
||||
|
||||
As a special case, when `nvcc_arch_flags()` is invoked with
|
||||
|
||||
- an NVCC `compiler` object `nvcc`,
|
||||
- `'Auto'` mode and
|
||||
- no `detected:` keyword,
|
||||
|
||||
Meson uses `nvcc`'s architecture auto-detection results.
|
||||
|
||||
The supported architecture names and their corresponding compute capabilities
|
||||
are:
|
||||
|
||||
|
@ -95,6 +87,7 @@ are:
|
|||
| `'Volta'` | 7.0 |
|
||||
| `'Xavier'` | 7.2 |
|
||||
| `'Turing'` | 7.5 |
|
||||
| `'Ampere'` | 8.0, 8.6 |
|
||||
|
||||
|
||||
Examples:
|
||||
|
@ -152,7 +145,7 @@ function `CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable, [list of CUDA compute archit
|
|||
_Since: 0.50.0_
|
||||
|
||||
``` meson
|
||||
cuda.nvcc_arch_readable(nvcc_or_version, ...,
|
||||
cuda.nvcc_arch_readable(cuda_version_string, ...,
|
||||
detected: string_or_array)
|
||||
```
|
||||
|
||||
|
@ -162,7 +155,7 @@ architectures that will be compiled for. The output of this function is solely
|
|||
intended for informative message printing.
|
||||
|
||||
archs = '3.0 3.5 5.0+PTX'
|
||||
readable = cuda.nvcc_arch_readable(nvcc, archs)
|
||||
readable = cuda.nvcc_arch_readable('10.0', archs)
|
||||
message('Building for architectures ' + ' '.join(readable))
|
||||
|
||||
This will print
|
||||
|
@ -178,11 +171,11 @@ _Note:_ This function is intended to closely replicate CMake's FindCUDA module f
|
|||
_Since: 0.50.0_
|
||||
|
||||
``` meson
|
||||
cuda.min_driver_version(nvcc_or_version)
|
||||
cuda.min_driver_version(cuda_version_string)
|
||||
```
|
||||
|
||||
Returns the minimum NVIDIA proprietary driver version required, on the host
|
||||
system, by kernels compiled with the given NVCC compiler or its version string.
|
||||
system, by kernels compiled with a CUDA Toolkit with the given version string.
|
||||
|
||||
The output of this function is generally intended for informative message
|
||||
printing, but could be used for assertions or to conditionally enable
|
||||
|
|
|
@ -34,16 +34,19 @@ class CudaModule(ExtensionModule):
|
|||
@noKwargs
|
||||
def min_driver_version(self, state, args, kwargs):
|
||||
argerror = InvalidArguments('min_driver_version must have exactly one positional argument: ' +
|
||||
'an NVCC compiler object, or its version string.')
|
||||
'a CUDA Toolkit version string. Beware that, since CUDA 11.0, ' +
|
||||
'the CUDA Toolkit\'s components (including NVCC) are versioned ' +
|
||||
'independently from each other (and the CUDA Toolkit as a whole).')
|
||||
|
||||
if len(args) != 1:
|
||||
if len(args) != 1 or not isinstance(args[0], str):
|
||||
raise argerror
|
||||
else:
|
||||
cuda_version = self._version_from_compiler(args[0])
|
||||
if cuda_version == 'unknown':
|
||||
raise argerror
|
||||
|
||||
cuda_version = args[0]
|
||||
driver_version_table = [
|
||||
{'cuda_version': '>=11.1.0', 'windows': '456.38', 'linux': '455.23'},
|
||||
{'cuda_version': '>=11.0.3', 'windows': '451.82', 'linux': '450.51.06'},
|
||||
{'cuda_version': '>=11.0.2', 'windows': '451.48', 'linux': '450.51.05'},
|
||||
{'cuda_version': '>=11.0.1', 'windows': '451.22', 'linux': '450.36.06'},
|
||||
{'cuda_version': '>=10.2.89', 'windows': '441.22', 'linux': '440.33'},
|
||||
{'cuda_version': '>=10.1.105', 'windows': '418.96', 'linux': '418.39'},
|
||||
{'cuda_version': '>=10.0.130', 'windows': '411.31', 'linux': '410.48'},
|
||||
|
@ -128,19 +131,45 @@ class CudaModule(ExtensionModule):
|
|||
|
||||
return cuda_version, arch_list, detected
|
||||
|
||||
def _filter_cuda_arch_list(self, cuda_arch_list, lo=None, hi=None, saturate=None):
|
||||
"""
|
||||
Filter CUDA arch list (no codenames) for >= low and < hi architecture
|
||||
bounds, and deduplicate.
|
||||
If saturate is provided, architectures >= hi are replaced with saturate.
|
||||
"""
|
||||
|
||||
filtered_cuda_arch_list = []
|
||||
for arch in cuda_arch_list:
|
||||
if arch:
|
||||
if lo and version_compare(arch, '<' + lo):
|
||||
continue
|
||||
if hi and version_compare(arch, '>=' + hi):
|
||||
if not saturate:
|
||||
continue
|
||||
arch = saturate
|
||||
if arch not in filtered_cuda_arch_list:
|
||||
filtered_cuda_arch_list.append(arch)
|
||||
return filtered_cuda_arch_list
|
||||
|
||||
def _nvcc_arch_flags(self, cuda_version, cuda_arch_list='Auto', detected=''):
|
||||
"""
|
||||
Using the CUDA Toolkit version (the NVCC version) and the target
|
||||
architectures, compute the NVCC architecture flags.
|
||||
Using the CUDA Toolkit version and the target architectures, compute
|
||||
the NVCC architecture flags.
|
||||
"""
|
||||
|
||||
cuda_known_gpu_architectures = ['Fermi', 'Kepler', 'Maxwell'] # noqa: E221
|
||||
cuda_common_gpu_architectures = ['3.0', '3.5', '5.0'] # noqa: E221
|
||||
cuda_limit_gpu_architecture = None # noqa: E221
|
||||
cuda_all_gpu_architectures = ['3.0', '3.2', '3.5', '5.0'] # noqa: E221
|
||||
# Replicates much of the logic of
|
||||
# https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
|
||||
# except that a bug with cuda_arch_list="All" is worked around by
|
||||
# tracking both lower and upper limits on GPU architectures.
|
||||
|
||||
cuda_known_gpu_architectures = ['Fermi', 'Kepler', 'Maxwell'] # noqa: E221
|
||||
cuda_common_gpu_architectures = ['3.0', '3.5', '5.0'] # noqa: E221
|
||||
cuda_hi_limit_gpu_architecture = None # noqa: E221
|
||||
cuda_lo_limit_gpu_architecture = '2.0' # noqa: E221
|
||||
cuda_all_gpu_architectures = ['3.0', '3.2', '3.5', '5.0'] # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '<7.0'):
|
||||
cuda_limit_gpu_architecture = '5.2'
|
||||
cuda_hi_limit_gpu_architecture = '5.2'
|
||||
|
||||
if version_compare(cuda_version, '>=7.0'):
|
||||
cuda_known_gpu_architectures += ['Kepler+Tegra', 'Kepler+Tesla', 'Maxwell+Tegra'] # noqa: E221
|
||||
|
@ -148,7 +177,7 @@ class CudaModule(ExtensionModule):
|
|||
|
||||
if version_compare(cuda_version, '<8.0'):
|
||||
cuda_common_gpu_architectures += ['5.2+PTX'] # noqa: E221
|
||||
cuda_limit_gpu_architecture = '6.0' # noqa: E221
|
||||
cuda_hi_limit_gpu_architecture = '6.0' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '>=8.0'):
|
||||
cuda_known_gpu_architectures += ['Pascal', 'Pascal+Tegra'] # noqa: E221
|
||||
|
@ -157,23 +186,45 @@ class CudaModule(ExtensionModule):
|
|||
|
||||
if version_compare(cuda_version, '<9.0'):
|
||||
cuda_common_gpu_architectures += ['6.1+PTX'] # noqa: E221
|
||||
cuda_limit_gpu_architecture = '7.0' # noqa: E221
|
||||
cuda_hi_limit_gpu_architecture = '7.0' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '>=9.0'):
|
||||
cuda_known_gpu_architectures += ['Volta', 'Xavier'] # noqa: E221
|
||||
cuda_common_gpu_architectures += ['7.0', '7.0+PTX'] # noqa: E221
|
||||
cuda_all_gpu_architectures += ['7.0', '7.0+PTX', '7.2', '7.2+PTX'] # noqa: E221
|
||||
cuda_known_gpu_architectures += ['Volta', 'Xavier'] # noqa: E221
|
||||
cuda_common_gpu_architectures += ['7.0'] # noqa: E221
|
||||
cuda_all_gpu_architectures += ['7.0', '7.2'] # noqa: E221
|
||||
# https://docs.nvidia.com/cuda/archive/9.0/cuda-toolkit-release-notes/index.html#unsupported-features
|
||||
cuda_lo_limit_gpu_architecture = '3.0' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '<10.0'):
|
||||
cuda_limit_gpu_architecture = '7.5'
|
||||
cuda_common_gpu_architectures += ['7.2+PTX'] # noqa: E221
|
||||
cuda_hi_limit_gpu_architecture = '8.0' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '>=10.0'):
|
||||
cuda_known_gpu_architectures += ['Turing'] # noqa: E221
|
||||
cuda_common_gpu_architectures += ['7.5', '7.5+PTX'] # noqa: E221
|
||||
cuda_all_gpu_architectures += ['7.5', '7.5+PTX'] # noqa: E221
|
||||
cuda_known_gpu_architectures += ['Turing'] # noqa: E221
|
||||
cuda_common_gpu_architectures += ['7.5'] # noqa: E221
|
||||
cuda_all_gpu_architectures += ['7.5'] # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '<11.0'):
|
||||
cuda_limit_gpu_architecture = '8.0'
|
||||
cuda_common_gpu_architectures += ['7.5+PTX'] # noqa: E221
|
||||
cuda_hi_limit_gpu_architecture = '8.0' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '>=11.0'):
|
||||
cuda_known_gpu_architectures += ['Ampere'] # noqa: E221
|
||||
cuda_common_gpu_architectures += ['8.0'] # noqa: E221
|
||||
cuda_all_gpu_architectures += ['8.0'] # noqa: E221
|
||||
# https://docs.nvidia.com/cuda/archive/11.0/cuda-toolkit-release-notes/index.html#deprecated-features
|
||||
cuda_lo_limit_gpu_architecture = '3.5' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '<11.1'):
|
||||
cuda_common_gpu_architectures += ['8.0+PTX'] # noqa: E221
|
||||
cuda_hi_limit_gpu_architecture = '8.6' # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '>=11.1'):
|
||||
cuda_common_gpu_architectures += ['8.6', '8.6+PTX'] # noqa: E221
|
||||
cuda_all_gpu_architectures += ['8.6'] # noqa: E221
|
||||
|
||||
if version_compare(cuda_version, '<12.0'):
|
||||
cuda_hi_limit_gpu_architecture = '9.0' # noqa: E221
|
||||
|
||||
if not cuda_arch_list:
|
||||
cuda_arch_list = 'Auto'
|
||||
|
@ -188,16 +239,10 @@ class CudaModule(ExtensionModule):
|
|||
cuda_arch_list = detected
|
||||
else:
|
||||
cuda_arch_list = self._break_arch_string(detected)
|
||||
|
||||
if cuda_limit_gpu_architecture:
|
||||
filtered_cuda_arch_list = []
|
||||
for arch in cuda_arch_list:
|
||||
if arch:
|
||||
if version_compare(arch, '>=' + cuda_limit_gpu_architecture):
|
||||
arch = cuda_common_gpu_architectures[-1]
|
||||
if arch not in filtered_cuda_arch_list:
|
||||
filtered_cuda_arch_list.append(arch)
|
||||
cuda_arch_list = filtered_cuda_arch_list
|
||||
cuda_arch_list = self._filter_cuda_arch_list(cuda_arch_list,
|
||||
cuda_lo_limit_gpu_architecture,
|
||||
cuda_hi_limit_gpu_architecture,
|
||||
cuda_common_gpu_architectures[-1])
|
||||
else:
|
||||
cuda_arch_list = cuda_common_gpu_architectures
|
||||
elif isinstance(cuda_arch_list, str):
|
||||
|
@ -229,6 +274,7 @@ class CudaModule(ExtensionModule):
|
|||
'Volta': (['7.0'], ['7.0']),
|
||||
'Xavier': (['7.2'], []),
|
||||
'Turing': (['7.5'], ['7.5']),
|
||||
'Ampere': (['8.0'], ['8.0']),
|
||||
}.get(arch_name, (None, None))
|
||||
|
||||
if arch_bin is None:
|
||||
|
@ -242,10 +288,6 @@ class CudaModule(ExtensionModule):
|
|||
arch_ptx = arch_bin
|
||||
cuda_arch_ptx += arch_ptx
|
||||
|
||||
cuda_arch_bin = re.sub('\\.', '', ' '.join(cuda_arch_bin))
|
||||
cuda_arch_ptx = re.sub('\\.', '', ' '.join(cuda_arch_ptx))
|
||||
cuda_arch_bin = re.findall('[0-9()]+', cuda_arch_bin)
|
||||
cuda_arch_ptx = re.findall('[0-9]+', cuda_arch_ptx)
|
||||
cuda_arch_bin = sorted(list(set(cuda_arch_bin)))
|
||||
cuda_arch_ptx = sorted(list(set(cuda_arch_ptx)))
|
||||
|
||||
|
@ -253,15 +295,37 @@ class CudaModule(ExtensionModule):
|
|||
nvcc_archs_readable = []
|
||||
|
||||
for arch in cuda_arch_bin:
|
||||
m = re.match('([0-9]+)\\(([0-9]+)\\)', arch)
|
||||
if m:
|
||||
nvcc_flags += ['-gencode', 'arch=compute_' + m[2] + ',code=sm_' + m[1]]
|
||||
nvcc_archs_readable += ['sm_' + m[1]]
|
||||
arch, codev = re.fullmatch(
|
||||
'([0-9]+\\.[0-9])(?:\\(([0-9]+\\.[0-9])\\))?', arch).groups()
|
||||
|
||||
if version_compare(arch, '<' + cuda_lo_limit_gpu_architecture):
|
||||
continue
|
||||
if version_compare(arch, '>=' + cuda_hi_limit_gpu_architecture):
|
||||
continue
|
||||
|
||||
if codev:
|
||||
arch = arch.replace('.', '')
|
||||
codev = codev.replace('.', '')
|
||||
nvcc_flags += ['-gencode', 'arch=compute_' + codev + ',code=sm_' + arch]
|
||||
nvcc_archs_readable += ['sm_' + arch]
|
||||
else:
|
||||
arch = arch.replace('.', '')
|
||||
nvcc_flags += ['-gencode', 'arch=compute_' + arch + ',code=sm_' + arch]
|
||||
nvcc_archs_readable += ['sm_' + arch]
|
||||
|
||||
for arch in cuda_arch_ptx:
|
||||
arch, codev = re.fullmatch(
|
||||
'([0-9]+\\.[0-9])(?:\\(([0-9]+\\.[0-9])\\))?', arch).groups()
|
||||
|
||||
if codev:
|
||||
arch = codev
|
||||
|
||||
if version_compare(arch, '<' + cuda_lo_limit_gpu_architecture):
|
||||
continue
|
||||
if version_compare(arch, '>=' + cuda_hi_limit_gpu_architecture):
|
||||
continue
|
||||
|
||||
arch = arch.replace('.', '')
|
||||
nvcc_flags += ['-gencode', 'arch=compute_' + arch + ',code=compute_' + arch]
|
||||
nvcc_archs_readable += ['compute_' + arch]
|
||||
|
||||
|
|
|
@ -3,9 +3,9 @@ project('cudamodule', 'cuda', version : '1.0.0')
|
|||
nvcc = meson.get_compiler('cuda')
|
||||
cuda = import('unstable-cuda')
|
||||
|
||||
arch_flags = cuda.nvcc_arch_flags(nvcc, 'Auto', detected: ['6.0'])
|
||||
arch_readable = cuda.nvcc_arch_readable(nvcc, 'Auto', detected: ['6.0'])
|
||||
driver_version = cuda.min_driver_version(nvcc)
|
||||
arch_flags = cuda.nvcc_arch_flags(nvcc.version(), 'Auto', detected: ['6.0'])
|
||||
arch_readable = cuda.nvcc_arch_readable(nvcc.version(), 'Auto', detected: ['6.0'])
|
||||
driver_version = cuda.min_driver_version(nvcc.version())
|
||||
|
||||
message('NVCC version: ' + nvcc.version())
|
||||
message('NVCC flags: ' + ' '.join(arch_flags))
|
||||
|
@ -14,3 +14,57 @@ message('Driver version: >=' + driver_version)
|
|||
|
||||
exe = executable('prog', 'prog.cu', cuda_args: arch_flags)
|
||||
test('cudatest', exe)
|
||||
|
||||
|
||||
#
|
||||
# Assert Series
|
||||
#
|
||||
|
||||
# Sanity test.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.1', '8.6')) ==
|
||||
'-gencode arch=compute_86,code=sm_86')
|
||||
|
||||
# CUDA Toolkit too old, flag filtered out.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.0', '8.6')) ==
|
||||
'')
|
||||
|
||||
# Named architectures.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Ampere')) ==
|
||||
'-gencode arch=compute_80,code=sm_80')
|
||||
|
||||
# Splitting & deduplication.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Ampere;8.0,8.0')) ==
|
||||
'-gencode arch=compute_80,code=sm_80')
|
||||
|
||||
# Same, but list supplied as list.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.0', ['Ampere', '8.0', '8.0'])) ==
|
||||
'-gencode arch=compute_80,code=sm_80')
|
||||
|
||||
# Same, but mode set to Auto with detected set to a string with a variety of separators.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Auto', detected: 'Ampere;8.0,8.0')) ==
|
||||
'-gencode arch=compute_80,code=sm_80')
|
||||
|
||||
# Same, but detected set to a list.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Auto', detected: ['Ampere', '8.0', '8.0'])) ==
|
||||
'-gencode arch=compute_80,code=sm_80')
|
||||
|
||||
# Ask for 8.6 binary with 8.0-level PTX.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.1', '8.6(8.0)')) ==
|
||||
'-gencode arch=compute_80,code=sm_86')
|
||||
|
||||
# Same, but keep the 8.0 PTX.
|
||||
assert(' '.join(cuda.nvcc_arch_flags('11.1', '8.6(8.0)+PTX')) ==
|
||||
'-gencode arch=compute_80,code=sm_86 -gencode arch=compute_80,code=compute_80')
|
||||
|
||||
# Detected an Ampere GPU (compute capability 8.0) on CUDA 10.2, saturate to 7.5+PTX
|
||||
assert(' '.join(cuda.nvcc_arch_flags('10.2', 'Auto', detected: ['8.0'])) ==
|
||||
'-gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75')
|
||||
|
||||
# Failed to auto-detect with CUDA 10.2, default to common GPUs (3.0;3.5;5.0;5.2;6.0;6.1;7.0;7.5+PTX)
|
||||
assert(' '.join(cuda.nvcc_arch_flags('10.2', 'Auto', detected: [])) ==
|
||||
'-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 '+
|
||||
'-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 '+
|
||||
'-gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 '+
|
||||
'-gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 '+
|
||||
'-gencode arch=compute_75,code=compute_75')
|
||||
|
||||
|
|
Loading…
Reference in New Issue