blob: 2c923ecdfb827aa73740eaaeb327cade0646af33 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
import os
# Fallback value for the CUDA_ARCHITECTURES variable, used by oncuda_srcs when
# the unit does not provide a non-empty value. The value is parsed as a
# colon-separated list of names, each split on '_' (e.g. "sm_50").
DEFAULT_CUDA_ARCHITECTURES="sm_50"
def arch2num(arch):
    """Convert an architecture suffix into its numeric string form.

    A single trailing non-digit character, if present, is dropped and a
    "0" is appended to what remains, e.g. "50" -> "500" and "90a" -> "900".

    An empty string raises IndexError.
    """
    digits = arch if arch[-1].isdigit() else arch[:-1]
    return digits + "0"
def oncuda_srcs(unit, *args):
    """
    @usage: CUDA_SRCS(File...)

    A macro for efficient distributed compilation of CUDA code for multiple device architectures.

    For each source .cu file multiple nodes are generated:
        - node per each device architecture producing PTX and CUBIN
        - node merging all PTX and CUBIN files into a single FATBIN blob
        - node producing .cpp with host code
        - node compiling host .cpp with embedded FATBIN blob

    CUDA_ARCHITECTURES variable is used to determine the list of architectures to compile device code for.
    """
    # The variable holds colon-separated names such as "sm_50"; only the part
    # after the first underscore is passed on to the compile nodes.
    configured = unit.get("CUDA_ARCHITECTURES") or DEFAULT_CUDA_ARCHITECTURES
    architectures = [entry.split('_')[1] for entry in configured.split(":")]

    # The last listed architecture is the one used for the host-side node.
    stub_arch = architectures[-1]
    arch_list = arch2num(stub_arch)

    host_flags = (
        "-D__NV_LEGACY_LAUNCH",
        "-Wno-unused-function",
        "-Wno-unused-parameter",
        "-Wno-deprecated-literal-operator",
    )
    # Per-architecture artifacts handed to the fatbin node, in this order.
    image_kinds = ("ptx", "cubin", "module_id")

    for cu in args:
        stem = os.path.splitext(cu)[0]

        for arch in architectures:
            unit.on_cuda_compile_device([cu, arch, arch_list])

        images = [
            f"{stem}.{arch}.{kind}"
            for arch in architectures
            for kind in image_kinds
        ]
        unit.on_cuda_fatbin([cu] + images)
        unit.on_cuda_compile_host([cu, stub_arch, arch_list])
        unit.onsrc([f"{stem}.cudafe1.cpp"] + list(host_flags))
|