build/plugins/cuda.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

import os

# Fallback architecture list, used when the unit's CUDA_ARCHITECTURES variable
# is unset or empty. Same format as that variable: ':'-separated names.
DEFAULT_CUDA_ARCHITECTURES = "sm_50"


def arch2num(arch):
    """
    Convert an architecture string to its numeric form.

    At most one trailing non-digit character is dropped (e.g. the "a" in "90a"),
    then "0" is appended: "50" -> "500", "90a" -> "900".

    NOTE(review): presumably this mirrors the __CUDA_ARCH__ numbering - confirm.
    """
    digits = arch if arch[-1].isdigit() else arch[:-1]
    return f"{digits}0"


def oncuda_srcs(unit, *args):
    """
    @usage: CUDA_SRCS(File...)

    Compile CUDA sources for several device architectures in a way that distributes well.

    Every .cu source is expanded into the following nodes:
    - one node per device architecture, producing PTX and CUBIN
    - one node merging all PTX and CUBIN files into a single FATBIN blob
    - one node producing .cpp with host code
    - one node compiling the host .cpp with the embedded FATBIN blob

    The architectures to build device code for come from the CUDA_ARCHITECTURES variable
    (':'-separated names such as "sm_50"); DEFAULT_CUDA_ARCHITECTURES is used when it is
    unset or empty. The last architecture in the list is the one passed to the host
    compilation step.
    """
    configured = unit.get("CUDA_ARCHITECTURES") or DEFAULT_CUDA_ARCHITECTURES
    # Keep only the second '_'-separated field of every name ("sm_50" -> "50").
    device_archs = [entry.split('_')[1] for entry in configured.split(":")]

    # The host-side stub is generated for the last listed architecture.
    host_arch = device_archs[-1]
    arch_num = arch2num(host_arch)

    # Extra flags passed along with the generated host .cpp.
    host_flags = (
        "-D__NV_LEGACY_LAUNCH",
        "-Wno-unused-function",
        "-Wno-unused-parameter",
        "-Wno-deprecated-literal-operator",
    )
    per_arch_suffixes = ("ptx", "cubin", "module_id")

    for source in args:
        stem = os.path.splitext(source)[0]

        # Files listed for the fatbin step, grouped per architecture in list order.
        device_outputs = []
        for arch in device_archs:
            unit.on_cuda_compile_device([source, arch, arch_num])
            device_outputs += [f"{stem}.{arch}.{suffix}" for suffix in per_arch_suffixes]

        unit.on_cuda_fatbin([source] + device_outputs)
        unit.on_cuda_compile_host([source, host_arch, arch_num])
        unit.onsrc([f"{stem}.cudafe1.cpp", *host_flags])