# CoCalc -- setup.py
#
# GitHub Repository: ai-forever/sber-swap
# Path: blob/main/apex/setup.py
# 1069 views
1
import torch
2
from torch.utils import cpp_extension
3
from setuptools import setup, find_packages
4
import subprocess
5

6
import sys
7
import warnings
8
import os
9

10
# Absolute directory of this setup script; ninja builds need include_dirs
# given as absolute paths, so extension include paths are joined onto this.
this_dir = os.path.split(os.path.abspath(__file__))[0]
12

13
def get_cuda_bare_metal_version(cuda_dir):
    """Ask the ``nvcc`` found under ``cuda_dir`` for its CUDA release number.

    Runs ``<cuda_dir>/bin/nvcc -V`` and reads the whitespace-separated token
    that follows the word ``release`` in its output.

    Args:
        cuda_dir: Root directory of the CUDA toolkit (the one containing
            ``bin/nvcc``), as a string.

    Returns:
        A ``(raw_output, major, minor)`` tuple: the full text printed by
        ``nvcc -V`` and the major / minor release components as strings.

    Raises:
        subprocess.CalledProcessError: If ``nvcc`` exits with a non-zero status.
        ValueError: If the output contains no ``release`` token.
    """
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    # nvcc typically prints the token as e.g. "11.1," -- cut at the trailing
    # comma rather than keeping only the first character, so that a
    # multi-digit minor version is not truncated.
    bare_metal_minor = release[1].split(",")[0]

    return raw_output, bare_metal_major, bare_metal_minor
22

23
if not torch.cuda.is_available():
    # https://github.com/NVIDIA/apex/issues/486
    # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
    # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command).
    # The warning is a single implicitly-concatenated string so that no
    # separator space is inserted at the start of its second line.
    print('\nWarning: Torch did not find available GPUs on this system.\n'
          'If your intention is to cross-compile, this is not an error.\n'
          'By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
          'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
          'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
          'If you wish to cross-compile for a single specific architecture,\n'
          'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
    # Choose a default architecture list only when the user has not exported
    # one.  CUDA_HOME is None when nvcc was not found; probing nvcc in that
    # case would fail before any extension was even requested, so skip it.
    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None and cpp_extension.CUDA_HOME is not None:
        _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
        # ">= 11" agrees with the warning text above and with the other
        # CUDA 11 checks in this file.
        if int(bare_metal_major) >= 11:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
        else:
            os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
40

41
# Report the installed Pytorch version and keep the first two dotted
# components as integers for the feature checks further down.
print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
_torch_version_fields = torch.__version__.split('.')
TORCH_MAJOR = int(_torch_version_fields[0])
TORCH_MINOR = int(_torch_version_fields[1])

# Anything older than 0.4 is rejected outright.
_torch_too_old = TORCH_MAJOR == 0 and TORCH_MINOR < 4
if _torch_too_old:
    raise RuntimeError(
        "Apex requires Pytorch 0.4 or newer.\n"
        + "The latest stable release can be obtained from https://pytorch.org/"
    )
48

49
# Accumulators that the option handling in this script fills in.
cmdclass = {}
ext_modules = []

extras = {}
if "--pyprof" in sys.argv:
    string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
             "soon be removed from Apex.  Please visit\n" + \
             "https://github.com/NVIDIA/PyProf\n" + \
             "for the latest version."
    warnings.warn(string, DeprecationWarning)
    # The PyProf extra is every line of requirements.txt (path is relative
    # to the current working directory).
    with open('requirements.txt') as f:
        required_packages = f.read().splitlines()
        extras['pyprof'] = required_packages
    # Membership was checked by the enclosing `if`, so remove() cannot fail
    # here; no blanket try/except is needed to strip the custom flag.
    sys.argv.remove("--pyprof")
else:
    warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")
68

69
# Either extension flag needs Pytorch's BuildExtension as the build_ext command.
if any(flag in sys.argv for flag in ("--cpp_ext", "--cuda_ext")):
    if TORCH_MAJOR == 0:
        raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
                           "found torch.__version__ = {}".format(torch.__version__))
    from torch.utils.cpp_extension import BuildExtension

    cmdclass.update(build_ext=BuildExtension)

# --cpp_ext: register the C++-only flatten/unflatten extension and strip the
# custom flag from the argument list.
if "--cpp_ext" in sys.argv:
    from torch.utils.cpp_extension import CppExtension

    sys.argv.remove("--cpp_ext")
    apex_c_sources = ['csrc/flatten_unflatten.cpp']
    ext_modules.append(CppExtension('apex_C', apex_c_sources))
82

83
def get_cuda_bare_metal_version(cuda_dir):
    """Ask the ``nvcc`` found under ``cuda_dir`` for its CUDA release number.

    Note: this re-defines the function of the same name declared earlier in
    this file.

    Runs ``<cuda_dir>/bin/nvcc -V`` and reads the whitespace-separated token
    that follows the word ``release`` in its output.

    Args:
        cuda_dir: Root directory of the CUDA toolkit (the one containing
            ``bin/nvcc``), as a string.

    Returns:
        A ``(raw_output, major, minor)`` tuple: the full text printed by
        ``nvcc -V`` and the major / minor release components as strings.

    Raises:
        subprocess.CalledProcessError: If ``nvcc`` exits with a non-zero status.
        ValueError: If the output contains no ``release`` token.
    """
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    # nvcc typically prints the token as e.g. "11.1," -- cut at the trailing
    # comma rather than keeping only the first character, so that a
    # multi-digit minor version is not truncated.
    bare_metal_minor = release[1].split(",")[0]

    return raw_output, bare_metal_major, bare_metal_minor
92

93
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    """Fail the build if the local nvcc and Pytorch's CUDA version disagree.

    Compares the major and minor release reported by the ``nvcc`` under
    ``cuda_dir`` with the first two components of ``torch.version.cuda`` and
    prints which compiler is being used.

    Args:
        cuda_dir: Root directory of the CUDA toolkit, as a string.

    Raises:
        RuntimeError: If ``torch.version.cuda`` is ``None`` or if either the
            major or the minor version differs between nvcc and Pytorch.
    """
    raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
    # torch.version.cuda is None when the Pytorch binary was built without
    # CUDA; report that clearly instead of failing on None.split().
    if torch.version.cuda is None:
        raise RuntimeError("Cuda extensions were requested, but torch.version.cuda is None, " +
                           "i.e. the installed Pytorch binaries were not compiled with Cuda.")
    torch_binary_major = torch.version.cuda.split(".")[0]
    torch_binary_minor = torch.version.cuda.split(".")[1]

    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")

    # Versions are compared as strings, component by component.
    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
        raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
                           "not match the version used to compile Pytorch binaries.  " +
                           "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
                           "In some cases, a minor-version mismatch will not cause later errors:  " +
                           "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798.  "
                           "You can try commenting out this check (at your own risk).")
108

109

110
# Preprocessor defines passed to every extension so the C++/CUDA sources can
# adapt to the installed Pytorch version.  Background:
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
# https://github.com/NVIDIA/apex/issues/456
# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
version_ge_1_1 = ['-DVERSION_GE_1_1'] if (TORCH_MAJOR, TORCH_MINOR) >= (1, 1) else []
version_ge_1_3 = ['-DVERSION_GE_1_3'] if (TORCH_MAJOR, TORCH_MINOR) >= (1, 3) else []
version_ge_1_5 = ['-DVERSION_GE_1_5'] if (TORCH_MAJOR, TORCH_MINOR) >= (1, 5) else []
version_dependent_macros = [*version_ge_1_1, *version_ge_1_3, *version_ge_1_5]
125

126
# --distributed_adam: build the distributed Adam optimizer extension.
if "--distributed_adam" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--distributed_adam")
    cmdclass['build_ext'] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--distributed_adam was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    ext_modules.append(
        CUDAExtension(
            name='distributed_adam_cuda',
            sources=[
                'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp',
                'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu',
            ],
            include_dirs=[os.path.join(this_dir, 'csrc')],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', '--use_fast_math', *version_dependent_macros],
            },
        )
    )
144

145
# --distributed_lamb: build the distributed LAMB optimizer extension.
if "--distributed_lamb" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--distributed_lamb")
    cmdclass['build_ext'] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--distributed_lamb was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    ext_modules.append(
        CUDAExtension(
            name='distributed_lamb_cuda',
            sources=[
                'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp',
                'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu',
            ],
            include_dirs=[os.path.join(this_dir, 'csrc')],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', '--use_fast_math', *version_dependent_macros],
            },
        )
    )
163

164
# --cuda_ext: build the core CUDA extensions (amp_C, syncbn,
# fused_layer_norm_cuda, mlp_cuda).
if "--cuda_ext" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension

    sys.argv.remove("--cuda_ext")

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--cuda_ext was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    # Raises if the local nvcc and the Cuda version of the Pytorch binaries differ.
    check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)

    ext_modules.append(
        CUDAExtension(
            name='amp_C',
            sources=[
                'csrc/amp_C_frontend.cpp',
                'csrc/multi_tensor_sgd_kernel.cu',
                'csrc/multi_tensor_scale_kernel.cu',
                'csrc/multi_tensor_axpby_kernel.cu',
                'csrc/multi_tensor_l2norm_kernel.cu',
                'csrc/multi_tensor_lamb_stage_1.cu',
                'csrc/multi_tensor_lamb_stage_2.cu',
                'csrc/multi_tensor_adam.cu',
                'csrc/multi_tensor_adagrad.cu',
                'csrc/multi_tensor_novograd.cu',
                'csrc/multi_tensor_lamb.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                # ('--resource-usage' is intentionally left disabled.)
                'nvcc': ['-lineinfo', '-O3', '--use_fast_math', *version_dependent_macros],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name='syncbn',
            sources=['csrc/syncbn.cpp', 'csrc/welford.cu'],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', *version_dependent_macros],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name='fused_layer_norm_cuda',
            sources=['csrc/layer_norm_cuda.cpp', 'csrc/layer_norm_cuda_kernel.cu'],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-maxrregcount=50', '-O3', '--use_fast_math', *version_dependent_macros],
            },
        )
    )

    ext_modules.append(
        CUDAExtension(
            name='mlp_cuda',
            sources=['csrc/mlp.cpp', 'csrc/mlp_cuda.cu'],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', *version_dependent_macros],
            },
        )
    )
213

214
# --bnp: build the group batch-norm extension.
if "--bnp" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--bnp")
    cmdclass['build_ext'] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--bnp was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    ext_modules.append(
        CUDAExtension(
            name='bnp',
            sources=[
                'apex/contrib/csrc/groupbn/batch_norm.cu',
                'apex/contrib/csrc/groupbn/ipc.cu',
                'apex/contrib/csrc/groupbn/interface.cpp',
                'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu',
            ],
            include_dirs=[os.path.join(this_dir, 'csrc')],
            extra_compile_args={
                # The host compiler only receives the version macros.
                'cxx': list(version_dependent_macros),
                'nvcc': [
                    '-DCUDA_HAS_FP16=1',
                    '-D__CUDA_NO_HALF_OPERATORS__',
                    '-D__CUDA_NO_HALF_CONVERSIONS__',
                    '-D__CUDA_NO_HALF2_OPERATORS__',
                    *version_dependent_macros,
                ],
            },
        )
    )
236

237
# --xentropy: build the cross-entropy extension.
if "--xentropy" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--xentropy")
    cmdclass['build_ext'] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--xentropy was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    ext_modules.append(
        CUDAExtension(
            name='xentropy_cuda',
            sources=[
                'apex/contrib/csrc/xentropy/interface.cpp',
                'apex/contrib/csrc/xentropy/xentropy_kernel.cu',
            ],
            include_dirs=[os.path.join(this_dir, 'csrc')],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', *version_dependent_macros],
            },
        )
    )
254

255
# --deprecated_fused_adam: build the contrib fused Adam extension.
if "--deprecated_fused_adam" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--deprecated_fused_adam")
    cmdclass['build_ext'] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--deprecated_fused_adam was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    ext_modules.append(
        CUDAExtension(
            name='fused_adam_cuda',
            sources=[
                'apex/contrib/csrc/optimizers/fused_adam_cuda.cpp',
                'apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu',
            ],
            include_dirs=[os.path.join(this_dir, 'csrc')],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', '--use_fast_math', *version_dependent_macros],
            },
        )
    )
273

274
# --deprecated_fused_lamb: build the contrib fused LAMB extension.
if "--deprecated_fused_lamb" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--deprecated_fused_lamb")
    cmdclass['build_ext'] = BuildExtension

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--deprecated_fused_lamb was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    ext_modules.append(
        CUDAExtension(
            name='fused_lamb_cuda',
            sources=[
                'apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp',
                'apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu',
                'csrc/multi_tensor_l2norm_kernel.cu',
            ],
            include_dirs=[os.path.join(this_dir, 'csrc')],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros],
                'nvcc': ['-O3', '--use_fast_math', *version_dependent_macros],
            },
        )
    )
293

294
# Pytorch renamed ATen/CUDAGenerator.h to ATen/CUDAGeneratorImpl.h (breaking
# change in https://github.com/pytorch/pytorch/pull/36026).  Define
# OLD_GENERATOR only when the old header is still present in the installed
# torch package.
torch_dir = torch.__path__[0]
_old_generator_header = os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')
generator_flag = ['-DOLD_GENERATOR'] if os.path.exists(_old_generator_header) else []
299

300
# --fast_layer_norm: build the contrib layer-norm extension (ninja disabled).
if "--fast_layer_norm" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--fast_layer_norm")
    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--fast_layer_norm was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    # With CUDA 11 or newer, additionally generate code for compute capability 8.0.
    _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    cc_flag = ['-gencode', 'arch=compute_80,code=sm_80'] if int(bare_metal_major) >= 11 else []

    ext_modules.append(
        CUDAExtension(
            name='fast_layer_norm',
            sources=[
                'apex/contrib/csrc/layer_norm/ln_api.cpp',
                'apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu',
                'apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros, *generator_flag],
                'nvcc': [
                    '-O3',
                    '-gencode', 'arch=compute_70,code=sm_70',
                    '-U__CUDA_NO_HALF_OPERATORS__',
                    '-U__CUDA_NO_HALF_CONVERSIONS__',
                    '-I./apex/contrib/csrc/layer_norm/',
                    '--expt-relaxed-constexpr',
                    '--expt-extended-lambda',
                    '--use_fast_math',
                    *version_dependent_macros,
                    *generator_flag,
                    *cc_flag,
                ],
            },
        )
    )
332
# --fmha: build the fused multi-head attention library (ninja disabled).
if "--fmha" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--fmha")
    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--fmha was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    # The kernels target sm_80 only, so a CUDA release older than 11 is rejected.
    cc_flag = []
    _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    if not int(bare_metal_major) >= 11:
        raise RuntimeError("--fmha only supported on SM80")

    ext_modules.append(
        CUDAExtension(
            name='fmhalib',
            sources=[
                'apex/contrib/csrc/fmha/fmha_api.cpp',
                'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu',
                'apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu',
            ],
            extra_compile_args={
                'cxx': [
                    '-O3',
                    '-I./apex/contrib/csrc/fmha/src',
                    *version_dependent_macros,
                    *generator_flag,
                ],
                'nvcc': [
                    '-O3',
                    '-gencode', 'arch=compute_80,code=sm_80',
                    '-U__CUDA_NO_HALF_OPERATORS__',
                    '-U__CUDA_NO_HALF_CONVERSIONS__',
                    '-I./apex/contrib/csrc/',
                    '-I./apex/contrib/csrc/fmha/src',
                    '--expt-relaxed-constexpr',
                    '--expt-extended-lambda',
                    '--use_fast_math',
                    *version_dependent_macros,
                    *generator_flag,
                    *cc_flag,
                ],
            },
        )
    )
373

374

375
# --fast_multihead_attn: build the eight contrib multi-head attention
# extensions (ninja disabled).  They all share the same compiler flags and
# differ only in extension name and source-file stem.
if "--fast_multihead_attn" in sys.argv:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    sys.argv.remove("--fast_multihead_attn")
    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    # With CUDA 11 or newer, additionally generate code for compute capability 8.0.
    _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    cc_flag = ['-gencode', 'arch=compute_80,code=sm_80'] if int(bare_metal_major) >= 11 else []

    # Fetch the cutlass submodule referenced by the -I flag below; the exit
    # status of git is not checked.
    subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"])

    # (extension name, source stem): sources are <stem>.cpp and <stem>_cuda.cu.
    _multihead_attn_extensions = (
        ('fast_additive_mask_softmax_dropout', 'additive_masked_softmax_dropout'),
        ('fast_mask_softmax_dropout', 'masked_softmax_dropout'),
        ('fast_self_multihead_attn_bias_additive_mask', 'self_multihead_attn_bias_additive_mask'),
        ('fast_self_multihead_attn_bias', 'self_multihead_attn_bias'),
        ('fast_self_multihead_attn', 'self_multihead_attn'),
        ('fast_self_multihead_attn_norm_add', 'self_multihead_attn_norm_add'),
        ('fast_encdec_multihead_attn', 'encdec_multihead_attn'),
        ('fast_encdec_multihead_attn_norm_add', 'encdec_multihead_attn_norm_add'),
    )

    ext_modules.extend(
        CUDAExtension(
            name=ext_name,
            sources=[
                'apex/contrib/csrc/multihead_attn/' + stem + '.cpp',
                'apex/contrib/csrc/multihead_attn/' + stem + '_cuda.cu',
            ],
            extra_compile_args={
                'cxx': ['-O3', *version_dependent_macros, *generator_flag],
                'nvcc': [
                    '-O3',
                    '-gencode', 'arch=compute_70,code=sm_70',
                    '-I./apex/contrib/csrc/multihead_attn/cutlass/',
                    '-U__CUDA_NO_HALF_OPERATORS__',
                    '-U__CUDA_NO_HALF_CONVERSIONS__',
                    '--expt-relaxed-constexpr',
                    '--expt-extended-lambda',
                    '--use_fast_math',
                    *version_dependent_macros,
                    *generator_flag,
                    *cc_flag,
                ],
            },
        )
        for ext_name, stem in _multihead_attn_extensions
    )
497

498
if "--transducer" in sys.argv:
    # Optional build of the transducer CUDA extensions. The custom flag is
    # taken out of sys.argv here, before setup() is called further down.
    sys.argv.remove("--transducer")

    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    # This configuration is built with ninja explicitly switched off.
    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

    # Without a CUDA toolkit there is nothing to compile: fail early.
    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--transducer was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")

    # Both extensions share include dirs and flags; only name/sources differ.
    for transducer_ext_name, transducer_ext_sources in (
            ('transducer_joint_cuda',
             ['apex/contrib/csrc/transducer/transducer_joint.cpp',
              'apex/contrib/csrc/transducer/transducer_joint_kernel.cu']),
            ('transducer_loss_cuda',
             ['apex/contrib/csrc/transducer/transducer_loss.cpp',
              'apex/contrib/csrc/transducer/transducer_loss_kernel.cu'])):
        ext_modules.append(
            CUDAExtension(name=transducer_ext_name,
                          sources=transducer_ext_sources,
                          include_dirs=[os.path.join(this_dir, 'csrc')],
                          extra_compile_args={
                              'cxx': ['-O3'] + version_dependent_macros,
                              'nvcc': ['-O3'] + version_dependent_macros}))
522

523
if "--fast_bottleneck" in sys.argv:
    # Optional build of the fast_bottleneck extension.
    from torch.utils.cpp_extension import CUDAExtension
    # Take the custom flag out of sys.argv before setup() is called below.
    sys.argv.remove("--fast_bottleneck")

    from torch.utils.cpp_extension import BuildExtension
    # This configuration is built with ninja explicitly switched off.
    cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False)

    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--fast_bottleneck was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
    else:
        # The extension is compiled against headers from the cudnn-frontend
        # submodule (see include_dirs below), so try to check it out first.
        submodule_update = subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/cudnn-frontend/"])
        if submodule_update.returncode != 0:
            # Do not abort: the headers may already be on disk. But make the
            # failure visible instead of letting it surface later as an
            # unexplained missing-header compile error.
            warnings.warn(
                "'git submodule update --init apex/contrib/csrc/cudnn-frontend/' "
                "exited with status {}; the fast_bottleneck build will fail if the "
                "cudnn-frontend headers are not already present.".format(submodule_update.returncode))
        ext_modules.append(
            CUDAExtension(name='fast_bottleneck',
                          sources=['apex/contrib/csrc/bottleneck/bottleneck.cpp'],
                          # Absolute include path, anchored at this_dir like the
                          # other include_dirs entries in this file.
                          include_dirs=[os.path.join(this_dir, 'apex/contrib/csrc/cudnn-frontend/include')],
                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag}))
539

540
# Register the package. ext_modules and cmdclass carry whatever the optional
# command-line sections above added to them.
setup(
    name='apex',
    version='0.1',
    # Directories listed here are skipped during package discovery.
    # ('tests' used to be listed twice; the redundant entry was dropped.)
    packages=find_packages(exclude=('build',
                                    'csrc',
                                    'include',
                                    'tests',
                                    'dist',
                                    'docs',
                                    'examples',
                                    'apex.egg-info',)),
    description='PyTorch Extensions written by NVIDIA',
    ext_modules=ext_modules,
    cmdclass=cmdclass,
    extras_require=extras,
)
557

558
Product

Resources

Company