1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
|
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
#define __CLANG_CUDA_COMPLEX_BUILTINS
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
// libgcc functions that clang assumes are available when compiling c99 complex
// operations. (These implementations come from libc++, and have been modified
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
#pragma push_macro("__DEVICE__")
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp declare target
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
#else
#define __DEVICE__ __device__ inline
#endif
// To make the algorithms available for C and C++ in CUDA and OpenMP we select
// different but equivalent function versions. TODO: For OpenMP we currently
// select the native builtins as the overload support for templates is lacking.
#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
#define _ISNANd std::isnan
#define _ISNANf std::isnan
#define _ISINFd std::isinf
#define _ISINFf std::isinf
#define _ISFINITEd std::isfinite
#define _ISFINITEf std::isfinite
#define _COPYSIGNd std::copysign
#define _COPYSIGNf std::copysign
#define _SCALBNd std::scalbn
#define _SCALBNf std::scalbn
#define _ABSd std::abs
#define _ABSf std::abs
#define _LOGBd std::logb
#define _LOGBf std::logb
// Rather than pulling in std::max from algorithm everytime, use available ::max.
#define _fmaxd max
#define _fmaxf max
#else
#ifdef __AMDGCN__
#define _ISNANd __ocml_isnan_f64
#define _ISNANf __ocml_isnan_f32
#define _ISINFd __ocml_isinf_f64
#define _ISINFf __ocml_isinf_f32
#define _ISFINITEd __ocml_isfinite_f64
#define _ISFINITEf __ocml_isfinite_f32
#define _COPYSIGNd __ocml_copysign_f64
#define _COPYSIGNf __ocml_copysign_f32
#define _SCALBNd __ocml_scalbn_f64
#define _SCALBNf __ocml_scalbn_f32
#define _ABSd __ocml_fabs_f64
#define _ABSf __ocml_fabs_f32
#define _LOGBd __ocml_logb_f64
#define _LOGBf __ocml_logb_f32
#define _fmaxd __ocml_fmax_f64
#define _fmaxf __ocml_fmax_f32
#else
#define _ISNANd __nv_isnand
#define _ISNANf __nv_isnanf
#define _ISINFd __nv_isinfd
#define _ISINFf __nv_isinff
#define _ISFINITEd __nv_isfinited
#define _ISFINITEf __nv_finitef
#define _COPYSIGNd __nv_copysign
#define _COPYSIGNf __nv_copysignf
#define _SCALBNd __nv_scalbn
#define _SCALBNf __nv_scalbnf
#define _ABSd __nv_fabs
#define _ABSf __nv_fabsf
#define _LOGBd __nv_logb
#define _LOGBf __nv_logbf
#define _fmaxd __nv_fmax
#define _fmaxf __nv_fmaxf
#endif
#endif
#if defined(__cplusplus)
extern "C" {
#endif
__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
double __d) {
double __ac = __a * __c;
double __bd = __b * __d;
double __ad = __a * __d;
double __bc = __b * __c;
double _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
int __recalc = 0;
if (_ISINFd(__a) || _ISINFd(__b)) {
__a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a);
__b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b);
if (_ISNANd(__c))
__c = _COPYSIGNd(0, __c);
if (_ISNANd(__d))
__d = _COPYSIGNd(0, __d);
__recalc = 1;
}
if (_ISINFd(__c) || _ISINFd(__d)) {
__c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c);
__d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d);
if (_ISNANd(__a))
__a = _COPYSIGNd(0, __a);
if (_ISNANd(__b))
__b = _COPYSIGNd(0, __b);
__recalc = 1;
}
if (!__recalc &&
(_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) {
if (_ISNANd(__a))
__a = _COPYSIGNd(0, __a);
if (_ISNANd(__b))
__b = _COPYSIGNd(0, __b);
if (_ISNANd(__c))
__c = _COPYSIGNd(0, __c);
if (_ISNANd(__d))
__d = _COPYSIGNd(0, __d);
__recalc = 1;
}
if (__recalc) {
// Can't use std::numeric_limits<double>::infinity() -- that doesn't have
// a device overload (and isn't constexpr before C++11, naturally).
__real__(z) = __builtin_huge_val() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_val() * (__a * __d + __b * __c);
}
}
return z;
}
__DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) {
float __ac = __a * __c;
float __bd = __b * __d;
float __ad = __a * __d;
float __bc = __b * __c;
float _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
int __recalc = 0;
if (_ISINFf(__a) || _ISINFf(__b)) {
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
if (_ISNANf(__c))
__c = _COPYSIGNf(0, __c);
if (_ISNANf(__d))
__d = _COPYSIGNf(0, __d);
__recalc = 1;
}
if (_ISINFf(__c) || _ISINFf(__d)) {
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
if (_ISNANf(__a))
__a = _COPYSIGNf(0, __a);
if (_ISNANf(__b))
__b = _COPYSIGNf(0, __b);
__recalc = 1;
}
if (!__recalc &&
(_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) {
if (_ISNANf(__a))
__a = _COPYSIGNf(0, __a);
if (_ISNANf(__b))
__b = _COPYSIGNf(0, __b);
if (_ISNANf(__c))
__c = _COPYSIGNf(0, __c);
if (_ISNANf(__d))
__d = _COPYSIGNf(0, __d);
__recalc = 1;
}
if (__recalc) {
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
__DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
double __d) {
int __ilogbw = 0;
// Can't use std::max, because that's defined in <algorithm>, and we don't
// want to pull that in for every compile. The CUDA headers define
// ::max(float, float) and ::max(double, double), which is sufficient for us.
double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
if (_ISFINITEd(__logbw)) {
__ilogbw = (int)__logbw;
__c = _SCALBNd(__c, -__ilogbw);
__d = _SCALBNd(__d, -__ilogbw);
}
double __denom = __c * __c + __d * __d;
double _Complex z;
__real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw);
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) {
__real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a;
__imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b;
} else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) &&
_ISFINITEd(__d)) {
__a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a);
__b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b);
__real__(z) = __builtin_huge_val() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d);
} else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) &&
_ISFINITEd(__b)) {
__c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c);
__d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d);
__real__(z) = 0.0 * (__a * __c + __b * __d);
__imag__(z) = 0.0 * (__b * __c - __a * __d);
}
}
return z;
}
__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
int __ilogbw = 0;
float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
if (_ISFINITEf(__logbw)) {
__ilogbw = (int)__logbw;
__c = _SCALBNf(__c, -__ilogbw);
__d = _SCALBNf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float _Complex z;
__real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) {
__real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a;
__imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b;
} else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) &&
_ISFINITEf(__d)) {
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) &&
_ISFINITEf(__b)) {
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
__real__(z) = 0 * (__a * __c + __b * __d);
__imag__(z) = 0 * (__b * __c - __a * __d);
}
}
return z;
}
#if defined(__cplusplus)
} // extern "C"
#endif
#undef _ISNANd
#undef _ISNANf
#undef _ISINFd
#undef _ISINFf
#undef _COPYSIGNd
#undef _COPYSIGNf
#undef _ISFINITEd
#undef _ISFINITEf
#undef _SCALBNd
#undef _SCALBNf
#undef _ABSd
#undef _ABSf
#undef _LOGBd
#undef _LOGBf
#undef _fmaxd
#undef _fmaxf
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp end declare target
#endif
#pragma pop_macro("__DEVICE__")
#endif // __CLANG_CUDA_COMPLEX_BUILTINS
|