1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __F16CINTRIN_H
#define __F16CINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
* but that's because icc can emulate these without f16c using a library call.
* Since we don't do that let's leave these in f16cintrin.h.
*/
/// Converts a 16-bit half-precision float value into a 32-bit float
/// value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 16-bit half-precision float value.
/// \returns The converted 32-bit float value.
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf __r = __builtin_ia32_vcvtph2ps(__v);
return __r[0];
}
/// Converts a 32-bit single-precision float value to a 16-bit
/// half-precision float value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 32-bit single-precision float value to be converted to a 16-bit
/// half-precision float value.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) __extension__ ({ \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); })
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 128-bit vector containing 32-bit float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values. The lower
/// 64 bits are used in the conversion.
/// \returns A 128-bit vector of [4 x float] containing converted float values.
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
/// containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 256-bit vector containing 32-bit single-precision float values to be
/// converted to 16-bit half-precision float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values to be
/// converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
/// single-precision float values.
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __F16CINTRIN_H */
|