1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
|
/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <emmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
/// Loads data from an unaligned memory location to elements in a 128-bit
/// vector.
///
/// If the address of the data is not 16-byte aligned, the instruction may
/// read two adjacent aligned blocks of memory to retrieve the requested
/// data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const *__p)
{
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}
/// Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
/// A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
/// differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}
/// Horizontally adds the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
/// both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal differences between the values are stored in the lower
/// bits of the destination.
/// \param __b
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal differences between the values are stored in the upper
/// bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
/// differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}
/// Moves and duplicates odd-indexed values from a 128-bit vector
/// of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. \n
/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
/// the destination. \n
/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
/// Duplicates even-indexed values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] \n
/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
/// the destination. \n
/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}
/// Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
/// A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
/// and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}
/// Horizontally adds the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal sum of the values is stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal sum of the values is stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}
/// Horizontally subtracts the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal difference of the values is stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal difference of the values is stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
/// differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}
/// Moves and duplicates one double-precision value to double-precision
/// values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
/// A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
/// Moves and duplicates the double-precision value in the lower bits of
/// a 128-bit vector of [2 x double] to double-precision values stored in a
/// 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
/// [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
/// Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
///
/// \param __p
/// The memory range to be monitored. The size of the range is determined by
/// CPUID function 0000_0005h.
/// \param __extensions
/// Optional extensions for the monitoring state.
/// \param __hints
/// Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
__builtin_ia32_monitor(__p, __extensions, __hints);
}
/// Used with the MONITOR instruction to wait while the processor is in
/// the monitor event pending state. Data stored in the monitored address
/// range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
///
/// \param __extensions
/// Optional extensions for the monitoring state, which may vary by
/// processor.
/// \param __hints
/// Optional hints for the monitoring state, which may vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
__builtin_ia32_mwait(__extensions, __hints);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __PMMINTRIN_H */
|