author    | danlark <danlark@yandex-team.ru>             | 2022-02-10 16:46:10 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:10 +0300
commit    | baa58daefa91fde4b4769facdbd2903763b9c6a8 (patch)
tree      | 1a2c5ffcf89eb53ecd79dbc9bc0a195c27404d0c /library/cpp/sse/sse2neon.h
parent    | 3426a9bc7f169ae9da54cef557ad2a33f6e8eee0 (diff)
download  | ydb-baa58daefa91fde4b4769facdbd2903763b9c6a8.tar.gz
Restoring authorship annotation for <danlark@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/sse/sse2neon.h')
-rw-r--r-- | library/cpp/sse/sse2neon.h | 406
1 file changed, 203 insertions, 203 deletions
diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h
index f60711e77f..695dbd3041 100644
--- a/library/cpp/sse/sse2neon.h
+++ b/library/cpp/sse/sse2neon.h
@@ -17,7 +17,7 @@
#if !defined(_arm64_)
#error "This header is for ARM64 (aarch64) platform only. " \
-    "Include sse.h instead of including this header directly."
+    "Include sse.h instead of including this header directly."
#endif

#include <arm_neon.h>
@@ -57,21 +57,21 @@ union __m128 {
typedef float64x2_t __m128d;

-enum _mm_hint
-{
-    /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */
-    _MM_HINT_ET0 = 7,
-    _MM_HINT_ET1 = 6,
-    _MM_HINT_T0 = 3,
-    _MM_HINT_T1 = 2,
-    _MM_HINT_T2 = 1,
-    _MM_HINT_NTA = 0
-};
-
-Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
-    __builtin_prefetch(p);
-}
-
+enum _mm_hint
+{
+    /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */
+    _MM_HINT_ET0 = 7,
+    _MM_HINT_ET1 = 6,
+    _MM_HINT_T0 = 3,
+    _MM_HINT_T1 = 2,
+    _MM_HINT_T2 = 1,
+    _MM_HINT_NTA = 0
+};
+
+Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
+    __builtin_prefetch(p);
+}
+
template <typename TType>
struct TQType;
@@ -299,9 +299,9 @@ using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
using _mm_andnot_si128 = TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
-using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;
+using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;

-using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
+using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
@@ -342,7 +342,7 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
    return res;
}

-using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
+using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
@@ -385,27 +385,27 @@ using _mm_cmplt_epi16 =
using _mm_cmplt_epi32 =
    TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;

-Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
+Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
    __m128i result;
    result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
    return result;
}

-Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
+Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
    __m128i result;
    result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
    return result;
}

-Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
-    return _mm_loadu_si128(ptr);
-}
-
-Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
+Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
+    return _mm_loadu_si128(ptr);
+}
+
+Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
    vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
}

-Y_FORCE_INLINE void
+Y_FORCE_INLINE void
_mm_store_si128(__m128i* ptr, const __m128i& op) {
    vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
}
@@ -459,7 +459,7 @@ struct ShuffleStruct4 {
    ui8 x[4];
};

-Y_FORCE_INLINE ShuffleStruct4
+Y_FORCE_INLINE ShuffleStruct4
_MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
    ShuffleStruct4 result;
    result.x[0] = x1;
@@ -469,7 +469,7 @@ _MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
    return result;
}

-Y_FORCE_INLINE __m128i
+Y_FORCE_INLINE __m128i
_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
    __m128i result;
    const ui8 xi[4] = {
@@ -486,7 +486,7 @@ _mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
    return result;
}

-Y_FORCE_INLINE int
+Y_FORCE_INLINE int
_mm_movemask_epi8(const __m128i& op) {
    uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
@@ -542,7 +542,7 @@ struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {

#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)

-Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
+Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
    return vgetq_lane_s32(op.AsSi32x4, 0);
}
@@ -558,18 +558,18 @@ struct _mm_set_epi16 : TBaseWrapper<__m128i> {
    }
};

-struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
-    Y_FORCE_INLINE
-    _mm_setr_epi16(const short w7, const short w6,
-                   const short w5, const short w4,
-                   const short w3, const short w2,
-                   const short w1, const short w0) {
-        int16x4_t d0 = {w7, w6, w5, w4};
-        int16x4_t d1 = {w3, w2, w1, w0};
-        TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
-    }
-};
-
+struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
+    Y_FORCE_INLINE
+    _mm_setr_epi16(const short w7, const short w6,
+                   const short w5, const short w4,
+                   const short w3, const short w2,
+                   const short w1, const short w0) {
+        int16x4_t d0 = {w7, w6, w5, w4};
+        int16x4_t d1 = {w3, w2, w1, w0};
+        TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
+    }
+};
+
struct _mm_set_epi32 : TBaseWrapper<__m128i> {
    Y_FORCE_INLINE
    _mm_set_epi32(const int x3, const int x2,
@@ -580,16 +580,16 @@ struct _mm_set_epi32 : TBaseWrapper<__m128i> {
    }
};

-struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
-    Y_FORCE_INLINE
-    _mm_setr_epi32(const int x3, const int x2,
-                   const int x1, const int x0) {
-        int32x2_t d0 = {x3, x2};
-        int32x2_t d1 = {x1, x0};
-        TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
-    }
-};
-
+struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
+    Y_FORCE_INLINE
+    _mm_setr_epi32(const int x3, const int x2,
+                   const int x1, const int x0) {
+        int32x2_t d0 = {x3, x2};
+        int32x2_t d1 = {x1, x0};
+        TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
+    }
+};
+
struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
    Y_FORCE_INLINE
    _mm_cvtsi32_si128(int op) {
@@ -642,31 +642,31 @@ struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
};

template<int imm>
-int extract_epi8_arm(__m128i arg) {
-    return vgetq_lane_u8(arg.AsUi8x16, imm);
-}
-
-template<int imm>
-int extract_epi16_arm(__m128i arg) {
+int extract_epi8_arm(__m128i arg) {
+    return vgetq_lane_u8(arg.AsUi8x16, imm);
+}
+
+template<int imm>
+int extract_epi16_arm(__m128i arg) {
    return vgetq_lane_u16(arg.AsUi16x8, imm);
}

-template<int imm>
-int extract_epi32_arm(__m128i arg) {
-    return vgetq_lane_s32(arg.AsSi32x4, imm);
-}
-
-template<int imm>
-long long extract_epi64_arm(__m128i arg) {
-    return vgetq_lane_s64(arg.AsSi64x2, imm);
-}
-
-#define _mm_extract_epi8(op, imm) extract_epi8_arm<imm>(op)
-#define _mm_extract_epi16(op, imm) extract_epi16_arm<imm>(op)
-#define _mm_extract_epi32(op, imm) extract_epi32_arm<imm>(op)
-#define _mm_extract_epi64(op, imm) extract_epi64_arm<imm>(op)
-#define _mm_extract_ps(op, imm) _mm_extract_epi32(op, imm)
-
+template<int imm>
+int extract_epi32_arm(__m128i arg) {
+    return vgetq_lane_s32(arg.AsSi32x4, imm);
+}
+
+template<int imm>
+long long extract_epi64_arm(__m128i arg) {
+    return vgetq_lane_s64(arg.AsSi64x2, imm);
+}
+
+#define _mm_extract_epi8(op, imm) extract_epi8_arm<imm>(op)
+#define _mm_extract_epi16(op, imm) extract_epi16_arm<imm>(op)
+#define _mm_extract_epi32(op, imm) extract_epi32_arm<imm>(op)
+#define _mm_extract_epi64(op, imm) extract_epi64_arm<imm>(op)
+#define _mm_extract_ps(op, imm) _mm_extract_epi32(op, imm)
+
static Y_FORCE_INLINE
__m128i _mm_mul_epu32(__m128i op1, __m128i op2) {
    __m128i result;
@@ -734,27 +734,27 @@ struct _mm_setzero_ps : TBaseWrapper<__m128> {
    }
};

-Y_FORCE_INLINE __m128d _mm_setzero_pd() {
+Y_FORCE_INLINE __m128d _mm_setzero_pd() {
    return vdupq_n_f64(0.);
}

-Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
+Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
    __m128 result;
    result.AsFloat32x4 = vld1q_f32(ptr);
    return result;
}

-Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
+Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
    __m128 result;
    result.AsFloat32x4 = vld1q_f32(ptr);
    return result;
}

-Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
+Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
    vst1q_f32(ptr, op.AsFloat32x4);
}

-Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
+Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
    vst1q_f32(ptr, op.AsFloat32x4);
}
@@ -768,23 +768,23 @@ struct _mm_set_ps : TBaseWrapper<__m128> {
    }
};

-Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
+Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
    const float64x1_t p0 = {d0};
    const float64x1_t p1 = {d1};
    return vcombine_f64(p0, p1);
}

-Y_FORCE_INLINE __m128d _mm_loadu_pd(const double* d) {
+Y_FORCE_INLINE __m128d _mm_loadu_pd(const double* d) {
    __m128d res;
    res = vld1q_f64(d);
    return res;
}

-Y_FORCE_INLINE void _mm_storeu_pd(double* res, __m128d a) {
+Y_FORCE_INLINE void _mm_storeu_pd(double* res, __m128d a) {
    vst1q_f64(res, a);
}

-Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
+Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
    vst1q_f64(res, a);
}
@@ -811,11 +811,11 @@ struct _mm_and_ps : TBaseWrapper<__m128> {
    }
};

-Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
-    return vandq_u64(a, b);
-}
-
-Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
+Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
+    return vandq_u64(a, b);
+}
+
+Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
    float64x2_t im0 =
        (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
    float64x2_t im1 =
@@ -831,11 +831,11 @@ Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m
    TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
};

-Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
+Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
    return reinterpret_cast<__m128&>(op);
}

-Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
+Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
    return reinterpret_cast<__m128i&>(op);
}
@@ -878,7 +878,7 @@ using _mm_cvttps_epi32 =
    TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
                         decltype(vcvtq_s32_f32), vcvtq_s32_f32>;

-Y_FORCE_INLINE int
+Y_FORCE_INLINE int
_mm_movemask_ps(const __m128& op) {
    uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
@@ -924,27 +924,27 @@ inline void _mm_store_ss(float* p, __m128 a) {
    *p = vgetq_lane_f32(a.AsFloat32x4, 0);
}

-inline float vgetg_lane_f32_switch(float32x4_t a, ui8 b) {
-    switch (b & 0x3) {
-        case 0:
-            return vgetq_lane_f32(a, 0);
-        case 1:
-            return vgetq_lane_f32(a, 1);
-        case 2:
-            return vgetq_lane_f32(a, 2);
-        case 3:
-            return vgetq_lane_f32(a, 3);
-    }
-    return 0;
-}
-
+inline float vgetg_lane_f32_switch(float32x4_t a, ui8 b) {
+    switch (b & 0x3) {
+        case 0:
+            return vgetq_lane_f32(a, 0);
+        case 1:
+            return vgetq_lane_f32(a, 1);
+        case 2:
+            return vgetq_lane_f32(a, 2);
+        case 3:
+            return vgetq_lane_f32(a, 3);
+    }
+    return 0;
+}
+
inline __m128 _mm_shuffle_ps(__m128 a, __m128 b, const ShuffleStruct4& shuf) {
-    __m128 ret;
-    ret.AsFloat32x4 = vmovq_n_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[0]));
-    ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[1]), ret.AsFloat32x4, 1);
-    ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[2]), ret.AsFloat32x4, 2);
-    ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[3]), ret.AsFloat32x4, 3);
-    return ret;
+    __m128 ret;
+    ret.AsFloat32x4 = vmovq_n_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[0]));
+    ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[1]), ret.AsFloat32x4, 1);
+    ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[2]), ret.AsFloat32x4, 2);
+    ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[3]), ret.AsFloat32x4, 3);
+    return ret;
}
@@ -952,94 +952,94 @@ inline __m128 _mm_or_ps(__m128 a, __m128 b) {
    res.AsUi32x4 = vorrq_u32(a.AsUi32x4, b.AsUi32x4);
    return res;
}
-
-inline __m128i _mm_sad_epu8(__m128i a, __m128i b) {
-    uint16x8_t t = vpaddlq_u8(vabdq_u8(a.AsUi8x16, b.AsUi8x16));
-    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
-    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
-    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
-    __m128i ans;
-    ans.AsUi16x8 = vsetq_lane_u16(r4, r, 4);
-    return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {
-    __m128i ans;
-    ans.AsSi8x16 = vqsubq_s8(a.AsSi8x16, b.AsSi8x16);
-    return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {
-    __m128i ans;
-    ans.AsSi16x8 = vqsubq_s16(a.AsSi16x8, b.AsSi16x8);
-    return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {
-    __m128i ans;
-    ans.AsUi8x16 = vqsubq_u8(a.AsUi8x16, b.AsUi8x16);
-    return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {
-    __m128i ans;
-    ans.AsUi16x8 = vqsubq_u16(a.AsUi16x8, b.AsUi16x8);
-    return ans;
-}
-
-Y_FORCE_INLINE __m128d _mm_castsi128_pd(__m128i __A) {
-    return reinterpret_cast<__m128d&>(__A);
-}
-
-Y_FORCE_INLINE __m128i _mm_set_epi8(ui8 i15, ui8 i14, ui8 i13, ui8 i12, ui8 i11, ui8 i10, ui8 i9, ui8 i8,
-                                    ui8 i7, ui8 i6, ui8 i5, ui8 i4, ui8 i3, ui8 i2, ui8 i1, ui8 i0)
-{
-    int a0 = i0 | (i1<<8) | (i2<<16) | (i3<<24);
-    int a1 = i4 | (i5<<8) | (i6<<16) | (i7<<24);
-    int a2 = i8 | (i9<<8) | (i10<<16) | (i11<<24);
-    int a3 = i12 | (i13<<8) | (i14<<16) | (i15<<24);
-    return _mm_set_epi32(a3, a2, a1, a0);
-}
-
-Y_FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) {
-    __m128i ans;
-    ans.AsUi8x16 = vmaxq_u8(a.AsUi8x16, b.AsUi8x16);
-    return ans;
-}
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-Y_FORCE_INLINE __m128d _mm_undefined_pd(void) {
-    __m128d ans = ans;
-    return ans;
-}
-#pragma GCC diagnostic pop
-
-Y_FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double* b) {
-    a[1] = *b;
-    return a;
-}
-
-Y_FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double* b) {
-    a[0] = *b;
-    return a;
-}
-
-Y_FORCE_INLINE double _mm_cvtsd_f64(__m128d a) {
-    return a[0];
-}
-
-Y_FORCE_INLINE __m128d _mm_shuffle_pd(__m128d a, __m128d b, int mask) {
-    __m128d result;
-    const int litmsk = mask & 0x3;
-
-    if (litmsk == 0)
-        result = vzip1q_f64(a, b);
-    else if (litmsk == 1)
-        result = __builtin_shufflevector(a, b, 1, 2);
-    else if (litmsk == 2)
-        result = __builtin_shufflevector(a, b, 0, 3);
-    else
-        result = vzip2q_f64(a, b);
-    return result;
-}
+
+inline __m128i _mm_sad_epu8(__m128i a, __m128i b) {
+    uint16x8_t t = vpaddlq_u8(vabdq_u8(a.AsUi8x16, b.AsUi8x16));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
+    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
+    __m128i ans;
+    ans.AsUi16x8 = vsetq_lane_u16(r4, r, 4);
+    return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {
+    __m128i ans;
+    ans.AsSi8x16 = vqsubq_s8(a.AsSi8x16, b.AsSi8x16);
+    return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {
+    __m128i ans;
+    ans.AsSi16x8 = vqsubq_s16(a.AsSi16x8, b.AsSi16x8);
+    return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {
+    __m128i ans;
+    ans.AsUi8x16 = vqsubq_u8(a.AsUi8x16, b.AsUi8x16);
+    return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {
+    __m128i ans;
+    ans.AsUi16x8 = vqsubq_u16(a.AsUi16x8, b.AsUi16x8);
+    return ans;
+}
+
+Y_FORCE_INLINE __m128d _mm_castsi128_pd(__m128i __A) {
+    return reinterpret_cast<__m128d&>(__A);
+}
+
+Y_FORCE_INLINE __m128i _mm_set_epi8(ui8 i15, ui8 i14, ui8 i13, ui8 i12, ui8 i11, ui8 i10, ui8 i9, ui8 i8,
+                                    ui8 i7, ui8 i6, ui8 i5, ui8 i4, ui8 i3, ui8 i2, ui8 i1, ui8 i0)
+{
+    int a0 = i0 | (i1<<8) | (i2<<16) | (i3<<24);
+    int a1 = i4 | (i5<<8) | (i6<<16) | (i7<<24);
+    int a2 = i8 | (i9<<8) | (i10<<16) | (i11<<24);
+    int a3 = i12 | (i13<<8) | (i14<<16) | (i15<<24);
+    return _mm_set_epi32(a3, a2, a1, a0);
+}
+
+Y_FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) {
+    __m128i ans;
+    ans.AsUi8x16 = vmaxq_u8(a.AsUi8x16, b.AsUi8x16);
+    return ans;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+Y_FORCE_INLINE __m128d _mm_undefined_pd(void) {
+    __m128d ans = ans;
+    return ans;
+}
+#pragma GCC diagnostic pop
+
+Y_FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double* b) {
+    a[1] = *b;
+    return a;
+}
+
+Y_FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double* b) {
+    a[0] = *b;
+    return a;
+}
+
+Y_FORCE_INLINE double _mm_cvtsd_f64(__m128d a) {
+    return a[0];
+}
+
+Y_FORCE_INLINE __m128d _mm_shuffle_pd(__m128d a, __m128d b, int mask) {
+    __m128d result;
+    const int litmsk = mask & 0x3;
+
+    if (litmsk == 0)
+        result = vzip1q_f64(a, b);
+    else if (litmsk == 1)
+        result = __builtin_shufflevector(a, b, 1, 2);
+    else if (litmsk == 2)
+        result = __builtin_shufflevector(a, b, 0, 3);
+    else
+        result = vzip2q_f64(a, b);
+    return result;
+}
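For context, this header lets code written against SSE2 intrinsics build unchanged on aarch64 by mapping the `_mm_*` names onto NEON. The sketch below is illustrative only and is not part of the commit: it assumes the Arcadia umbrella header `library/cpp/sse/sse.h` is on the include path (the `#error` message above tells callers to include it instead of `sse2neon.h` directly) and uses only names defined in this file (`_mm_loadu_si128`, `_mm_add_epi32`, `_mm_storeu_si128`).

```cpp
// Hypothetical usage sketch. On ARM64 builds the calls below are assumed to
// resolve to the NEON-backed shims from sse2neon.h (vld1q_u64 / vaddq_u32 /
// vst1q_u64); on x86_64 the same source compiles against real SSE2.
#include <library/cpp/sse/sse.h>

#include <cstddef>

// Adds two int32 arrays four lanes at a time; n is assumed to be a multiple of 4.
void AddEpi32(const int* a, const int* b, int* out, size_t n) {
    for (size_t i = 0; i < n; i += 4) {
        __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a + i));
        __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + i));
        __m128i sum = _mm_add_epi32(va, vb);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out + i), sum);
    }
}
```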