author     danlark <danlark@yandex-team.ru>              2022-02-10 16:46:10 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:46:10 +0300
commit     baa58daefa91fde4b4769facdbd2903763b9c6a8 (patch)
tree       1a2c5ffcf89eb53ecd79dbc9bc0a195c27404d0c /library/cpp/sse/sse2neon.h
parent     3426a9bc7f169ae9da54cef557ad2a33f6e8eee0 (diff)
download   ydb-baa58daefa91fde4b4769facdbd2903763b9c6a8.tar.gz
Restoring authorship annotation for <danlark@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/sse/sse2neon.h')
-rw-r--r--  library/cpp/sse/sse2neon.h  406
1 file changed, 203 insertions, 203 deletions
diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h
index f60711e77f..695dbd3041 100644
--- a/library/cpp/sse/sse2neon.h
+++ b/library/cpp/sse/sse2neon.h
@@ -17,7 +17,7 @@
#if !defined(_arm64_)
#error "This header is for ARM64 (aarch64) platform only. " \
- "Include sse.h instead of including this header directly."
+ "Include sse.h instead of including this header directly."
#endif
#include <arm_neon.h>
@@ -57,21 +57,21 @@ union __m128 {
typedef float64x2_t __m128d;
-enum _mm_hint
-{
- /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */
- _MM_HINT_ET0 = 7,
- _MM_HINT_ET1 = 6,
- _MM_HINT_T0 = 3,
- _MM_HINT_T1 = 2,
- _MM_HINT_T2 = 1,
- _MM_HINT_NTA = 0
-};
-
-Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
- __builtin_prefetch(p);
-}
-
+enum _mm_hint
+{
+ /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */
+ _MM_HINT_ET0 = 7,
+ _MM_HINT_ET1 = 6,
+ _MM_HINT_T0 = 3,
+ _MM_HINT_T1 = 2,
+ _MM_HINT_T2 = 1,
+ _MM_HINT_NTA = 0
+};
+
+Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
+ __builtin_prefetch(p);
+}
+
template <typename TType>
struct TQType;
@@ -299,9 +299,9 @@ using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
using _mm_andnot_si128 =
TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
-using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;
+using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;
-using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
+using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
@@ -342,7 +342,7 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
return res;
}
-using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
+using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
@@ -385,27 +385,27 @@ using _mm_cmplt_epi16 =
using _mm_cmplt_epi32 =
TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;
-Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
+Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
__m128i result;
result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
return result;
}
-Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
+Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
__m128i result;
result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
return result;
}
-Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
- return _mm_loadu_si128(ptr);
-}
-
-Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
+Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
+ return _mm_loadu_si128(ptr);
+}
+
+Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
}
-Y_FORCE_INLINE void
+Y_FORCE_INLINE void
_mm_store_si128(__m128i* ptr, const __m128i& op) {
vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
}
@@ -459,7 +459,7 @@ struct ShuffleStruct4 {
ui8 x[4];
};
-Y_FORCE_INLINE ShuffleStruct4
+Y_FORCE_INLINE ShuffleStruct4
_MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
ShuffleStruct4 result;
result.x[0] = x1;
@@ -469,7 +469,7 @@ _MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
return result;
}
-Y_FORCE_INLINE __m128i
+Y_FORCE_INLINE __m128i
_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
__m128i result;
const ui8 xi[4] = {
@@ -486,7 +486,7 @@ _mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
return result;
}
-Y_FORCE_INLINE int
+Y_FORCE_INLINE int
_mm_movemask_epi8(const __m128i& op) {
uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
@@ -542,7 +542,7 @@ struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {
#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)
-Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
+Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
return vgetq_lane_s32(op.AsSi32x4, 0);
}
@@ -558,18 +558,18 @@ struct _mm_set_epi16 : TBaseWrapper<__m128i> {
}
};
-struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
- Y_FORCE_INLINE
- _mm_setr_epi16(const short w7, const short w6,
- const short w5, const short w4,
- const short w3, const short w2,
- const short w1, const short w0) {
- int16x4_t d0 = {w7, w6, w5, w4};
- int16x4_t d1 = {w3, w2, w1, w0};
- TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
- }
-};
-
+struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
+ Y_FORCE_INLINE
+ _mm_setr_epi16(const short w7, const short w6,
+ const short w5, const short w4,
+ const short w3, const short w2,
+ const short w1, const short w0) {
+ int16x4_t d0 = {w7, w6, w5, w4};
+ int16x4_t d1 = {w3, w2, w1, w0};
+ TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
+ }
+};
+
struct _mm_set_epi32 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_set_epi32(const int x3, const int x2,
@@ -580,16 +580,16 @@ struct _mm_set_epi32 : TBaseWrapper<__m128i> {
}
};
-struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
- Y_FORCE_INLINE
- _mm_setr_epi32(const int x3, const int x2,
- const int x1, const int x0) {
- int32x2_t d0 = {x3, x2};
- int32x2_t d1 = {x1, x0};
- TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
- }
-};
-
+struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
+ Y_FORCE_INLINE
+ _mm_setr_epi32(const int x3, const int x2,
+ const int x1, const int x0) {
+ int32x2_t d0 = {x3, x2};
+ int32x2_t d1 = {x1, x0};
+ TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
+ }
+};
+
struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
Y_FORCE_INLINE
_mm_cvtsi32_si128(int op) {
@@ -642,31 +642,31 @@ struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
};
template<int imm>
-int extract_epi8_arm(__m128i arg) {
- return vgetq_lane_u8(arg.AsUi8x16, imm);
-}
-
-template<int imm>
-int extract_epi16_arm(__m128i arg) {
+int extract_epi8_arm(__m128i arg) {
+ return vgetq_lane_u8(arg.AsUi8x16, imm);
+}
+
+template<int imm>
+int extract_epi16_arm(__m128i arg) {
return vgetq_lane_u16(arg.AsUi16x8, imm);
}
-template<int imm>
-int extract_epi32_arm(__m128i arg) {
- return vgetq_lane_s32(arg.AsSi32x4, imm);
-}
-
-template<int imm>
-long long extract_epi64_arm(__m128i arg) {
- return vgetq_lane_s64(arg.AsSi64x2, imm);
-}
-
-#define _mm_extract_epi8(op, imm) extract_epi8_arm<imm>(op)
-#define _mm_extract_epi16(op, imm) extract_epi16_arm<imm>(op)
-#define _mm_extract_epi32(op, imm) extract_epi32_arm<imm>(op)
-#define _mm_extract_epi64(op, imm) extract_epi64_arm<imm>(op)
-#define _mm_extract_ps(op, imm) _mm_extract_epi32(op, imm)
-
+template<int imm>
+int extract_epi32_arm(__m128i arg) {
+ return vgetq_lane_s32(arg.AsSi32x4, imm);
+}
+
+template<int imm>
+long long extract_epi64_arm(__m128i arg) {
+ return vgetq_lane_s64(arg.AsSi64x2, imm);
+}
+
+#define _mm_extract_epi8(op, imm) extract_epi8_arm<imm>(op)
+#define _mm_extract_epi16(op, imm) extract_epi16_arm<imm>(op)
+#define _mm_extract_epi32(op, imm) extract_epi32_arm<imm>(op)
+#define _mm_extract_epi64(op, imm) extract_epi64_arm<imm>(op)
+#define _mm_extract_ps(op, imm) _mm_extract_epi32(op, imm)
+
static Y_FORCE_INLINE
__m128i _mm_mul_epu32(__m128i op1, __m128i op2) {
__m128i result;
@@ -734,27 +734,27 @@ struct _mm_setzero_ps : TBaseWrapper<__m128> {
}
};
-Y_FORCE_INLINE __m128d _mm_setzero_pd() {
+Y_FORCE_INLINE __m128d _mm_setzero_pd() {
return vdupq_n_f64(0.);
}
-Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
+Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
__m128 result;
result.AsFloat32x4 = vld1q_f32(ptr);
return result;
}
-Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
+Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
__m128 result;
result.AsFloat32x4 = vld1q_f32(ptr);
return result;
}
-Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
+Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
vst1q_f32(ptr, op.AsFloat32x4);
}
-Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
+Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
vst1q_f32(ptr, op.AsFloat32x4);
}
@@ -768,23 +768,23 @@ struct _mm_set_ps : TBaseWrapper<__m128> {
}
};
-Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
+Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
const float64x1_t p0 = {d0};
const float64x1_t p1 = {d1};
return vcombine_f64(p0, p1);
}
-Y_FORCE_INLINE __m128d _mm_loadu_pd(const double* d) {
+Y_FORCE_INLINE __m128d _mm_loadu_pd(const double* d) {
__m128d res;
res = vld1q_f64(d);
return res;
}
-Y_FORCE_INLINE void _mm_storeu_pd(double* res, __m128d a) {
+Y_FORCE_INLINE void _mm_storeu_pd(double* res, __m128d a) {
vst1q_f64(res, a);
}
-Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
+Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
vst1q_f64(res, a);
}
@@ -811,11 +811,11 @@ struct _mm_and_ps : TBaseWrapper<__m128> {
}
};
-Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
- return vandq_u64(a, b);
-}
-
-Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
+Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
+ return vandq_u64(a, b);
+}
+
+Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
float64x2_t im0 =
(float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
float64x2_t im1 =
@@ -831,11 +831,11 @@ Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m
TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
};
-Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
+Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
return reinterpret_cast<__m128&>(op);
}
-Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
+Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
return reinterpret_cast<__m128i&>(op);
}
@@ -878,7 +878,7 @@ using _mm_cvttps_epi32 =
TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
decltype(vcvtq_s32_f32), vcvtq_s32_f32>;
-Y_FORCE_INLINE int
+Y_FORCE_INLINE int
_mm_movemask_ps(const __m128& op) {
uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
@@ -924,27 +924,27 @@ inline void _mm_store_ss(float* p, __m128 a) {
*p = vgetq_lane_f32(a.AsFloat32x4, 0);
}
-inline float vgetg_lane_f32_switch(float32x4_t a, ui8 b) {
- switch (b & 0x3) {
- case 0:
- return vgetq_lane_f32(a, 0);
- case 1:
- return vgetq_lane_f32(a, 1);
- case 2:
- return vgetq_lane_f32(a, 2);
- case 3:
- return vgetq_lane_f32(a, 3);
- }
- return 0;
-}
-
+inline float vgetg_lane_f32_switch(float32x4_t a, ui8 b) {
+ switch (b & 0x3) {
+ case 0:
+ return vgetq_lane_f32(a, 0);
+ case 1:
+ return vgetq_lane_f32(a, 1);
+ case 2:
+ return vgetq_lane_f32(a, 2);
+ case 3:
+ return vgetq_lane_f32(a, 3);
+ }
+ return 0;
+}
+
inline __m128 _mm_shuffle_ps(__m128 a, __m128 b, const ShuffleStruct4& shuf) {
- __m128 ret;
- ret.AsFloat32x4 = vmovq_n_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[0]));
- ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[1]), ret.AsFloat32x4, 1);
- ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[2]), ret.AsFloat32x4, 2);
- ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[3]), ret.AsFloat32x4, 3);
- return ret;
+ __m128 ret;
+ ret.AsFloat32x4 = vmovq_n_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[0]));
+ ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(a.AsFloat32x4, shuf.x[1]), ret.AsFloat32x4, 1);
+ ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[2]), ret.AsFloat32x4, 2);
+ ret.AsFloat32x4 = vsetq_lane_f32(vgetg_lane_f32_switch(b.AsFloat32x4, shuf.x[3]), ret.AsFloat32x4, 3);
+ return ret;
}
inline __m128 _mm_or_ps(__m128 a, __m128 b) {
@@ -952,94 +952,94 @@ inline __m128 _mm_or_ps(__m128 a, __m128 b) {
res.AsUi32x4 = vorrq_u32(a.AsUi32x4, b.AsUi32x4);
return res;
}
-
-inline __m128i _mm_sad_epu8(__m128i a, __m128i b) {
- uint16x8_t t = vpaddlq_u8(vabdq_u8(a.AsUi8x16, b.AsUi8x16));
- uint16_t r0 = t[0] + t[1] + t[2] + t[3];
- uint16_t r4 = t[4] + t[5] + t[6] + t[7];
- uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
- __m128i ans;
- ans.AsUi16x8 = vsetq_lane_u16(r4, r, 4);
- return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {
- __m128i ans;
- ans.AsSi8x16 = vqsubq_s8(a.AsSi8x16, b.AsSi8x16);
- return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {
- __m128i ans;
- ans.AsSi16x8 = vqsubq_s16(a.AsSi16x8, b.AsSi16x8);
- return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {
- __m128i ans;
- ans.AsUi8x16 = vqsubq_u8(a.AsUi8x16, b.AsUi8x16);
- return ans;
-}
-
-Y_FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {
- __m128i ans;
- ans.AsUi16x8 = vqsubq_u16(a.AsUi16x8, b.AsUi16x8);
- return ans;
-}
-
-Y_FORCE_INLINE __m128d _mm_castsi128_pd(__m128i __A) {
- return reinterpret_cast<__m128d&>(__A);
-}
-
-Y_FORCE_INLINE __m128i _mm_set_epi8(ui8 i15, ui8 i14, ui8 i13, ui8 i12, ui8 i11, ui8 i10, ui8 i9, ui8 i8,
- ui8 i7, ui8 i6, ui8 i5, ui8 i4, ui8 i3, ui8 i2, ui8 i1, ui8 i0)
-{
- int a0 = i0 | (i1<<8) | (i2<<16) | (i3<<24);
- int a1 = i4 | (i5<<8) | (i6<<16) | (i7<<24);
- int a2 = i8 | (i9<<8) | (i10<<16) | (i11<<24);
- int a3 = i12 | (i13<<8) | (i14<<16) | (i15<<24);
- return _mm_set_epi32(a3, a2, a1, a0);
-}
-
-Y_FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) {
- __m128i ans;
- ans.AsUi8x16 = vmaxq_u8(a.AsUi8x16, b.AsUi8x16);
- return ans;
-}
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-Y_FORCE_INLINE __m128d _mm_undefined_pd(void) {
- __m128d ans = ans;
- return ans;
-}
-#pragma GCC diagnostic pop
-
-Y_FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double* b) {
- a[1] = *b;
- return a;
-}
-
-Y_FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double* b) {
- a[0] = *b;
- return a;
-}
-
-Y_FORCE_INLINE double _mm_cvtsd_f64(__m128d a) {
- return a[0];
-}
-
-Y_FORCE_INLINE __m128d _mm_shuffle_pd(__m128d a, __m128d b, int mask) {
- __m128d result;
- const int litmsk = mask & 0x3;
-
- if (litmsk == 0)
- result = vzip1q_f64(a, b);
- else if (litmsk == 1)
- result = __builtin_shufflevector(a, b, 1, 2);
- else if (litmsk == 2)
- result = __builtin_shufflevector(a, b, 0, 3);
- else
- result = vzip2q_f64(a, b);
- return result;
-}
+
+inline __m128i _mm_sad_epu8(__m128i a, __m128i b) {
+ uint16x8_t t = vpaddlq_u8(vabdq_u8(a.AsUi8x16, b.AsUi8x16));
+ uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+ uint16_t r4 = t[4] + t[5] + t[6] + t[7];
+ uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
+ __m128i ans;
+ ans.AsUi16x8 = vsetq_lane_u16(r4, r, 4);
+ return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {
+ __m128i ans;
+ ans.AsSi8x16 = vqsubq_s8(a.AsSi8x16, b.AsSi8x16);
+ return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {
+ __m128i ans;
+ ans.AsSi16x8 = vqsubq_s16(a.AsSi16x8, b.AsSi16x8);
+ return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {
+ __m128i ans;
+ ans.AsUi8x16 = vqsubq_u8(a.AsUi8x16, b.AsUi8x16);
+ return ans;
+}
+
+Y_FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {
+ __m128i ans;
+ ans.AsUi16x8 = vqsubq_u16(a.AsUi16x8, b.AsUi16x8);
+ return ans;
+}
+
+Y_FORCE_INLINE __m128d _mm_castsi128_pd(__m128i __A) {
+ return reinterpret_cast<__m128d&>(__A);
+}
+
+Y_FORCE_INLINE __m128i _mm_set_epi8(ui8 i15, ui8 i14, ui8 i13, ui8 i12, ui8 i11, ui8 i10, ui8 i9, ui8 i8,
+ ui8 i7, ui8 i6, ui8 i5, ui8 i4, ui8 i3, ui8 i2, ui8 i1, ui8 i0)
+{
+ int a0 = i0 | (i1<<8) | (i2<<16) | (i3<<24);
+ int a1 = i4 | (i5<<8) | (i6<<16) | (i7<<24);
+ int a2 = i8 | (i9<<8) | (i10<<16) | (i11<<24);
+ int a3 = i12 | (i13<<8) | (i14<<16) | (i15<<24);
+ return _mm_set_epi32(a3, a2, a1, a0);
+}
+
+Y_FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) {
+ __m128i ans;
+ ans.AsUi8x16 = vmaxq_u8(a.AsUi8x16, b.AsUi8x16);
+ return ans;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+Y_FORCE_INLINE __m128d _mm_undefined_pd(void) {
+ __m128d ans = ans;
+ return ans;
+}
+#pragma GCC diagnostic pop
+
+Y_FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double* b) {
+ a[1] = *b;
+ return a;
+}
+
+Y_FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double* b) {
+ a[0] = *b;
+ return a;
+}
+
+Y_FORCE_INLINE double _mm_cvtsd_f64(__m128d a) {
+ return a[0];
+}
+
+Y_FORCE_INLINE __m128d _mm_shuffle_pd(__m128d a, __m128d b, int mask) {
+ __m128d result;
+ const int litmsk = mask & 0x3;
+
+ if (litmsk == 0)
+ result = vzip1q_f64(a, b);
+ else if (litmsk == 1)
+ result = __builtin_shufflevector(a, b, 1, 2);
+ else if (litmsk == 2)
+ result = __builtin_shufflevector(a, b, 0, 3);
+ else
+ result = vzip2q_f64(a, b);
+ return result;
+}
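
A minimal usage sketch, not part of the commit: it assumes that library/cpp/sse/sse.h is the dispatch point that includes this header when _arm64_ is defined (as the #error message above suggests), that the TWrapperDual/TBaseWrapper wrapper types convert implicitly to __m128i as the rest of the header relies on, and that SumFirstLane is a hypothetical helper name. Under those assumptions, client code written against SSE2 intrinsics compiles unchanged on ARM64 through the NEON-backed wrappers shown in the diff.

#include <library/cpp/sse/sse.h> // assumed dispatch header; pulls in sse2neon.h on aarch64

// Hypothetical helper: add two 4 x int32 vectors loaded from unaligned memory
// and return lane 0 of the result.
static inline int SumFirstLane(const int* a, const int* b) {
    __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a)); // vld1q_u64 under the hood
    __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b));
    __m128i sum = _mm_add_epi32(va, vb);                               // vaddq_u32 under the hood
    return _mm_cvtsi128_si32(sum);                                     // a[0] + b[0]
}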