Restoring authorship annotation for <agri@yandex-team.ru>. Commit 2 of 2.

author: agri <agri@yandex-team.ru> 2022-02-10 16:48:12 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:48:12 +0300
commit: 2909866fbc652492b7d7cab3023cb19489dc4fd8 (patch)
tree: b222e5ac2e2e98872661c51ccceee5da0d291e13 /library/cpp/sse
parent: d3530b2692e400bd4d29bd4f07cafaee139164e7 (diff)
download: ydb-2909866fbc652492b7d7cab3023cb19489dc4fd8.tar.gz
4 files changed, 1726 insertions, 1726 deletions
diff --git a/library/cpp/sse/sse.h b/library/cpp/sse/sse.h
index 918a942803..19bac17de0 100644
--- a/library/cpp/sse/sse.h
+++ b/library/cpp/sse/sse.h
@@ -1,18 +1,18 @@
-#pragma once 
- 
-/* 
-  The header chooses appropriate SSE support. 
-  On Intel: SSE intrinsics 
-  On ARM64: translation to NEON intrinsics or software emulation 
+#pragma once
+
+/*
+  The header chooses appropriate SSE support.
+  On Intel: SSE intrinsics
+  On ARM64: translation to NEON intrinsics or software emulation
   On PowerPc: translation to Altivec intrinsics or software emulation
-*/ 
+*/
 /* Author: Vitaliy Manushkin <agri@yandex-team.ru>, Danila Kutenin <danlark@yandex-team.ru> */
- 
-#include <util/system/platform.h> 
+
+#include <util/system/platform.h>
 
 #if (defined(_i386_) || defined(_x86_64_)) && defined(_sse_)
-#include <xmmintrin.h> 
-#include <emmintrin.h> 
+#include <xmmintrin.h>
+#include <emmintrin.h>
 #include <pmmintrin.h>
 #define ARCADIA_SSE
 #if defined(_ssse3_)
@@ -24,10 +24,10 @@
 #if defined(_sse4_2_)
 #include <nmmintrin.h>
 #endif
-#elif defined(_arm64_) 
-#include "sse2neon.h" 
+#elif defined(_arm64_)
+#include "sse2neon.h"
 #define ARCADIA_SSE
 #elif defined(_ppc64_)
 #include "powerpc.h"
 #define ARCADIA_SSE
-#endif 
+#endif
diff --git a/library/cpp/sse/sse2neon.h b/library/cpp/sse/sse2neon.h
index af7f3ed242..695dbd3041 100644
--- a/library/cpp/sse/sse2neon.h
+++ b/library/cpp/sse/sse2neon.h
@@ -1,60 +1,60 @@
-#pragma once 
- 
-/* 
-  The header contains inlining code 
-  which translates SSE intrinsics to NEON intrinsics or software emulation. 
-  You are encouraged for commitments. 
-  Add missing intrinsics, add unittests, purify the implementation, 
-  merge and simplify templates. 
-  Warning: The code is made in deep nights, so it surely contains bugs, 
-  imperfections, flaws and all other kinds of errors and mistakes. 
-*/ 
-/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */ 
- 
-#include <util/system/platform.h> 
+#pragma once
+
+/*
+  The header contains inlining code
+  which translates SSE intrinsics to NEON intrinsics or software emulation.
+  You are encouraged for commitments.
+  Add missing intrinsics, add unittests, purify the implementation,
+  merge and simplify templates.
+  Warning: The code is made in deep nights, so it surely contains bugs,
+  imperfections, flaws and all other kinds of errors and mistakes.
+*/
+/* Author: Vitaliy Manushkin <agri@yandex-team.ru> */
+
+#include <util/system/platform.h>
 #include <util/system/compiler.h>
-#include <util/system/types.h> 
- 
-#if !defined(_arm64_) 
-#error "This header is for ARM64 (aarch64) platform only. " \ 
+#include <util/system/types.h>
+
+#if !defined(_arm64_)
+#error "This header is for ARM64 (aarch64) platform only. " \
     "Include sse.h instead of including this header directly."
-#endif 
- 
-#include <arm_neon.h> 
- 
-union __m128i { 
-    uint64x2_t AsUi64x2; 
-    int64x2_t AsSi64x2; 
- 
-    uint32x4_t AsUi32x4; 
-    int32x4_t AsSi32x4; 
- 
-    uint16x8_t AsUi16x8; 
-    int16x8_t AsSi16x8; 
- 
-    uint8x16_t AsUi8x16; 
-    int8x16_t AsSi8x16; 
- 
-    float32x4_t AsFloat32x4; 
-    float64x2_t AsFloat64x2; 
-}; 
- 
-union __m128 { 
-    float32x4_t AsFloat32x4; 
-    float64x2_t AsFloat64x2; 
+#endif
+
+#include <arm_neon.h>
+
+union __m128i {
+    uint64x2_t AsUi64x2;
+    int64x2_t AsSi64x2;
 
     uint32x4_t AsUi32x4;
     int32x4_t AsSi32x4;
 
-    uint64x2_t AsUi64x2; 
-    int64x2_t AsSi64x2; 
+    uint16x8_t AsUi16x8;
+    int16x8_t AsSi16x8;
 
-    uint8x16_t AsUi8x16; 
+    uint8x16_t AsUi8x16;
+    int8x16_t AsSi8x16;
+
+    float32x4_t AsFloat32x4;
+    float64x2_t AsFloat64x2;
+};
+
+union __m128 {
+    float32x4_t AsFloat32x4;
+    float64x2_t AsFloat64x2;
+
+    uint32x4_t AsUi32x4;
+    int32x4_t AsSi32x4;
+
+    uint64x2_t AsUi64x2;
+    int64x2_t AsSi64x2;
+
+    uint8x16_t AsUi8x16;
     int8x16_t AsSi8x16;
 
     __m128i As128i;
-}; 
- 
+};
+
 typedef float64x2_t __m128d;
 
 enum _mm_hint
@@ -72,128 +72,128 @@ Y_FORCE_INLINE void _mm_prefetch(const void *p, enum _mm_hint) {
     __builtin_prefetch(p);
 }
 
-template <typename TType> 
-struct TQType; 
- 
-template <> 
-struct TQType<uint8x16_t> { 
-    static inline uint8x16_t& As(__m128i& value) { 
-        return value.AsUi8x16; 
-    } 
-    static inline const uint8x16_t& As(const __m128i& value) { 
-        return value.AsUi8x16; 
-    } 
-}; 
- 
-template <> 
-struct TQType<int8x16_t> { 
-    static inline int8x16_t& As(__m128i& value) { 
-        return value.AsSi8x16; 
-    } 
-    static inline const int8x16_t& As(const __m128i& value) { 
-        return value.AsSi8x16; 
-    } 
-}; 
- 
-template <> 
-struct TQType<uint16x8_t> { 
-    static inline uint16x8_t& As(__m128i& value) { 
-        return value.AsUi16x8; 
-    } 
-    static inline const uint16x8_t& As(const __m128i& value) { 
-        return value.AsUi16x8; 
-    } 
-}; 
- 
-template <> 
-struct TQType<int16x8_t> { 
-    static inline int16x8_t& As(__m128i& value) { 
-        return value.AsSi16x8; 
-    } 
-    static inline const int16x8_t& As(const __m128i& value) { 
-        return value.AsSi16x8; 
-    } 
-}; 
- 
-template <> 
-struct TQType<uint32x4_t> { 
-    static inline uint32x4_t& As(__m128i& value) { 
-        return value.AsUi32x4; 
-    } 
-    static inline const uint32x4_t& As(const __m128i& value) { 
-        return value.AsUi32x4; 
-    } 
-}; 
- 
-template <> 
-struct TQType<int32x4_t> { 
-    static inline int32x4_t& As(__m128i& value) { 
-        return value.AsSi32x4; 
-    } 
-    static inline const int32x4_t& As(const __m128i& value) { 
-        return value.AsSi32x4; 
-    } 
-}; 
- 
-template <> 
-struct TQType<uint64x2_t> { 
-    static inline uint64x2_t& As(__m128i& value) { 
-        return value.AsUi64x2; 
-    } 
-    static inline const uint64x2_t& As(const __m128i& value) { 
-        return value.AsUi64x2; 
-    } 
-    static inline uint64x2_t& As(__m128& value) { 
-        return value.AsUi64x2; 
-    } 
-    static inline const uint64x2_t& As(const __m128& value) { 
-        return value.AsUi64x2; 
-    } 
-}; 
- 
-template <> 
-struct TQType<int64x2_t> { 
-    static inline int64x2_t& As(__m128i& value) { 
-        return value.AsSi64x2; 
-    } 
-    static inline const int64x2_t& As(const __m128i& value) { 
-        return value.AsSi64x2; 
-    } 
-}; 
- 
-template <typename TValue> 
-struct TBaseWrapper { 
-    TValue Value; 
- 
+template <typename TType>
+struct TQType;
+
+template <>
+struct TQType<uint8x16_t> {
+    static inline uint8x16_t& As(__m128i& value) {
+        return value.AsUi8x16;
+    }
+    static inline const uint8x16_t& As(const __m128i& value) {
+        return value.AsUi8x16;
+    }
+};
+
+template <>
+struct TQType<int8x16_t> {
+    static inline int8x16_t& As(__m128i& value) {
+        return value.AsSi8x16;
+    }
+    static inline const int8x16_t& As(const __m128i& value) {
+        return value.AsSi8x16;
+    }
+};
+
+template <>
+struct TQType<uint16x8_t> {
+    static inline uint16x8_t& As(__m128i& value) {
+        return value.AsUi16x8;
+    }
+    static inline const uint16x8_t& As(const __m128i& value) {
+        return value.AsUi16x8;
+    }
+};
+
+template <>
+struct TQType<int16x8_t> {
+    static inline int16x8_t& As(__m128i& value) {
+        return value.AsSi16x8;
+    }
+    static inline const int16x8_t& As(const __m128i& value) {
+        return value.AsSi16x8;
+    }
+};
+
+template <>
+struct TQType<uint32x4_t> {
+    static inline uint32x4_t& As(__m128i& value) {
+        return value.AsUi32x4;
+    }
+    static inline const uint32x4_t& As(const __m128i& value) {
+        return value.AsUi32x4;
+    }
+};
+
+template <>
+struct TQType<int32x4_t> {
+    static inline int32x4_t& As(__m128i& value) {
+        return value.AsSi32x4;
+    }
+    static inline const int32x4_t& As(const __m128i& value) {
+        return value.AsSi32x4;
+    }
+};
+
+template <>
+struct TQType<uint64x2_t> {
+    static inline uint64x2_t& As(__m128i& value) {
+        return value.AsUi64x2;
+    }
+    static inline const uint64x2_t& As(const __m128i& value) {
+        return value.AsUi64x2;
+    }
+    static inline uint64x2_t& As(__m128& value) {
+        return value.AsUi64x2;
+    }
+    static inline const uint64x2_t& As(const __m128& value) {
+        return value.AsUi64x2;
+    }
+};
+
+template <>
+struct TQType<int64x2_t> {
+    static inline int64x2_t& As(__m128i& value) {
+        return value.AsSi64x2;
+    }
+    static inline const int64x2_t& As(const __m128i& value) {
+        return value.AsSi64x2;
+    }
+};
+
+template <typename TValue>
+struct TBaseWrapper {
+    TValue Value;
+
     Y_FORCE_INLINE
-    operator TValue&() { 
-        return Value; 
-    } 
- 
+    operator TValue&() {
+        return Value;
+    }
+
     Y_FORCE_INLINE
-    operator const TValue&() const { 
-        return Value; 
-    } 
-}; 
- 
-template <typename TOp, typename TFunc, TFunc* func, 
-          typename TDup, TDup* dupfunc> 
-struct TWrapperSingleDup: public TBaseWrapper<__m128i> { 
+    operator const TValue&() const {
+        return Value;
+    }
+};
+
+template <typename TOp, typename TFunc, TFunc* func,
+          typename TDup, TDup* dupfunc>
+struct TWrapperSingleDup: public TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TWrapperSingleDup(const __m128i& op, const int shift) { 
-        TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift)); 
-    } 
-}; 
- 
-template <typename TOp, typename TFunc, TFunc* func, 
-          typename TDup, TDup* dupfunc> 
-struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> { 
+    TWrapperSingleDup(const __m128i& op, const int shift) {
+        TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(shift));
+    }
+};
+
+template <typename TOp, typename TFunc, TFunc* func,
+          typename TDup, TDup* dupfunc>
+struct TWrapperSingleNegDup: public TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TWrapperSingleNegDup(const __m128i& op, const int shift) { 
-        TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift)); 
-    } 
-}; 
- 
+    TWrapperSingleNegDup(const __m128i& op, const int shift) {
+        TQType<TOp>::As(Value) = func(TQType<TOp>::As(op), dupfunc(-shift));
+    }
+};
+
 inline __m128i _mm_srl_epi16(__m128i a, __m128i count) {
     __m128i res;
     res.AsUi16x8 = vshlq_u16(a.AsUi16x8, vdupq_n_s16(-count.AsUi16x8[0]));
@@ -225,16 +225,16 @@ inline __m128i _mm_srai_epi32(__m128i a, int count) {
     return res;
 }
 
-using _mm_srli_epi16 = 
-    TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16, 
-                         decltype(vdupq_n_s16), vdupq_n_s16>; 
-using _mm_srli_epi32 = 
-    TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32, 
-                         decltype(vdupq_n_s32), vdupq_n_s32>; 
-using _mm_srli_epi64 = 
-    TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, 
-                         decltype(vdupq_n_s64), vdupq_n_s64>; 
- 
+using _mm_srli_epi16 =
+    TWrapperSingleNegDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
+                         decltype(vdupq_n_s16), vdupq_n_s16>;
+using _mm_srli_epi32 =
+    TWrapperSingleNegDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
+                         decltype(vdupq_n_s32), vdupq_n_s32>;
+using _mm_srli_epi64 =
+    TWrapperSingleNegDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
+                         decltype(vdupq_n_s64), vdupq_n_s64>;
+
 
 inline __m128i _mm_sll_epi16(__m128i a, __m128i count) {
     __m128i res;
@@ -255,57 +255,57 @@ inline __m128i _mm_sll_epi64(__m128i a, __m128i count) {
     return res;
 }
 
-using _mm_slli_epi16 = 
-    TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16, 
-                      decltype(vdupq_n_s16), vdupq_n_s16>; 
-using _mm_slli_epi32 = 
-    TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32, 
-                      decltype(vdupq_n_s32), vdupq_n_s32>; 
-using _mm_slli_epi64 = 
-    TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64, 
-                      decltype(vdupq_n_s64), vdupq_n_s64>; 
- 
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams> 
-struct TWrapperDual : TBaseWrapper<__m128i> { 
+using _mm_slli_epi16 =
+    TWrapperSingleDup<uint16x8_t, decltype(vshlq_u16), vshlq_u16,
+                      decltype(vdupq_n_s16), vdupq_n_s16>;
+using _mm_slli_epi32 =
+    TWrapperSingleDup<uint32x4_t, decltype(vshlq_u32), vshlq_u32,
+                      decltype(vdupq_n_s32), vdupq_n_s32>;
+using _mm_slli_epi64 =
+    TWrapperSingleDup<uint64x2_t, decltype(vshlq_u64), vshlq_u64,
+                      decltype(vdupq_n_s64), vdupq_n_s64>;
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperDual : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) { 
-        TQType<TOp>::As(Value) = (TOp) 
-            func(TQType<TOp>::As(op1), 
-                 TQType<TOp>::As(op2), 
-                 params...); 
-    } 
-}; 
- 
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams> 
-struct TWrapperDualSwap : TBaseWrapper<__m128i> { 
+    TWrapperDual(const __m128i& op1, const __m128i& op2, TParams... params) {
+        TQType<TOp>::As(Value) = (TOp)
+            func(TQType<TOp>::As(op1),
+                 TQType<TOp>::As(op2),
+                 params...);
+    }
+};
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperDualSwap : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) { 
-        TQType<TOp>::As(Value) = 
-            func(TQType<TOp>::As(op2), 
-                 TQType<TOp>::As(op1), 
-                 params...); 
-    } 
-}; 
- 
+    TWrapperDualSwap(const __m128i& op1, const __m128i& op2, TParams... params) {
+        TQType<TOp>::As(Value) =
+            func(TQType<TOp>::As(op2),
+                 TQType<TOp>::As(op1),
+                 params...);
+    }
+};
+
 template <typename TOp, typename TFunc, TFunc* func, typename TArgument = __m128>
 struct TWrapperDualF : TBaseWrapper<TArgument> {
     Y_FORCE_INLINE
     TWrapperDualF(const TArgument& op1, const TArgument& op2) {
         TQType<TOp>::As(TBaseWrapper<TArgument>::Value) = (TOp) func(TQType<TOp>::As(op1), TQType<TOp>::As(op2));
-    } 
-}; 
- 
-using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>; 
-using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>; 
-using _mm_andnot_si128 = 
-    TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>; 
+    }
+};
+
+using _mm_or_si128 = TWrapperDual<uint64x2_t, decltype(vorrq_u64), vorrq_u64>;
+using _mm_and_si128 = TWrapperDual<uint64x2_t, decltype(vandq_u64), vandq_u64>;
+using _mm_andnot_si128 =
+    TWrapperDualSwap<uint64x2_t, decltype(vbicq_u64), vbicq_u64>;
 using _mm_xor_si128 = TWrapperDual<uint64x2_t, decltype(veorq_u64), veorq_u64>;
- 
+
 using _mm_add_epi8 = TWrapperDual<uint8x16_t, decltype(vaddq_u8), vaddq_u8>;
-using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>; 
-using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>; 
-using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>; 
- 
+using _mm_add_epi16 = TWrapperDual<uint16x8_t, decltype(vaddq_u16), vaddq_u16>;
+using _mm_add_epi32 = TWrapperDual<uint32x4_t, decltype(vaddq_u32), vaddq_u32>;
+using _mm_add_epi64 = TWrapperDual<uint64x2_t, decltype(vaddq_u64), vaddq_u64>;
+
 inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
     int32x4_t aLow;
     int32x4_t aHigh;
@@ -343,118 +343,118 @@ inline __m128i _mm_madd_epi16(__m128i a, __m128i b) {
 }
 
 using _mm_sub_epi8 = TWrapperDual<uint8x16_t, decltype(vsubq_u8), vsubq_u8>;
-using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>; 
-using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>; 
-using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>; 
- 
-using _mm_unpacklo_epi8 = 
-    TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>; 
-using _mm_unpackhi_epi8 = 
-    TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>; 
-using _mm_unpacklo_epi16 = 
-    TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>; 
-using _mm_unpackhi_epi16 = 
-    TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>; 
-using _mm_unpacklo_epi32 = 
-    TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>; 
-using _mm_unpackhi_epi32 = 
-    TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>; 
-using _mm_unpacklo_epi64 = 
-    TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>; 
-using _mm_unpackhi_epi64 = 
-    TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>; 
- 
-using _mm_cmpeq_epi8 = 
-    TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>; 
-using _mm_cmpeq_epi16 = 
-    TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>; 
-using _mm_cmpeq_epi32 = 
-    TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>; 
- 
-using _mm_cmpgt_epi8 = 
-    TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>; 
-using _mm_cmpgt_epi16 = 
-    TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>; 
-using _mm_cmpgt_epi32 = 
-    TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>; 
- 
-using _mm_cmplt_epi8 = 
-    TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>; 
-using _mm_cmplt_epi16 = 
-    TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>; 
-using _mm_cmplt_epi32 = 
-    TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>; 
- 
+using _mm_sub_epi16 = TWrapperDual<uint16x8_t, decltype(vsubq_u16), vsubq_u16>;
+using _mm_sub_epi32 = TWrapperDual<uint32x4_t, decltype(vsubq_u32), vsubq_u32>;
+using _mm_sub_epi64 = TWrapperDual<uint64x2_t, decltype(vsubq_u64), vsubq_u64>;
+
+using _mm_unpacklo_epi8 =
+    TWrapperDual<uint8x16_t, decltype(vzip1q_u8), vzip1q_u8>;
+using _mm_unpackhi_epi8 =
+    TWrapperDual<uint8x16_t, decltype(vzip2q_u8), vzip2q_u8>;
+using _mm_unpacklo_epi16 =
+    TWrapperDual<uint16x8_t, decltype(vzip1q_u16), vzip1q_u16>;
+using _mm_unpackhi_epi16 =
+    TWrapperDual<uint16x8_t, decltype(vzip2q_u16), vzip2q_u16>;
+using _mm_unpacklo_epi32 =
+    TWrapperDual<uint32x4_t, decltype(vzip1q_u32), vzip1q_u32>;
+using _mm_unpackhi_epi32 =
+    TWrapperDual<uint32x4_t, decltype(vzip2q_u32), vzip2q_u32>;
+using _mm_unpacklo_epi64 =
+    TWrapperDual<uint64x2_t, decltype(vzip1q_u64), vzip1q_u64>;
+using _mm_unpackhi_epi64 =
+    TWrapperDual<uint64x2_t, decltype(vzip2q_u64), vzip2q_u64>;
+
+using _mm_cmpeq_epi8 =
+    TWrapperDual<uint8x16_t, decltype(vceqq_u8), vceqq_u8>;
+using _mm_cmpeq_epi16 =
+    TWrapperDual<uint16x8_t, decltype(vceqq_u16), vceqq_u16>;
+using _mm_cmpeq_epi32 =
+    TWrapperDual<uint32x4_t, decltype(vceqq_u32), vceqq_u32>;
+
+using _mm_cmpgt_epi8 =
+    TWrapperDual<int8x16_t, decltype(vcgtq_s8), vcgtq_s8>;
+using _mm_cmpgt_epi16 =
+    TWrapperDual<int16x8_t, decltype(vcgtq_s16), vcgtq_s16>;
+using _mm_cmpgt_epi32 =
+    TWrapperDual<int32x4_t, decltype(vcgtq_s32), vcgtq_s32>;
+
+using _mm_cmplt_epi8 =
+    TWrapperDual<int8x16_t, decltype(vcltq_s8), vcltq_s8>;
+using _mm_cmplt_epi16 =
+    TWrapperDual<int16x8_t, decltype(vcltq_s16), vcltq_s16>;
+using _mm_cmplt_epi32 =
+    TWrapperDual<int32x4_t, decltype(vcltq_s32), vcltq_s32>;
+
 Y_FORCE_INLINE __m128i _mm_load_si128(const __m128i* ptr) {
-    __m128i result; 
+    __m128i result;
     result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
-    return result; 
-} 
- 
+    return result;
+}
+
 Y_FORCE_INLINE __m128i _mm_loadu_si128(const __m128i* ptr) {
-    __m128i result; 
+    __m128i result;
     result.AsUi64x2 = vld1q_u64((const uint64_t*)ptr);
-    return result; 
-} 
- 
+    return result;
+}
+
 Y_FORCE_INLINE __m128i _mm_lddqu_si128(const __m128i* ptr) {
     return _mm_loadu_si128(ptr);
 }
 
 Y_FORCE_INLINE void _mm_storeu_si128(__m128i* ptr, const __m128i& op) {
     vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
-} 
- 
+}
+
 Y_FORCE_INLINE void
-_mm_store_si128(__m128i* ptr, const __m128i& op) { 
+_mm_store_si128(__m128i* ptr, const __m128i& op) {
     vst1q_u64((uint64_t*)ptr, op.AsUi64x2);
-} 
- 
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams> 
-struct TWrapperSimple : TBaseWrapper<__m128i> { 
+}
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperSimple : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TWrapperSimple(TParams... params) { 
-        TQType<TOp>::As(Value) = func(params...); 
-    } 
-}; 
- 
-template <typename TOp, typename TFunc, TFunc* func, typename... TParams> 
-struct TWrapperSimpleF : TBaseWrapper<__m128> { 
+    TWrapperSimple(TParams... params) {
+        TQType<TOp>::As(Value) = func(params...);
+    }
+};
+
+template <typename TOp, typename TFunc, TFunc* func, typename... TParams>
+struct TWrapperSimpleF : TBaseWrapper<__m128> {
     Y_FORCE_INLINE
-    TWrapperSimpleF(TParams... params) { 
-        TQType<TOp>::As(Value) = func(params...); 
-    } 
-}; 
- 
-using _mm_set1_epi8 = 
-    TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>; 
-using _mm_set1_epi16 = 
-    TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>; 
-using _mm_set1_epi32 = 
-    TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>; 
- 
-struct _mm_setzero_si128 : TBaseWrapper<__m128i> { 
+    TWrapperSimpleF(TParams... params) {
+        TQType<TOp>::As(Value) = func(params...);
+    }
+};
+
+using _mm_set1_epi8 =
+    TWrapperSimple<int8x16_t, decltype(vdupq_n_s8), vdupq_n_s8, const char>;
+using _mm_set1_epi16 =
+    TWrapperSimple<int16x8_t, decltype(vdupq_n_s16), vdupq_n_s16, const ui16>;
+using _mm_set1_epi32 =
+    TWrapperSimple<int32x4_t, decltype(vdupq_n_s32), vdupq_n_s32, const ui32>;
+
+struct _mm_setzero_si128 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    _mm_setzero_si128() { 
-        TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0); 
-    } 
-}; 
- 
-struct _mm_loadl_epi64 : TBaseWrapper<__m128i> { 
+    _mm_setzero_si128() {
+        TQType<uint64x2_t>::As(Value) = vdupq_n_u64(0);
+    }
+};
+
+struct _mm_loadl_epi64 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    _mm_loadl_epi64(const __m128i* p) { 
+    _mm_loadl_epi64(const __m128i* p) {
         uint64x1_t im = vld1_u64((const uint64_t*)p);
-        TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0)); 
-    } 
-}; 
- 
-struct _mm_storel_epi64 : TBaseWrapper<__m128i> { 
+        TQType<uint64x2_t>::As(Value) = vcombine_u64(im, vdup_n_u64(0));
+    }
+};
+
+struct _mm_storel_epi64 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    _mm_storel_epi64(__m128i* a, __m128i op) { 
+    _mm_storel_epi64(__m128i* a, __m128i op) {
         vst1_u64((uint64_t*)a, vget_low_u64(op.AsUi64x2));
-    } 
-}; 
- 
+    }
+};
+
 struct ShuffleStruct4 {
     ui8 x[4];
 };
@@ -470,45 +470,45 @@ _MM_SHUFFLE(ui8 x4, ui8 x3, ui8 x2, ui8 x1) {
 }
 
 Y_FORCE_INLINE __m128i
-_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) { 
-    __m128i result; 
+_mm_shuffle_epi32(const __m128i& op1, const ShuffleStruct4& op2) {
+    __m128i result;
     const ui8 xi[4] = {
         ui8(op2.x[0] * 4), ui8(op2.x[1] * 4),
         ui8(op2.x[2] * 4), ui8(op2.x[3] * 4)
     };
     const uint8x16_t transform = {
-        ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3), 
-        ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3), 
-        ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3), 
+        ui8(xi[0]), ui8(xi[0] + 1), ui8(xi[0] + 2), ui8(xi[0] + 3),
+        ui8(xi[1]), ui8(xi[1] + 1), ui8(xi[1] + 2), ui8(xi[1] + 3),
+        ui8(xi[2]), ui8(xi[2] + 1), ui8(xi[2] + 2), ui8(xi[2] + 3),
         ui8(xi[3]), ui8(xi[3] + 1), ui8(xi[3] + 2), ui8(xi[3] + 3)
     };
-    result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform); 
-    return result; 
-} 
- 
+    result.AsUi8x16 = vqtbl1q_u8(op1.AsUi8x16, transform);
+    return result;
+}
+
 Y_FORCE_INLINE int
-_mm_movemask_epi8(const __m128i& op) { 
-    uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
-                       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; 
-    uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask); 
-    int8x16_t byteshifter = { 
-        0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7}; 
-    uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter); 
-    int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7}; 
-    uint16x8_t wordshifted = 
-        vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter); 
-    return vaddvq_u16(wordshifted); 
-} 
- 
-template <int imm> 
-struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> { 
+_mm_movemask_epi8(const __m128i& op) {
+    uint8x16_t mask = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                       0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+    uint8x16_t opmasked = vandq_u8(op.AsUi8x16, mask);
+    int8x16_t byteshifter = {
+        0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7, 0, -7};
+    uint8x16_t opshifted = vshlq_u8(opmasked, byteshifter);
+    int16x8_t wordshifter = {-7, -5, -3, -1, 1, 3, 5, 7};
+    uint16x8_t wordshifted =
+        vshlq_u16(vreinterpretq_u16_u8(opshifted), wordshifter);
+    return vaddvq_u16(wordshifted);
+}
+
+template <int imm>
+struct THelper_mm_srli_si128 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
     THelper_mm_srli_si128(const __m128i a) {
         const auto zero = vdupq_n_u8(0);
-        TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm); 
-    } 
-}; 
- 
+        TQType<uint8x16_t>::As(Value) = vextq_u8(a.AsUi8x16, zero, imm);
+    }
+};
+
 template <>
 struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
@@ -518,8 +518,8 @@ struct THelper_mm_srli_si128<16> : TBaseWrapper<__m128i> {
     }
 };
 
-#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a) 
- 
+#define _mm_srli_si128(a, imm) THelper_mm_srli_si128<imm>(a)
+
 template<int imm>
 inline uint8x16_t vextq_u8_function(uint8x16_t a, uint8x16_t b) {
     return vextq_u8(a, b, imm);
@@ -531,33 +531,33 @@ inline uint8x16_t vextq_u8_function<16>(uint8x16_t /* a */, uint8x16_t b) {
 }
 
 
-template <int imm> 
-struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> { 
+template <int imm>
+struct THelper_mm_slli_si128 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
     THelper_mm_slli_si128(const __m128i a) {
-        auto zero = vdupq_n_u8(0); 
+        auto zero = vdupq_n_u8(0);
         TQType<uint8x16_t>::As(Value) = vextq_u8_function<16 - imm>(zero, a.AsUi8x16);
-    } 
-}; 
- 
-#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a) 
- 
+    }
+};
+
+#define _mm_slli_si128(a, imm) THelper_mm_slli_si128<imm>(a)
+
 Y_FORCE_INLINE int _mm_cvtsi128_si32(const __m128i& op) {
-    return vgetq_lane_s32(op.AsSi32x4, 0); 
-} 
- 
-struct _mm_set_epi16 : TBaseWrapper<__m128i> { 
+    return vgetq_lane_s32(op.AsSi32x4, 0);
+}
+
+struct _mm_set_epi16 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    _mm_set_epi16(const short w7, const short w6, 
-                  const short w5, const short w4, 
-                  const short w3, const short w2, 
-                  const short w1, const short w0) { 
-        int16x4_t d0 = {w0, w1, w2, w3}; 
-        int16x4_t d1 = {w4, w5, w6, w7}; 
-        TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1); 
-    } 
-}; 
- 
+    _mm_set_epi16(const short w7, const short w6,
+                  const short w5, const short w4,
+                  const short w3, const short w2,
+                  const short w1, const short w0) {
+        int16x4_t d0 = {w0, w1, w2, w3};
+        int16x4_t d1 = {w4, w5, w6, w7};
+        TQType<int16x8_t>::As(Value) = vcombine_s16(d0, d1);
+    }
+};
+
 struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
     _mm_setr_epi16(const short w7, const short w6,
@@ -570,16 +570,16 @@ struct _mm_setr_epi16 : TBaseWrapper<__m128i> {
     }
 };
 
-struct _mm_set_epi32 : TBaseWrapper<__m128i> { 
+struct _mm_set_epi32 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    _mm_set_epi32(const int x3, const int x2, 
-                  const int x1, const int x0) { 
-        int32x2_t d0 = {x0, x1}; 
-        int32x2_t d1 = {x2, x3}; 
-        TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1); 
-    } 
-}; 
- 
+    _mm_set_epi32(const int x3, const int x2,
+                  const int x1, const int x0) {
+        int32x2_t d0 = {x0, x1};
+        int32x2_t d1 = {x2, x3};
+        TQType<int32x4_t>::As(Value) = vcombine_s32(d0, d1);
+    }
+};
+
 struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
     _mm_setr_epi32(const int x3, const int x2,
@@ -590,14 +590,14 @@ struct _mm_setr_epi32 : TBaseWrapper<__m128i> {
     }
 };
 
-struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> { 
+struct _mm_cvtsi32_si128 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    _mm_cvtsi32_si128(int op) { 
-        auto zero = vdupq_n_s32(0); 
-        TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0); 
-    } 
-}; 
- 
+    _mm_cvtsi32_si128(int op) {
+        auto zero = vdupq_n_s32(0);
+        TQType<int32x4_t>::As(Value) = vsetq_lane_s32(op, zero, 0);
+    }
+};
+
 struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
     _mm_cvtsi64_si128(i64 op) {
@@ -606,41 +606,41 @@ struct _mm_cvtsi64_si128 : TBaseWrapper<__m128i> {
     }
 };
 
-template <typename TOpOut, typename TOpIn, 
-          typename TFunc, TFunc* func, 
-          typename TCombine, TCombine* combine> 
-struct TCombineWrapper : TBaseWrapper<__m128i> { 
+template <typename TOpOut, typename TOpIn,
+          typename TFunc, TFunc* func,
+          typename TCombine, TCombine* combine>
+struct TCombineWrapper : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TCombineWrapper(const __m128i op1, const __m128i op2) { 
-        TQType<TOpOut>::As(Value) = 
-            combine(func(TQType<TOpIn>::As(op1)), 
-                    func(TQType<TOpIn>::As(op2))); 
-    } 
-}; 
- 
-using _mm_packs_epi16 = 
-    TCombineWrapper<int8x16_t, int16x8_t, 
-                    decltype(vqmovn_s16), vqmovn_s16, 
-                    decltype(vcombine_s8), vcombine_s8>; 
-using _mm_packs_epi32 = 
-    TCombineWrapper<int16x8_t, int32x4_t, 
-                    decltype(vqmovn_s32), vqmovn_s32, 
-                    decltype(vcombine_s16), vcombine_s16>; 
-using _mm_packus_epi16 = 
-    TCombineWrapper<uint8x16_t, int16x8_t, 
-                    decltype(vqmovun_s16), vqmovun_s16, 
-                    decltype(vcombine_u8), vcombine_u8>; 
- 
-template <typename TOpOut, typename TOpIn, 
-          typename TFunc, TFunc* func, typename... TParams> 
-struct TScalarOutWrapper : TBaseWrapper<TOpOut> { 
+    TCombineWrapper(const __m128i op1, const __m128i op2) {
+        TQType<TOpOut>::As(Value) =
+            combine(func(TQType<TOpIn>::As(op1)),
+                    func(TQType<TOpIn>::As(op2)));
+    }
+};
+
+using _mm_packs_epi16 =
+    TCombineWrapper<int8x16_t, int16x8_t,
+                    decltype(vqmovn_s16), vqmovn_s16,
+                    decltype(vcombine_s8), vcombine_s8>;
+using _mm_packs_epi32 =
+    TCombineWrapper<int16x8_t, int32x4_t,
+                    decltype(vqmovn_s32), vqmovn_s32,
+                    decltype(vcombine_s16), vcombine_s16>;
+using _mm_packus_epi16 =
+    TCombineWrapper<uint8x16_t, int16x8_t,
+                    decltype(vqmovun_s16), vqmovun_s16,
+                    decltype(vcombine_u8), vcombine_u8>;
+
+template <typename TOpOut, typename TOpIn,
+          typename TFunc, TFunc* func, typename... TParams>
+struct TScalarOutWrapper : TBaseWrapper<TOpOut> {
     Y_FORCE_INLINE
-    TScalarOutWrapper(const __m128i op, TParams... params) { 
-        TBaseWrapper<TOpOut>::Value = 
-            func(TQType<TOpIn>::As(op), params...); 
-    } 
-}; 
- 
+    TScalarOutWrapper(const __m128i op, TParams... params) {
+        TBaseWrapper<TOpOut>::Value =
+            func(TQType<TOpIn>::As(op), params...);
+    }
+};
+
 template<int imm>
 int extract_epi8_arm(__m128i arg) {
     return vgetq_lane_u8(arg.AsUi8x16, imm);
@@ -649,13 +649,13 @@ int extract_epi8_arm(__m128i arg) {
 template<int imm>
 int extract_epi16_arm(__m128i arg) {
     return vgetq_lane_u16(arg.AsUi16x8, imm);
-} 
- 
+}
+
 template<int imm>
 int extract_epi32_arm(__m128i arg) {
     return vgetq_lane_s32(arg.AsSi32x4, imm);
 }
- 
+
 template<int imm>
 long long extract_epi64_arm(__m128i arg) {
     return vgetq_lane_s64(arg.AsSi64x2, imm);
@@ -669,49 +669,49 @@ long long extract_epi64_arm(__m128i arg) {
 
 static Y_FORCE_INLINE
 __m128i _mm_mul_epu32(__m128i op1, __m128i op2) {
-    __m128i result; 
-    uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4); 
-    uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4); 
-    result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2)); 
-    return result; 
-} 
- 
-template <> 
-struct TQType<float32x4_t> { 
-    static inline float32x4_t& As(__m128& value) { 
-        return value.AsFloat32x4; 
-    } 
- 
-    static inline const float32x4_t& As(const __m128& value) { 
-        return value.AsFloat32x4; 
-    } 
- 
-    static inline float32x4_t& As(__m128i& value) { 
-        return value.AsFloat32x4; 
-    } 
- 
-    static inline const float32x4_t& As(const __m128i& value) { 
-        return value.AsFloat32x4; 
-    } 
-}; 
- 
-template <> 
-struct TQType<float64x2_t> { 
-    static inline float64x2_t& As(__m128& value) { 
-        return value.AsFloat64x2; 
-    } 
- 
-    static inline const float64x2_t& As(const __m128& value) { 
-        return value.AsFloat64x2; 
-    } 
- 
-    static inline float64x2_t& As(__m128i& value) { 
-        return value.AsFloat64x2; 
-    } 
- 
-    static inline const float64x2_t& As(const __m128i& value) { 
-        return value.AsFloat64x2; 
-    } 
+    __m128i result;
+    uint32x4_t r1 = vuzp1q_u32(op1.AsUi32x4, op2.AsUi32x4);
+    uint32x4_t r2 = vuzp1q_u32(op2.AsUi32x4, op1.AsUi32x4);
+    result.AsUi64x2 = vmull_u32(vget_low_u32(r1), vget_low_u32(r2));
+    return result;
+}
+
+template <>
+struct TQType<float32x4_t> {
+    static inline float32x4_t& As(__m128& value) {
+        return value.AsFloat32x4;
+    }
+
+    static inline const float32x4_t& As(const __m128& value) {
+        return value.AsFloat32x4;
+    }
+
+    static inline float32x4_t& As(__m128i& value) {
+        return value.AsFloat32x4;
+    }
+
+    static inline const float32x4_t& As(const __m128i& value) {
+        return value.AsFloat32x4;
+    }
+};
+
+template <>
+struct TQType<float64x2_t> {
+    static inline float64x2_t& As(__m128& value) {
+        return value.AsFloat64x2;
+    }
+
+    static inline const float64x2_t& As(const __m128& value) {
+        return value.AsFloat64x2;
+    }
+
+    static inline float64x2_t& As(__m128i& value) {
+        return value.AsFloat64x2;
+    }
+
+    static inline const float64x2_t& As(const __m128i& value) {
+        return value.AsFloat64x2;
+    }
 
     static inline float64x2_t& As(__m128d& value) {
         return value;
@@ -720,30 +720,30 @@ struct TQType<float64x2_t> {
     static inline const float64x2_t& As(const __m128d& value) {
         return value;
     }
-}; 
- 
-using _mm_set1_ps = TWrapperSimpleF<float32x4_t, 
-                                    decltype(vdupq_n_f32), vdupq_n_f32, const float>; 
-using _mm_set_ps1 = TWrapperSimpleF<float32x4_t, 
-                                    decltype(vdupq_n_f32), vdupq_n_f32, const float>; 
- 
-struct _mm_setzero_ps : TBaseWrapper<__m128> { 
+};
+
+using _mm_set1_ps = TWrapperSimpleF<float32x4_t,
+                                    decltype(vdupq_n_f32), vdupq_n_f32, const float>;
+using _mm_set_ps1 = TWrapperSimpleF<float32x4_t,
+                                    decltype(vdupq_n_f32), vdupq_n_f32, const float>;
+
+struct _mm_setzero_ps : TBaseWrapper<__m128> {
     Y_FORCE_INLINE
-    _mm_setzero_ps() { 
-        TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.); 
-    } 
-}; 
- 
+    _mm_setzero_ps() {
+        TQType<float32x4_t>::As(Value) = vdupq_n_f32(0.);
+    }
+};
+
 Y_FORCE_INLINE __m128d _mm_setzero_pd() {
     return vdupq_n_f64(0.);
 }
 
 Y_FORCE_INLINE __m128 _mm_loadu_ps(const float* ptr) {
-    __m128 result; 
-    result.AsFloat32x4 = vld1q_f32(ptr); 
-    return result; 
-} 
- 
+    __m128 result;
+    result.AsFloat32x4 = vld1q_f32(ptr);
+    return result;
+}
+
 Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
     __m128 result;
     result.AsFloat32x4 = vld1q_f32(ptr);
@@ -751,23 +751,23 @@ Y_FORCE_INLINE __m128 _mm_load_ps(const float* ptr) {
 }
 
 Y_FORCE_INLINE void _mm_storeu_ps(float* ptr, const __m128& op) {
-    vst1q_f32(ptr, op.AsFloat32x4); 
-} 
- 
+    vst1q_f32(ptr, op.AsFloat32x4);
+}
+
 Y_FORCE_INLINE void _mm_store_ps(float* ptr, const __m128& op) {
     vst1q_f32(ptr, op.AsFloat32x4);
 }
 
-struct _mm_set_ps : TBaseWrapper<__m128> { 
+struct _mm_set_ps : TBaseWrapper<__m128> {
     Y_FORCE_INLINE
-    _mm_set_ps(const float x3, const float x2, 
-               const float x1, const float x0) { 
-        float32x2_t d0 = {x0, x1}; 
-        float32x2_t d1 = {x2, x3}; 
-        TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1); 
-    } 
-}; 
- 
+    _mm_set_ps(const float x3, const float x2,
+               const float x1, const float x0) {
+        float32x2_t d0 = {x0, x1};
+        float32x2_t d1 = {x2, x3};
+        TQType<float32x4_t>::As(Value) = vcombine_f32(d0, d1);
+    }
+};
+
 Y_FORCE_INLINE __m128d _mm_set_pd(double d1, double d0) {
     const float64x1_t p0 = {d0};
     const float64x1_t p1 = {d1};
@@ -788,81 +788,81 @@ Y_FORCE_INLINE void _mm_store_pd(double* res, __m128d a) {
     vst1q_f64(res, a);
 }
 
-using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>; 
-using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>; 
-using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>; 
-using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>; 
-using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>; 
-using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>; 
-using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>; 
-using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>; 
- 
+using _mm_add_ps = TWrapperDualF<float32x4_t, decltype(vaddq_f32), vaddq_f32>;
+using _mm_sub_ps = TWrapperDualF<float32x4_t, decltype(vsubq_f32), vsubq_f32>;
+using _mm_mul_ps = TWrapperDualF<float32x4_t, decltype(vmulq_f32), vmulq_f32>;
+using _mm_div_ps = TWrapperDualF<float32x4_t, decltype(vdivq_f32), vdivq_f32>;
+using _mm_cmpeq_ps = TWrapperDualF<float32x4_t, decltype(vceqq_f32), vceqq_f32>;
+using _mm_cmpgt_ps = TWrapperDualF<float32x4_t, decltype(vcgtq_f32), vcgtq_f32>;
+using _mm_max_ps = TWrapperDualF<float32x4_t, decltype(vmaxq_f32), vmaxq_f32>;
+using _mm_min_ps = TWrapperDualF<float32x4_t, decltype(vminq_f32), vminq_f32>;
+
 using _mm_add_pd = TWrapperDualF<float64x2_t, decltype(vaddq_f64), vaddq_f64, __m128d>;
 using _mm_sub_pd = TWrapperDualF<float64x2_t, decltype(vsubq_f64), vsubq_f64, __m128d>;
 using _mm_mul_pd = TWrapperDualF<float64x2_t, decltype(vmulq_f64), vmulq_f64, __m128d>;
 using _mm_div_pd = TWrapperDualF<float64x2_t, decltype(vdivq_f64), vdivq_f64, __m128d>;
 
-struct _mm_and_ps : TBaseWrapper<__m128> { 
+struct _mm_and_ps : TBaseWrapper<__m128> {
     Y_FORCE_INLINE
-    _mm_and_ps(const __m128& op1, const __m128& op2) { 
-        TQType<uint64x2_t>::As(Value) = 
-            vandq_u64(TQType<uint64x2_t>::As(op1), 
-                      TQType<uint64x2_t>::As(op2)); 
-    } 
-}; 
- 
+    _mm_and_ps(const __m128& op1, const __m128& op2) {
+        TQType<uint64x2_t>::As(Value) =
+            vandq_u64(TQType<uint64x2_t>::As(op1),
+                      TQType<uint64x2_t>::As(op2));
+    }
+};
+
 Y_FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
     return vandq_u64(a, b);
 }
 
 Y_FORCE_INLINE void _MM_TRANSPOSE4_PS(__m128& op0, __m128& op1, __m128& op2, __m128& op3) {
-    float64x2_t im0 = 
-        (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4); 
-    float64x2_t im1 = 
-        (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4); 
-    float64x2_t im2 = 
-        (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4); 
-    float64x2_t im3 = 
-        (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4); 
- 
-    TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2); 
-    TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3); 
-    TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2); 
-    TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3); 
-}; 
- 
+    float64x2_t im0 =
+        (float64x2_t)vtrn1q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
+    float64x2_t im1 =
+        (float64x2_t)vtrn2q_f32(op0.AsFloat32x4, op1.AsFloat32x4);
+    float64x2_t im2 =
+        (float64x2_t)vtrn1q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
+    float64x2_t im3 =
+        (float64x2_t)vtrn2q_f32(op2.AsFloat32x4, op3.AsFloat32x4);
+
+    TQType<float64x2_t>::As(op0) = vtrn1q_f64(im0, im2);
+    TQType<float64x2_t>::As(op1) = vtrn1q_f64(im1, im3);
+    TQType<float64x2_t>::As(op2) = vtrn2q_f64(im0, im2);
+    TQType<float64x2_t>::As(op3) = vtrn2q_f64(im1, im3);
+};
+
 Y_FORCE_INLINE __m128 _mm_castsi128_ps(__m128i op) {
-    return reinterpret_cast<__m128&>(op); 
-} 
- 
+    return reinterpret_cast<__m128&>(op);
+}
+
 Y_FORCE_INLINE __m128i _mm_castps_si128(__m128 op) {
-    return reinterpret_cast<__m128i&>(op); 
-} 
- 
-template <typename TOpOut, typename TOpIn, 
-          typename TFunc, TFunc* func, typename... TParams> 
-struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> { 
+    return reinterpret_cast<__m128i&>(op);
+}
+
+template <typename TOpOut, typename TOpIn,
+          typename TFunc, TFunc* func, typename... TParams>
+struct TCvtS2FWrapperSingle : TBaseWrapper<__m128> {
     Y_FORCE_INLINE
-    TCvtS2FWrapperSingle(const __m128i& op, TParams... params) { 
-        TQType<TOpOut>::As(Value) = 
-            func(TQType<TOpIn>::As(op), params...); 
-    } 
-}; 
- 
-using _mm_cvtepi32_ps = 
-    TCvtS2FWrapperSingle<float32x4_t, int32x4_t, 
-                         decltype(vcvtq_f32_s32), vcvtq_f32_s32>; 
- 
-template <typename TOpOut, typename TOpIn, 
-          typename TFunc, TFunc* func, typename... TParams> 
-struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> { 
+    TCvtS2FWrapperSingle(const __m128i& op, TParams... params) {
+        TQType<TOpOut>::As(Value) =
+            func(TQType<TOpIn>::As(op), params...);
+    }
+};
+
+using _mm_cvtepi32_ps =
+    TCvtS2FWrapperSingle<float32x4_t, int32x4_t,
+                         decltype(vcvtq_f32_s32), vcvtq_f32_s32>;
+
+template <typename TOpOut, typename TOpIn,
+          typename TFunc, TFunc* func, typename... TParams>
+struct TCvtF2SWrapperSingle : TBaseWrapper<__m128i> {
     Y_FORCE_INLINE
-    TCvtF2SWrapperSingle(const __m128& op, TParams... params) { 
-        TQType<TOpOut>::As(Value) = 
-            func(TQType<TOpIn>::As(op), params...); 
-    } 
-}; 
- 
+    TCvtF2SWrapperSingle(const __m128& op, TParams... params) {
+        TQType<TOpOut>::As(Value) =
+            func(TQType<TOpIn>::As(op), params...);
+    }
+};
+
 inline __m128i _mm_cvtps_epi32(__m128 a) {
     /// vcvtq_s32_f32 rounds to zero, but we need to round to the nearest.
     static const float32x4_t half = vdupq_n_f32(0.5f);
@@ -874,26 +874,26 @@ inline __m128i _mm_cvtps_epi32(__m128 a) {
     return res;
 }
 
-using _mm_cvttps_epi32 = 
-    TCvtF2SWrapperSingle<int32x4_t, float32x4_t, 
-                         decltype(vcvtq_s32_f32), vcvtq_s32_f32>; 
- 
+using _mm_cvttps_epi32 =
+    TCvtF2SWrapperSingle<int32x4_t, float32x4_t,
+                         decltype(vcvtq_s32_f32), vcvtq_s32_f32>;
+
 Y_FORCE_INLINE int
-_mm_movemask_ps(const __m128& op) { 
-    uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 
-    uint32x4_t bits = vandq_u32(op.AsUi32x4, mask); 
-    int32x4_t shifts = {-31, -30, -29, -28}; 
-    bits = vshlq_u32(bits, shifts); 
-    return vaddvq_u32(bits); 
-} 
+_mm_movemask_ps(const __m128& op) {
+    uint32x4_t mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    uint32x4_t bits = vandq_u32(op.AsUi32x4, mask);
+    int32x4_t shifts = {-31, -30, -29, -28};
+    bits = vshlq_u32(bits, shifts);
+    return vaddvq_u32(bits);
+}
 
 Y_FORCE_INLINE i64 _mm_cvtsi128_si64(__m128i a) {
     return vgetq_lane_s64(a.AsSi64x2, 0);
 }
- 
-static inline void _mm_pause() { 
+
+static inline void _mm_pause() {
     __asm__ ("YIELD");
-} 
+}
 
 static inline __m128 _mm_rsqrt_ps(__m128 a) {
     __m128 res;
diff --git a/library/cpp/sse/ut/test.cpp b/library/cpp/sse/ut/test.cpp
index 42a82a8cfa..33c999d284 100644
--- a/library/cpp/sse/ut/test.cpp
+++ b/library/cpp/sse/ut/test.cpp
@@ -1,10 +1,10 @@
-/* 
-  Unittests for all SSE instrinsics translated to NEON instrinsics or 
-  software implementation. 
-  Should be tested both on Intel and ARM64. 
- */ 
-/* Author: Vitaliy Manushkin <agri@yandex-team.ru */ 
- 
+/*
+  Unittests for all SSE instrinsics translated to NEON instrinsics or
+  software implementation.
+  Should be tested both on Intel and ARM64.
+ */
+/* Author: Vitaliy Manushkin <agri@yandex-team.ru */
+
 #include <library/cpp/testing/unittest/registar.h>
 
 #include <util/generic/typetraits.h>
@@ -13,35 +13,35 @@
 #include <util/stream/output.h>
 
 #include <algorithm>
-#include <array> 
-#include <limits> 
+#include <array>
+#include <limits>
 #include <memory>
 #include <type_traits>
 #include <utility>
- 
-template <typename TResult, typename TFunc, TFunc* func> 
-struct T_mm_CallWrapper { 
-    TResult Value; 
- 
-    template <typename... TParams> 
-    T_mm_CallWrapper(TParams&&... params) { 
-        Value = func(std::forward<TParams>(params)...); 
-    } 
- 
-    operator TResult&() { 
-        return Value; 
-    } 
- 
-    operator const TResult&() const { 
-        return Value; 
-    } 
-}; 
- 
-#if defined(_arm64_) 
+
+template <typename TResult, typename TFunc, TFunc* func>
+struct T_mm_CallWrapper {
+    TResult Value;
+
+    template <typename... TParams>
+    T_mm_CallWrapper(TParams&&... params) {
+        Value = func(std::forward<TParams>(params)...);
+    }
+
+    operator TResult&() {
+        return Value;
+    }
+
+    operator const TResult&() const {
+        return Value;
+    }
+};
+
+#if defined(_arm64_)
 #include "library/cpp/sse/sse2neon.h"
 #elif defined(_i386_) || defined(_x86_64_)
-#include <xmmintrin.h> 
-#include <emmintrin.h> 
+#include <xmmintrin.h>
+#include <emmintrin.h>
 #include <smmintrin.h>
 #elif defined(_ppc64_)
 #include "library/cpp/sse/powerpc.h"
@@ -54,10 +54,10 @@ struct T_mm_CallWrapper {
 #define WrapF(T_mm_func) T_mm_func
 #define WrapD(T_mm_func) T_mm_func
 #elif defined(_ppc64_) || defined(_i386_) || defined(_x86_64_)
-#define Wrap(_mm_func) \ 
-    T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func> 
-#define WrapF(_mm_func) \ 
-    T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func> 
+#define Wrap(_mm_func) \
+    T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func>
+#define WrapF(_mm_func) \
+    T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func>
 #define WrapD(_mm_func) \
     T_mm_CallWrapper<__m128d, decltype(_mm_func), _mm_func>
 using int8x16_t = std::array<i8, 16>;
@@ -70,69 +70,69 @@ using uint32x4_t = std::array<ui32, 4>;
 using uint64x2_t = std::array<ui64, 2>;
 using float32x4_t = std::array<float, 4>;
 using float64x2_t = std::array<double, 2>;
- 
+
 template <typename TVectorType>
-struct TQType { 
+struct TQType {
     static TVectorType As(__m128i param) {
         TVectorType value;
-        _mm_storeu_si128((__m128i*)&value, param); 
-        return value; 
-    } 
+        _mm_storeu_si128((__m128i*)&value, param);
+        return value;
+    }
     static TVectorType As(__m128 param) {
         TVectorType value;
-        _mm_storeu_ps((float*)&value, param); 
-        return value; 
-    } 
+        _mm_storeu_ps((float*)&value, param);
+        return value;
+    }
     static TVectorType As(__m128d param) {
         TVectorType value;
         _mm_storeu_pd((double*)&value, param);
         return value;
     }
-}; 
-#endif 
- 
+};
+#endif
+
 template <typename TVectorType>
-struct TFuncLoad; 
+struct TFuncLoad;
 template <typename TVectorType>
-struct TFuncStore; 
- 
-template <> 
-struct TFuncLoad<__m128i> { 
-    __m128i Value; 
- 
-    template <typename TPointer> 
-    TFuncLoad(TPointer* ptr) { 
-        Value = _mm_loadu_si128((__m128i*)ptr); 
-    } 
- 
-    operator __m128i&() { 
-        return Value; 
-    } 
- 
-    operator const __m128i&() const { 
-        return Value; 
-    } 
-}; 
- 
-template <> 
-struct TFuncLoad<__m128> { 
-    __m128 Value; 
- 
-    template <typename TPointer> 
-    TFuncLoad(TPointer* ptr) { 
-        Value = _mm_loadu_ps((float*)ptr); 
-    } 
- 
-    operator __m128&() { 
-        return Value; 
-    } 
- 
-    operator const __m128&() const { 
-        return Value; 
-    } 
-}; 
- 
-template <> 
+struct TFuncStore;
+
+template <>
+struct TFuncLoad<__m128i> {
+    __m128i Value;
+
+    template <typename TPointer>
+    TFuncLoad(TPointer* ptr) {
+        Value = _mm_loadu_si128((__m128i*)ptr);
+    }
+
+    operator __m128i&() {
+        return Value;
+    }
+
+    operator const __m128i&() const {
+        return Value;
+    }
+};
+
+template <>
+struct TFuncLoad<__m128> {
+    __m128 Value;
+
+    template <typename TPointer>
+    TFuncLoad(TPointer* ptr) {
+        Value = _mm_loadu_ps((float*)ptr);
+    }
+
+    operator __m128&() {
+        return Value;
+    }
+
+    operator const __m128&() const {
+        return Value;
+    }
+};
+
+template <>
 struct TFuncLoad<__m128d> {
     __m128d Value;
 
@@ -151,153 +151,153 @@ struct TFuncLoad<__m128d> {
 };
 
 template <>
-struct TFuncStore<__m128i> { 
-    template <typename TPointer> 
-    TFuncStore(TPointer* ptr, __m128i Value) { 
-        _mm_storeu_si128((__m128i*)ptr, Value); 
-    } 
-}; 
- 
-template <> 
-struct TFuncStore<__m128> { 
-    template <typename TPointer> 
-    TFuncStore(TPointer* ptr, __m128 Value) { 
-        _mm_storeu_ps((float*)ptr, Value); 
-    } 
-}; 
- 
-class TSSEEmulTest: public TTestBase { 
-private: 
-    UNIT_TEST_SUITE(TSSEEmulTest); 
-    UNIT_TEST(Test_mm_load_si128); 
-    UNIT_TEST(Test_mm_loadu_si128); 
+struct TFuncStore<__m128i> {
+    template <typename TPointer>
+    TFuncStore(TPointer* ptr, __m128i Value) {
+        _mm_storeu_si128((__m128i*)ptr, Value);
+    }
+};
+
+template <>
+struct TFuncStore<__m128> {
+    template <typename TPointer>
+    TFuncStore(TPointer* ptr, __m128 Value) {
+        _mm_storeu_ps((float*)ptr, Value);
+    }
+};
+
+class TSSEEmulTest: public TTestBase {
+private:
+    UNIT_TEST_SUITE(TSSEEmulTest);
+    UNIT_TEST(Test_mm_load_si128);
+    UNIT_TEST(Test_mm_loadu_si128);
     UNIT_TEST(Test_mm_storeu_si128);
     UNIT_TEST(Test_mm_loadu_si128_2);
     UNIT_TEST(Test_mm_loadu_ps);
     UNIT_TEST(Test_mm_storeu_ps);
- 
+
     UNIT_TEST(Test_mm_slli_epi16);
     UNIT_TEST(Test_mm_slli_epi32);
     UNIT_TEST(Test_mm_slli_epi64);
     UNIT_TEST(Test_mm_slli_si128);
 
-    UNIT_TEST(Test_mm_srli_epi16); 
-    UNIT_TEST(Test_mm_srli_epi32); 
-    UNIT_TEST(Test_mm_srli_epi64); 
+    UNIT_TEST(Test_mm_srli_epi16);
+    UNIT_TEST(Test_mm_srli_epi32);
+    UNIT_TEST(Test_mm_srli_epi64);
     UNIT_TEST(Test_mm_srli_si128);
- 
+
     UNIT_TEST(Test_mm_srai_epi16);
     UNIT_TEST(Test_mm_srai_epi32);
 
     UNIT_TEST(Test_mm_sll_epi16);
     UNIT_TEST(Test_mm_sll_epi32);
     UNIT_TEST(Test_mm_sll_epi64);
- 
+
     UNIT_TEST(Test_mm_srl_epi16);
     UNIT_TEST(Test_mm_srl_epi32);
     UNIT_TEST(Test_mm_srl_epi64);
 
-    UNIT_TEST(Test_mm_add_epi16); 
-    UNIT_TEST(Test_mm_add_epi32); 
-    UNIT_TEST(Test_mm_add_epi64); 
-    UNIT_TEST(Test_mm_add_ps); 
+    UNIT_TEST(Test_mm_add_epi16);
+    UNIT_TEST(Test_mm_add_epi32);
+    UNIT_TEST(Test_mm_add_epi64);
+    UNIT_TEST(Test_mm_add_ps);
     UNIT_TEST(Test_mm_add_pd);
- 
+
     UNIT_TEST(Test_mm_madd_epi16);
 
-    UNIT_TEST(Test_mm_sub_epi16); 
-    UNIT_TEST(Test_mm_sub_epi32); 
-    UNIT_TEST(Test_mm_sub_epi64); 
-    UNIT_TEST(Test_mm_sub_ps); 
+    UNIT_TEST(Test_mm_sub_epi16);
+    UNIT_TEST(Test_mm_sub_epi32);
+    UNIT_TEST(Test_mm_sub_epi64);
+    UNIT_TEST(Test_mm_sub_ps);
     UNIT_TEST(Test_mm_sub_pd);
- 
-    UNIT_TEST(Test_mm_mul_ps); 
+
+    UNIT_TEST(Test_mm_mul_ps);
     UNIT_TEST(Test_mm_mul_pd);
-    UNIT_TEST(Test_mm_div_ps); 
+    UNIT_TEST(Test_mm_div_ps);
     UNIT_TEST(Test_mm_div_pd);
-    UNIT_TEST(Test_mm_max_ps); 
-    UNIT_TEST(Test_mm_min_ps); 
-    UNIT_TEST(Test_mm_and_ps); 
- 
-    UNIT_TEST(Test_mm_unpacklo_epi8); 
-    UNIT_TEST(Test_mm_unpackhi_epi8); 
-    UNIT_TEST(Test_mm_unpacklo_epi16); 
-    UNIT_TEST(Test_mm_unpackhi_epi16); 
-    UNIT_TEST(Test_mm_unpacklo_epi32); 
-    UNIT_TEST(Test_mm_unpackhi_epi32); 
-    UNIT_TEST(Test_mm_unpacklo_epi64); 
-    UNIT_TEST(Test_mm_unpackhi_epi64); 
- 
-    UNIT_TEST(Test_mm_or_si128); 
-    UNIT_TEST(Test_mm_and_si128); 
-    UNIT_TEST(Test_mm_andnot_si128); 
- 
-    UNIT_TEST(Test_mm_cmpeq_epi8); 
-    UNIT_TEST(Test_mm_cmpeq_epi16); 
-    UNIT_TEST(Test_mm_cmpeq_epi32); 
-    UNIT_TEST(Test_mm_cmpeq_ps); 
- 
-    UNIT_TEST(Test_mm_cmpgt_epi8); 
-    UNIT_TEST(Test_mm_cmpgt_epi16); 
-    UNIT_TEST(Test_mm_cmpgt_epi32); 
-    UNIT_TEST(Test_mm_cmpgt_ps); 
- 
-    UNIT_TEST(Test_mm_cmplt_epi8); 
-    UNIT_TEST(Test_mm_cmplt_epi16); 
-    UNIT_TEST(Test_mm_cmplt_epi32); 
- 
-    UNIT_TEST(Test_mm_set1_epi8); 
-    UNIT_TEST(Test_mm_set1_epi16); 
-    UNIT_TEST(Test_mm_set1_epi32); 
-    UNIT_TEST(Test_mm_set1_ps); 
+    UNIT_TEST(Test_mm_max_ps);
+    UNIT_TEST(Test_mm_min_ps);
+    UNIT_TEST(Test_mm_and_ps);
+
+    UNIT_TEST(Test_mm_unpacklo_epi8);
+    UNIT_TEST(Test_mm_unpackhi_epi8);
+    UNIT_TEST(Test_mm_unpacklo_epi16);
+    UNIT_TEST(Test_mm_unpackhi_epi16);
+    UNIT_TEST(Test_mm_unpacklo_epi32);
+    UNIT_TEST(Test_mm_unpackhi_epi32);
+    UNIT_TEST(Test_mm_unpacklo_epi64);
+    UNIT_TEST(Test_mm_unpackhi_epi64);
+
+    UNIT_TEST(Test_mm_or_si128);
+    UNIT_TEST(Test_mm_and_si128);
+    UNIT_TEST(Test_mm_andnot_si128);
+
+    UNIT_TEST(Test_mm_cmpeq_epi8);
+    UNIT_TEST(Test_mm_cmpeq_epi16);
+    UNIT_TEST(Test_mm_cmpeq_epi32);
+    UNIT_TEST(Test_mm_cmpeq_ps);
+
+    UNIT_TEST(Test_mm_cmpgt_epi8);
+    UNIT_TEST(Test_mm_cmpgt_epi16);
+    UNIT_TEST(Test_mm_cmpgt_epi32);
+    UNIT_TEST(Test_mm_cmpgt_ps);
+
+    UNIT_TEST(Test_mm_cmplt_epi8);
+    UNIT_TEST(Test_mm_cmplt_epi16);
+    UNIT_TEST(Test_mm_cmplt_epi32);
+
+    UNIT_TEST(Test_mm_set1_epi8);
+    UNIT_TEST(Test_mm_set1_epi16);
+    UNIT_TEST(Test_mm_set1_epi32);
+    UNIT_TEST(Test_mm_set1_ps);
     UNIT_TEST(Test_mm_set_ps1);
- 
-    UNIT_TEST(Test_mm_setzero_si128); 
-    UNIT_TEST(Test_mm_setzero_ps); 
+
+    UNIT_TEST(Test_mm_setzero_si128);
+    UNIT_TEST(Test_mm_setzero_ps);
     UNIT_TEST(Test_mm_setzero_pd);
- 
-    UNIT_TEST(Test_mm_storel_epi64); 
-    UNIT_TEST(Test_mm_loadl_epi64); 
- 
+
+    UNIT_TEST(Test_mm_storel_epi64);
+    UNIT_TEST(Test_mm_loadl_epi64);
+
     UNIT_TEST(Test_mm_loadl_pd);
     UNIT_TEST(Test_mm_loadh_pd);
     UNIT_TEST(Test_mm_cvtsd_f64);
 
-    UNIT_TEST(Test_mm_shuffle_epi32); 
-    UNIT_TEST(Test_mm_movemask_epi8); 
-    UNIT_TEST(Test_mm_cvtsi128_si32); 
+    UNIT_TEST(Test_mm_shuffle_epi32);
+    UNIT_TEST(Test_mm_movemask_epi8);
+    UNIT_TEST(Test_mm_cvtsi128_si32);
     UNIT_TEST(Test_mm_cvtsi128_si64);
- 
-    UNIT_TEST(Test_mm_set_epi16); 
-    UNIT_TEST(Test_mm_set_epi32); 
-    UNIT_TEST(Test_mm_set_ps); 
+
+    UNIT_TEST(Test_mm_set_epi16);
+    UNIT_TEST(Test_mm_set_epi32);
+    UNIT_TEST(Test_mm_set_ps);
     UNIT_TEST(Test_mm_set_pd);
- 
-    UNIT_TEST(Test_mm_cvtsi32_si128); 
+
+    UNIT_TEST(Test_mm_cvtsi32_si128);
     UNIT_TEST(Test_mm_cvtsi64_si128);
- 
-    UNIT_TEST(Test_mm_packs_epi16); 
-    UNIT_TEST(Test_mm_packs_epi32); 
-    UNIT_TEST(Test_mm_packus_epi16); 
- 
-    UNIT_TEST(Test_mm_extract_epi16); 
+
+    UNIT_TEST(Test_mm_packs_epi16);
+    UNIT_TEST(Test_mm_packs_epi32);
+    UNIT_TEST(Test_mm_packus_epi16);
+
+    UNIT_TEST(Test_mm_extract_epi16);
     UNIT_TEST(Test_mm_extract_epi8);
     UNIT_TEST(Test_mm_extract_epi32);
     UNIT_TEST(Test_mm_extract_epi64);
- 
-    UNIT_TEST(Test_MM_TRANSPOSE4_PS); 
-    UNIT_TEST(Test_mm_movemask_ps); 
+
+    UNIT_TEST(Test_MM_TRANSPOSE4_PS);
+    UNIT_TEST(Test_mm_movemask_ps);
     UNIT_TEST(Test_mm_movemask_ps_2);
- 
-    UNIT_TEST(Test_mm_cvtepi32_ps); 
-    UNIT_TEST(Test_mm_cvtps_epi32); 
-    UNIT_TEST(Test_mm_cvttps_epi32); 
- 
-    UNIT_TEST(Test_mm_castsi128_ps); 
-    UNIT_TEST(Test_mm_castps_si128); 
- 
-    UNIT_TEST(Test_mm_mul_epu32); 
- 
+
+    UNIT_TEST(Test_mm_cvtepi32_ps);
+    UNIT_TEST(Test_mm_cvtps_epi32);
+    UNIT_TEST(Test_mm_cvttps_epi32);
+
+    UNIT_TEST(Test_mm_castsi128_ps);
+    UNIT_TEST(Test_mm_castps_si128);
+
+    UNIT_TEST(Test_mm_mul_epu32);
+
     UNIT_TEST(Test_mm_cmpunord_ps);
     UNIT_TEST(Test_mm_andnot_ps);
     UNIT_TEST(Test_mm_shuffle_ps);
@@ -310,36 +310,36 @@ private:
     UNIT_TEST(Test_mm_rsqrt_ps);
     UNIT_TEST(Test_matrixnet_powerpc);
 
-    UNIT_TEST_SUITE_END(); 
- 
-public: 
-    void Test_mm_load_si128(); 
-    void Test_mm_loadu_si128(); 
+    UNIT_TEST_SUITE_END();
+
+public:
+    void Test_mm_load_si128();
+    void Test_mm_loadu_si128();
     void Test_mm_storeu_si128();
     void Test_mm_loadu_si128_2();
     void Test_mm_loadu_ps();
     void Test_mm_storeu_ps();
- 
-    template <typename TElem, int bits, int elemCount, 
+
+    template <typename TElem, int bits, int elemCount,
               typename TFunc, typename TShifter, typename TOp, typename TElemFunc>
-    void Test_mm_shifter_epiXX(); 
- 
+    void Test_mm_shifter_epiXX();
+
     enum class EDirection {
         Left,
         Right
     };
- 
+
     struct TShiftRes {
         __m128i Value[17];
     };
 
     void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo);
 
-    void Test_mm_slli_epi16(); 
-    void Test_mm_slli_epi32(); 
-    void Test_mm_slli_epi64(); 
+    void Test_mm_slli_epi16();
+    void Test_mm_slli_epi32();
+    void Test_mm_slli_epi64();
     void Test_mm_slli_si128();
- 
+
     void Test_mm_srli_epi16();
     void Test_mm_srli_epi32();
     void Test_mm_srli_epi64();
@@ -356,134 +356,134 @@ public:
     void Test_mm_srl_epi32();
     void Test_mm_srl_epi64();
 
-    void Test_mm_add_epi8(); 
-    void Test_mm_add_epi16(); 
-    void Test_mm_add_epi32(); 
-    void Test_mm_add_epi64(); 
-    void Test_mm_add_ps(); 
+    void Test_mm_add_epi8();
+    void Test_mm_add_epi16();
+    void Test_mm_add_epi32();
+    void Test_mm_add_epi64();
+    void Test_mm_add_ps();
     void Test_mm_add_pd();
- 
+
     void Test_mm_madd_epi16();
 
-    void Test_mm_sub_epi8(); 
-    void Test_mm_sub_epi16(); 
-    void Test_mm_sub_epi32(); 
-    void Test_mm_sub_epi64(); 
-    void Test_mm_sub_ps(); 
+    void Test_mm_sub_epi8();
+    void Test_mm_sub_epi16();
+    void Test_mm_sub_epi32();
+    void Test_mm_sub_epi64();
+    void Test_mm_sub_ps();
     void Test_mm_sub_pd();
- 
-    void Test_mm_mul_ps(); 
+
+    void Test_mm_mul_ps();
     void Test_mm_mul_pd();
-    void Test_mm_div_ps(); 
+    void Test_mm_div_ps();
     void Test_mm_div_pd();
-    void Test_mm_max_ps(); 
-    void Test_mm_min_ps(); 
-    void Test_mm_and_ps(); 
- 
-    template <typename TElem, int bits, int elemCount, int shift, 
-              typename TFunc, typename TOp> 
-    void Test_mm_unpack_epiXX(); 
-    void Test_mm_unpacklo_epi8(); 
-    void Test_mm_unpackhi_epi8(); 
-    void Test_mm_unpacklo_epi16(); 
-    void Test_mm_unpackhi_epi16(); 
-    void Test_mm_unpacklo_epi32(); 
-    void Test_mm_unpackhi_epi32(); 
-    void Test_mm_unpacklo_epi64(); 
-    void Test_mm_unpackhi_epi64(); 
- 
-    template <typename TElem, unsigned elemCount, 
-              typename TFunc, typename TElemFunc, 
+    void Test_mm_max_ps();
+    void Test_mm_min_ps();
+    void Test_mm_and_ps();
+
+    template <typename TElem, int bits, int elemCount, int shift,
+              typename TFunc, typename TOp>
+    void Test_mm_unpack_epiXX();
+    void Test_mm_unpacklo_epi8();
+    void Test_mm_unpackhi_epi8();
+    void Test_mm_unpacklo_epi16();
+    void Test_mm_unpackhi_epi16();
+    void Test_mm_unpacklo_epi32();
+    void Test_mm_unpackhi_epi32();
+    void Test_mm_unpacklo_epi64();
+    void Test_mm_unpackhi_epi64();
+
+    template <typename TElem, unsigned elemCount,
+              typename TFunc, typename TElemFunc,
               typename TOp, typename TVectorType = __m128i>
-    void Test_mm_dualop(); 
- 
-    template <typename TElem, unsigned elemCount, 
-              typename TFunc, typename TElemFunc, 
+    void Test_mm_dualop();
+
+    template <typename TElem, unsigned elemCount,
+              typename TFunc, typename TElemFunc,
               typename TOp, typename TVectorType = __m128i>
-    void Test_mm_dualcmp(); 
- 
-    void Test_mm_or_si128(); 
-    void Test_mm_and_si128(); 
-    void Test_mm_andnot_si128(); 
- 
-    void Test_mm_cmpeq_epi8(); 
-    void Test_mm_cmpeq_epi16(); 
-    void Test_mm_cmpeq_epi32(); 
-    void Test_mm_cmpeq_ps(); 
- 
-    void Test_mm_cmpgt_epi8(); 
-    void Test_mm_cmpgt_epi16(); 
-    void Test_mm_cmpgt_epi32(); 
-    void Test_mm_cmpgt_ps(); 
- 
-    void Test_mm_cmplt_epi8(); 
-    void Test_mm_cmplt_epi16(); 
-    void Test_mm_cmplt_epi32(); 
- 
-    template <typename TElem, int elemCount, 
+    void Test_mm_dualcmp();
+
+    void Test_mm_or_si128();
+    void Test_mm_and_si128();
+    void Test_mm_andnot_si128();
+
+    void Test_mm_cmpeq_epi8();
+    void Test_mm_cmpeq_epi16();
+    void Test_mm_cmpeq_epi32();
+    void Test_mm_cmpeq_ps();
+
+    void Test_mm_cmpgt_epi8();
+    void Test_mm_cmpgt_epi16();
+    void Test_mm_cmpgt_epi32();
+    void Test_mm_cmpgt_ps();
+
+    void Test_mm_cmplt_epi8();
+    void Test_mm_cmplt_epi16();
+    void Test_mm_cmplt_epi32();
+
+    template <typename TElem, int elemCount,
               typename TFunc, typename TOp, typename TVectorType>
-    void Test_mm_setter_epiXX(); 
-    void Test_mm_set1_epi8(); 
-    void Test_mm_set1_epi16(); 
-    void Test_mm_set1_epi32(); 
-    void Test_mm_set1_ps(); 
+    void Test_mm_setter_epiXX();
+    void Test_mm_set1_epi8();
+    void Test_mm_set1_epi16();
+    void Test_mm_set1_epi32();
+    void Test_mm_set1_ps();
     void Test_mm_set_ps1();
- 
-    void Test_mm_setzero_si128(); 
-    void Test_mm_setzero_ps(); 
+
+    void Test_mm_setzero_si128();
+    void Test_mm_setzero_ps();
     void Test_mm_setzero_pd();
- 
-    void Test_mm_loadl_epi64(); 
-    void Test_mm_storel_epi64(); 
- 
+
+    void Test_mm_loadl_epi64();
+    void Test_mm_storel_epi64();
+
     void Test_mm_loadl_pd();
     void Test_mm_loadh_pd();
     void Test_mm_cvtsd_f64();
 
-    void Test_mm_shuffle_epi32(); 
-    void Test_mm_movemask_epi8(); 
-    void Test_mm_cvtsi128_si32(); 
+    void Test_mm_shuffle_epi32();
+    void Test_mm_movemask_epi8();
+    void Test_mm_cvtsi128_si32();
     void Test_mm_cvtsi128_si64();
- 
-    void Test_mm_set_epi16(); 
-    void Test_mm_set_epi32(); 
-    void Test_mm_set_ps(); 
+
+    void Test_mm_set_epi16();
+    void Test_mm_set_epi32();
+    void Test_mm_set_ps();
     void Test_mm_set_pd();
- 
-    void Test_mm_cvtsi32_si128(); 
+
+    void Test_mm_cvtsi32_si128();
     void Test_mm_cvtsi64_si128();
- 
-    template <typename TElem, typename TNarrow, unsigned elemCount, 
-              typename TFunc> 
-    void Test_mm_packs_epiXX(); 
-    void Test_mm_packs_epi16(); 
-    void Test_mm_packs_epi32(); 
-    void Test_mm_packus_epi16(); 
- 
-    void Test_mm_extract_epi16(); 
+
+    template <typename TElem, typename TNarrow, unsigned elemCount,
+              typename TFunc>
+    void Test_mm_packs_epiXX();
+    void Test_mm_packs_epi16();
+    void Test_mm_packs_epi32();
+    void Test_mm_packus_epi16();
+
+    void Test_mm_extract_epi16();
     void Test_mm_extract_epi8();
     void Test_mm_extract_epi32();
     void Test_mm_extract_epi64();
- 
-    void Test_MM_TRANSPOSE4_PS(); 
-    void Test_mm_movemask_ps(); 
+
+    void Test_MM_TRANSPOSE4_PS();
+    void Test_mm_movemask_ps();
     void Test_mm_movemask_ps_2();
- 
-    template <typename TFrom, typename TTo, unsigned elemCount, 
-              typename TLoadVector, typename TResultVector, 
-              typename TElemFunc, typename TFunc, typename TOp> 
-    void Test_mm_convertop(); 
-    void Test_mm_cvtepi32_ps(); 
-    void Test_mm_cvtps_epi32(); 
-    void Test_mm_cvttps_epi32(); 
- 
-    template <typename TLoadVector, typename TCastVector, 
-              typename TFunc, TFunc* func> 
-    void Test_mm_castXX(); 
-    void Test_mm_castsi128_ps(); 
-    void Test_mm_castps_si128(); 
- 
-    void Test_mm_mul_epu32(); 
+
+    template <typename TFrom, typename TTo, unsigned elemCount,
+              typename TLoadVector, typename TResultVector,
+              typename TElemFunc, typename TFunc, typename TOp>
+    void Test_mm_convertop();
+    void Test_mm_cvtepi32_ps();
+    void Test_mm_cvtps_epi32();
+    void Test_mm_cvttps_epi32();
+
+    template <typename TLoadVector, typename TCastVector,
+              typename TFunc, TFunc* func>
+    void Test_mm_castXX();
+    void Test_mm_castsi128_ps();
+    void Test_mm_castps_si128();
+
+    void Test_mm_mul_epu32();
 
     void Test_mm_cmpunord_ps();
     void Test_mm_store_ss();
@@ -497,30 +497,30 @@ public:
     void Test_mm_rsqrt_ps();
     void Test_mm_rsqrt_ss();
     void Test_matrixnet_powerpc();
-}; 
- 
-UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest); 
- 
-void TSSEEmulTest::Test_mm_load_si128() { 
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest);
+
+void TSSEEmulTest::Test_mm_load_si128() {
     alignas(16) char data[16] = {
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    __m128i value = _mm_load_si128((__m128i*)&data); 
-    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL); 
-    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL); 
-} 
- 
-void TSSEEmulTest::Test_mm_loadu_si128() { 
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    __m128i value = _mm_load_si128((__m128i*)&data);
+    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL);
+    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL);
+}
+
+void TSSEEmulTest::Test_mm_loadu_si128() {
     alignas(16) char data[17] = {
-        '\x66', 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1); 
-    __m128i value = _mm_loadu_si128((__m128i*)&data[1]); 
-    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL); 
-    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL); 
-} 
- 
+        '\x66',
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1);
+    __m128i value = _mm_loadu_si128((__m128i*)&data[1]);
+    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL);
+    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL);
+}
+
 void TSSEEmulTest::Test_mm_storeu_si128() {
     alignas(16) unsigned char stub[32] = {
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -640,32 +640,32 @@ unsigned MakeNumber<unsigned>(unsigned number) {
     return number;
 }
 
-template <typename TElem, int bits, int elemCount, 
+template <typename TElem, int bits, int elemCount,
           typename TFunc, typename TShifter, typename TOp, typename TElemFunc>
-void TSSEEmulTest::Test_mm_shifter_epiXX() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    TElem* dataw = reinterpret_cast<TElem*>(&data); 
- 
-    __m128i value = _mm_loadu_si128((__m128i*)&data); 
- 
+void TSSEEmulTest::Test_mm_shifter_epiXX() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    TElem* dataw = reinterpret_cast<TElem*>(&data);
+
+    __m128i value = _mm_loadu_si128((__m128i*)&data);
+
     for (unsigned shifter = 0; shifter <= bits; ++shifter) {
-        TElem shiftedData[elemCount]; 
+        TElem shiftedData[elemCount];
         for (unsigned i = 0; i < elemCount; ++i) {
-            shiftedData[i] = TElemFunc::Call(dataw[i], shifter); 
+            shiftedData[i] = TElemFunc::Call(dataw[i], shifter);
         }
- 
+
         const TShifter adhoc_shifter = MakeNumber<TShifter>(shifter);
 
         __m128i result = TFunc(value, adhoc_shifter);
 
         for (unsigned i = 0; i < elemCount; ++i) {
-            UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]); 
+            UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]);
         }
-    } 
-} 
- 
+    }
+}
+
 
 void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) {
     const char data[48] = {
@@ -713,52 +713,52 @@ struct THelperASHR {
     }
 };
 
-template <typename TElem> 
-struct THelperSHR { 
-    static TElem Call(const TElem op, const int shift) { 
+template <typename TElem>
+struct THelperSHR {
+    static TElem Call(const TElem op, const int shift) {
         constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
         return shift < nBitsInOp ? op >> shift : 0;
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_srli_epi16() { 
+    }
+};
+
+void TSSEEmulTest::Test_mm_srli_epi16() {
     Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_srli_epi16), unsigned, uint16x8_t,
-                          THelperSHR<ui16>>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_srli_epi32() { 
+                          THelperSHR<ui16>>();
+}
+
+void TSSEEmulTest::Test_mm_srli_epi32() {
     Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_srli_epi32), unsigned, uint32x4_t,
-                          THelperSHR<ui32>>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_srli_epi64() { 
+                          THelperSHR<ui32>>();
+}
+
+void TSSEEmulTest::Test_mm_srli_epi64() {
     Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_srli_epi64), unsigned, uint64x2_t,
-                          THelperSHR<ui64>>(); 
-} 
- 
-template <typename TElem> 
-struct THelperSHL { 
-    static TElem Call(const TElem op, const int shift) { 
+                          THelperSHR<ui64>>();
+}
+
+template <typename TElem>
+struct THelperSHL {
+    static TElem Call(const TElem op, const int shift) {
         constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
         return shift < nBitsInOp ? op << shift : 0;
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_slli_epi16() { 
+    }
+};
+
+void TSSEEmulTest::Test_mm_slli_epi16() {
     Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_slli_epi16), unsigned, uint16x8_t,
-                          THelperSHL<ui16>>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_slli_epi32() { 
+                          THelperSHL<ui16>>();
+}
+
+void TSSEEmulTest::Test_mm_slli_epi32() {
     Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_slli_epi32), unsigned, uint32x4_t,
-                          THelperSHL<ui32>>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_slli_epi64() { 
+                          THelperSHL<ui32>>();
+}
+
+void TSSEEmulTest::Test_mm_slli_epi64() {
     Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_slli_epi64), unsigned, uint64x2_t,
-                          THelperSHL<ui64>>(); 
-} 
- 
+                          THelperSHL<ui64>>();
+}
+
 void TSSEEmulTest::Test_mm_slli_si128() {
     Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes {
         TShiftRes res;
@@ -849,30 +849,30 @@ void TSSEEmulTest::Test_mm_sll_epi64() {
                           THelperSHL<ui64>>();
 }
 
-template <typename TElem> 
-struct THelperAdd { 
-    static TElem Call(const TElem op1, const TElem op2) { 
-        return op1 + op2; 
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_add_epi16() { 
-    Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_add_epi32() { 
-    Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_add_epi64() { 
-    Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_add_ps() { 
-    Test_mm_dualop<float, 2, WrapF(_mm_add_ps), 
-                   THelperAdd<float>, float32x4_t, __m128>(); 
-} 
- 
+template <typename TElem>
+struct THelperAdd {
+    static TElem Call(const TElem op1, const TElem op2) {
+        return op1 + op2;
+    }
+};
+
+void TSSEEmulTest::Test_mm_add_epi16() {
+    Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_add_epi32() {
+    Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_add_epi64() {
+    Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_add_ps() {
+    Test_mm_dualop<float, 2, WrapF(_mm_add_ps),
+                   THelperAdd<float>, float32x4_t, __m128>();
+}
+
 void TSSEEmulTest::Test_mm_add_pd() {
     Test_mm_dualop<double, 2, WrapD(_mm_add_pd),
                    THelperAdd<double>, float64x2_t, __m128d>();
@@ -904,44 +904,44 @@ void TSSEEmulTest::Test_mm_madd_epi16() {
 }
 
 
-template <typename TElem> 
-struct THelperSub { 
-    static TElem Call(const TElem op1, const TElem op2) { 
-        return op1 - op2; 
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_sub_epi16() { 
-    Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_sub_epi32() { 
-    Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_sub_epi64() { 
-    Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_sub_ps() { 
-    Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>, 
-                   float32x4_t, __m128>(); 
-} 
- 
+template <typename TElem>
+struct THelperSub {
+    static TElem Call(const TElem op1, const TElem op2) {
+        return op1 - op2;
+    }
+};
+
+void TSSEEmulTest::Test_mm_sub_epi16() {
+    Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_sub_epi32() {
+    Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_sub_epi64() {
+    Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_sub_ps() {
+    Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>,
+                   float32x4_t, __m128>();
+}
+
 void TSSEEmulTest::Test_mm_sub_pd() {
     Test_mm_dualop<double, 2, WrapD(_mm_sub_pd), THelperSub<double>,
                    float64x2_t, __m128d>();
 }
 
-void TSSEEmulTest::Test_mm_mul_ps() { 
-    struct THelper { 
-        static float Call(const float op1, const float op2) { 
-            return op1 * op2; 
-        } 
-    }; 
-    Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>(); 
-} 
- 
+void TSSEEmulTest::Test_mm_mul_ps() {
+    struct THelper {
+        static float Call(const float op1, const float op2) {
+            return op1 * op2;
+        }
+    };
+    Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>();
+}
+
 void TSSEEmulTest::Test_mm_mul_pd() {
     struct THelper {
         static double Call(const double op1, const double op2) {
@@ -951,15 +951,15 @@ void TSSEEmulTest::Test_mm_mul_pd() {
     Test_mm_dualop<double, 2, WrapD(_mm_mul_pd), THelper, float64x2_t, __m128d>();
 }
 
-void TSSEEmulTest::Test_mm_div_ps() { 
-    struct THelper { 
-        static float Call(const float op1, const float op2) { 
-            return op1 / op2; 
-        } 
-    }; 
-    Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>(); 
-} 
- 
+void TSSEEmulTest::Test_mm_div_ps() {
+    struct THelper {
+        static float Call(const float op1, const float op2) {
+            return op1 / op2;
+        }
+    };
+    Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>();
+}
+
 void TSSEEmulTest::Test_mm_div_pd() {
     struct THelper {
         static double Call(const double op1, const double op2) {
@@ -969,441 +969,441 @@ void TSSEEmulTest::Test_mm_div_pd() {
     Test_mm_dualop<double, 2, WrapD(_mm_div_pd), THelper, float64x2_t, __m128d>();
 }
 
-void TSSEEmulTest::Test_mm_max_ps() { 
-    struct THelper { 
-        static float Call(const float op1, const float op2) { 
-            return std::max(op1, op2); 
-        } 
-    }; 
-    Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_min_ps() { 
-    struct THelper { 
-        static float Call(const float op1, const float op2) { 
-            return std::min(op1, op2); 
-        } 
-    }; 
-    Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_and_ps() { 
-    struct THelper { 
-        static float Call(const float op1, const float op2) { 
-            union Cast { 
-                unsigned int AsUInt; 
-                float AsFloat; 
-            }; 
-            Cast v1, v2, result; 
-            v1.AsFloat = op1; 
-            v2.AsFloat = op2; 
-            result.AsUInt = v1.AsUInt & v2.AsUInt; 
-            return result.AsFloat; 
-        } 
-    }; 
-    Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps), 
-                    THelper, float32x4_t, __m128>(); 
-} 
- 
-template <typename TElem, int bits, int elemCount, int shift, 
-          typename TFunc, typename TOp> 
-void TSSEEmulTest::Test_mm_unpack_epiXX() { 
-    char data1[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    char data2[16] = { 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    TElem* dataw1 = reinterpret_cast<TElem*>(&data1); 
-    TElem* dataw2 = reinterpret_cast<TElem*>(&data2); 
- 
-    __m128i value1 = _mm_loadu_si128((__m128i*)&data1); 
-    __m128i value2 = _mm_loadu_si128((__m128i*)&data2); 
- 
-    TElem zippedData[elemCount]; 
-    for (unsigned i = 0; i < elemCount / 2; ++i) { 
-        zippedData[i * 2] = dataw1[i + shift]; 
-        zippedData[i * 2 + 1] = dataw2[i + shift]; 
-    } 
-    __m128i result = TFunc(value1, value2); 
- 
-    for (unsigned i = 0; i < elemCount / 2; ++i) { 
-        UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]); 
-        UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1], 
-                          TQType<TOp>::As(result)[i * 2 + 1]); 
-    } 
-} 
- 
-void TSSEEmulTest::Test_mm_unpacklo_epi8() { 
-    Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpackhi_epi8() { 
-    Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpacklo_epi16() { 
-    Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpackhi_epi16() { 
-    Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpacklo_epi32() { 
-    Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpackhi_epi32() { 
-    Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpacklo_epi64() { 
-    Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_unpackhi_epi64() { 
-    Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>(); 
-} 
- 
-template <typename TElem, unsigned elemCount, 
-          typename TFunc, typename TElemFunc, 
+void TSSEEmulTest::Test_mm_max_ps() {
+    struct THelper {
+        static float Call(const float op1, const float op2) {
+            return std::max(op1, op2);
+        }
+    };
+    Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>();
+}
+
+void TSSEEmulTest::Test_mm_min_ps() {
+    struct THelper {
+        static float Call(const float op1, const float op2) {
+            return std::min(op1, op2);
+        }
+    };
+    Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>();
+}
+
+void TSSEEmulTest::Test_mm_and_ps() {
+    struct THelper {
+        static float Call(const float op1, const float op2) {
+            union Cast {
+                unsigned int AsUInt;
+                float AsFloat;
+            };
+            Cast v1, v2, result;
+            v1.AsFloat = op1;
+            v2.AsFloat = op2;
+            result.AsUInt = v1.AsUInt & v2.AsUInt;
+            return result.AsFloat;
+        }
+    };
+    Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps),
+                    THelper, float32x4_t, __m128>();
+}
+
+template <typename TElem, int bits, int elemCount, int shift,
+          typename TFunc, typename TOp>
+void TSSEEmulTest::Test_mm_unpack_epiXX() {
+    char data1[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    char data2[16] = {
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+    TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
+    TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
+
+    __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
+    __m128i value2 = _mm_loadu_si128((__m128i*)&data2);
+
+    TElem zippedData[elemCount];
+    for (unsigned i = 0; i < elemCount / 2; ++i) {
+        zippedData[i * 2] = dataw1[i + shift];
+        zippedData[i * 2 + 1] = dataw2[i + shift];
+    }
+    __m128i result = TFunc(value1, value2);
+
+    for (unsigned i = 0; i < elemCount / 2; ++i) {
+        UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]);
+        UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1],
+                          TQType<TOp>::As(result)[i * 2 + 1]);
+    }
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi8() {
+    Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi8() {
+    Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi16() {
+    Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi16() {
+    Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi32() {
+    Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi32() {
+    Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpacklo_epi64() {
+    Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_unpackhi_epi64() {
+    Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>();
+}
+
+template <typename TElem, unsigned elemCount,
+          typename TFunc, typename TElemFunc,
           typename TOp, typename TVectorType>
-void TSSEEmulTest::Test_mm_dualop() { 
-    char data1[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    char data2[16] = { 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    TElem* dataw1 = reinterpret_cast<TElem*>(&data1); 
-    TElem* dataw2 = reinterpret_cast<TElem*>(&data2); 
- 
+void TSSEEmulTest::Test_mm_dualop() {
+    char data1[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    char data2[16] = {
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+    TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
+    TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
+
     TVectorType value1 = TFuncLoad<TVectorType>(&data1);
     TVectorType value2 = TFuncLoad<TVectorType>(&data2);
- 
-    TElem procData[elemCount]; 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); 
-    } 
+
+    TElem procData[elemCount];
+    for (unsigned i = 0; i < elemCount; ++i) {
+        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
+    }
     TVectorType result = TFunc(value1, value2);
- 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); 
-    } 
-} 
- 
-/* This is almost the same as Test_mm_dualop, 
-   but different data1 and data2 */ 
-template <typename TElem, unsigned elemCount, 
-          typename TFunc, typename TElemFunc, 
+
+    for (unsigned i = 0; i < elemCount; ++i) {
+        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
+    }
+}
+
+/* This is almost the same as Test_mm_dualop,
+   but different data1 and data2 */
+template <typename TElem, unsigned elemCount,
+          typename TFunc, typename TElemFunc,
           typename TOp, typename TVectorType>
-void TSSEEmulTest::Test_mm_dualcmp() { 
-    char data1[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'}; 
-    char data2[16] = { 
-        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    TElem* dataw1 = reinterpret_cast<TElem*>(&data1); 
-    TElem* dataw2 = reinterpret_cast<TElem*>(&data2); 
- 
+void TSSEEmulTest::Test_mm_dualcmp() {
+    char data1[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'};
+    char data2[16] = {
+        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+    TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
+    TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
+
     TVectorType value1 = TFuncLoad<TVectorType>(&data1);
     TVectorType value2 = TFuncLoad<TVectorType>(&data2);
- 
-    TElem procData[elemCount]; 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); 
-    } 
+
+    TElem procData[elemCount];
+    for (unsigned i = 0; i < elemCount; ++i) {
+        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
+    }
     TVectorType result = TFunc(value1, value2);
- 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        /* memcmp is for compare to invalid floats in results */ 
+
+    for (unsigned i = 0; i < elemCount; ++i) {
+        /* memcmp is for compare to invalid floats in results */
         const TElem value = TQType<TOp>::As(result)[i];
         UNIT_ASSERT(memcmp(&(procData[i]), &value, sizeof(TElem)) == 0);
-    } 
-} 
- 
-void TSSEEmulTest::Test_mm_or_si128() { 
-    struct THelper { 
-        static ui64 Call(const ui64 op1, const ui64 op2) { 
-            return op1 | op2; 
-        } 
-    }; 
- 
-    Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_and_si128() { 
-    struct THelper { 
-        static ui64 Call(const ui64 op1, const ui64 op2) { 
-            return op1 & op2; 
-        } 
-    }; 
- 
-    Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_andnot_si128() { 
-    struct THelper { 
-        static ui64 Call(const ui64 op1, const ui64 op2) { 
-            return (~op1) & op2; 
-        } 
-    }; 
- 
-    Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>(); 
-} 
- 
-template <typename TElem> 
-struct THelperCMPEQ { 
-    static TElem Call(const TElem op1, const TElem op2) { 
-        return op1 == op2 ? ~TElem(0) : TElem(0); 
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_cmpeq_epi8() { 
-    Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8), 
-                    THelperCMPEQ<ui8>, uint8x16_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmpeq_epi16() { 
-    Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16), 
-                    THelperCMPEQ<ui16>, uint16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmpeq_epi32() { 
-    Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32), 
-                    THelperCMPEQ<ui32>, uint32x4_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmpeq_ps() { 
-    struct THelperFloat { 
-        static float Call(const float op1, const float op2) { 
-            union Cast { 
-                unsigned int AsUInt; 
-                float AsFloat; 
-            }; 
-            Cast value; 
-            value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0; 
-            return value.AsFloat; 
-        } 
-    }; 
- 
-    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps), 
-                    THelperFloat, float32x4_t, __m128>(); 
-} 
- 
-template <typename TElem> 
-struct THelperCMPGT { 
-    static TElem Call(const TElem op1, const TElem op2) { 
-        return op1 > op2 ? ~TElem(0) : TElem(0); 
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_cmpgt_epi8() { 
-    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8), 
-                    THelperCMPGT<i8>, int8x16_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmpgt_epi16() { 
-    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16), 
-                    THelperCMPGT<i16>, int16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmpgt_epi32() { 
-    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32), 
-                    THelperCMPGT<i32>, int32x4_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmpgt_ps() { 
-    struct THelperFloat { 
-        static float Call(const float op1, const float op2) { 
-            union Cast { 
-                unsigned int AsUInt; 
-                float AsFloat; 
-            }; 
-            Cast value; 
-            value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0; 
-            return value.AsFloat; 
-        } 
-    }; 
- 
-    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps), 
-                    THelperFloat, float32x4_t, __m128>(); 
-} 
- 
-template <typename TElem> 
-struct THelperCMPLT { 
-    static TElem Call(const TElem op1, const TElem op2) { 
-        return op1 < op2 ? ~TElem(0) : TElem(0); 
-    } 
-}; 
- 
-void TSSEEmulTest::Test_mm_cmplt_epi8() { 
-    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8), 
-                    THelperCMPLT<i8>, int8x16_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmplt_epi16() { 
-    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16), 
-                    THelperCMPLT<i16>, int16x8_t>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_cmplt_epi32() { 
-    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32), 
-                    THelperCMPLT<i32>, int32x4_t>(); 
-} 
- 
-template <typename TElem, int elemCount, 
+    }
+}
+
+void TSSEEmulTest::Test_mm_or_si128() {
+    struct THelper {
+        static ui64 Call(const ui64 op1, const ui64 op2) {
+            return op1 | op2;
+        }
+    };
+
+    Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_and_si128() {
+    struct THelper {
+        static ui64 Call(const ui64 op1, const ui64 op2) {
+            return op1 & op2;
+        }
+    };
+
+    Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>();
+}
+
+void TSSEEmulTest::Test_mm_andnot_si128() {
+    struct THelper {
+        static ui64 Call(const ui64 op1, const ui64 op2) {
+            return (~op1) & op2;
+        }
+    };
+
+    Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>();
+}
+
+template <typename TElem>
+struct THelperCMPEQ {
+    static TElem Call(const TElem op1, const TElem op2) {
+        return op1 == op2 ? ~TElem(0) : TElem(0);
+    }
+};
+
+void TSSEEmulTest::Test_mm_cmpeq_epi8() {
+    Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8),
+                    THelperCMPEQ<ui8>, uint8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpeq_epi16() {
+    Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16),
+                    THelperCMPEQ<ui16>, uint16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpeq_epi32() {
+    Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32),
+                    THelperCMPEQ<ui32>, uint32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpeq_ps() {
+    struct THelperFloat {
+        static float Call(const float op1, const float op2) {
+            union Cast {
+                unsigned int AsUInt;
+                float AsFloat;
+            };
+            Cast value;
+            value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0;
+            return value.AsFloat;
+        }
+    };
+
+    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps),
+                    THelperFloat, float32x4_t, __m128>();
+}
+
+template <typename TElem>
+struct THelperCMPGT {
+    static TElem Call(const TElem op1, const TElem op2) {
+        return op1 > op2 ? ~TElem(0) : TElem(0);
+    }
+};
+
+void TSSEEmulTest::Test_mm_cmpgt_epi8() {
+    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8),
+                    THelperCMPGT<i8>, int8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpgt_epi16() {
+    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16),
+                    THelperCMPGT<i16>, int16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpgt_epi32() {
+    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32),
+                    THelperCMPGT<i32>, int32x4_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmpgt_ps() {
+    struct THelperFloat {
+        static float Call(const float op1, const float op2) {
+            union Cast {
+                unsigned int AsUInt;
+                float AsFloat;
+            };
+            Cast value;
+            value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0;
+            return value.AsFloat;
+        }
+    };
+
+    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps),
+                    THelperFloat, float32x4_t, __m128>();
+}
+
+template <typename TElem>
+struct THelperCMPLT {
+    static TElem Call(const TElem op1, const TElem op2) {
+        return op1 < op2 ? ~TElem(0) : TElem(0);
+    }
+};
+
+void TSSEEmulTest::Test_mm_cmplt_epi8() {
+    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8),
+                    THelperCMPLT<i8>, int8x16_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmplt_epi16() {
+    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16),
+                    THelperCMPLT<i16>, int16x8_t>();
+}
+
+void TSSEEmulTest::Test_mm_cmplt_epi32() {
+    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32),
+                    THelperCMPLT<i32>, int32x4_t>();
+}
+
+template <typename TElem, int elemCount,
           typename TFunc, typename TOp, typename TVectorType>
-void TSSEEmulTest::Test_mm_setter_epiXX() { 
-    char data[64] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', 
-        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    TElem* dataw = reinterpret_cast<TElem*>(&data); 
- 
-    for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) { 
+void TSSEEmulTest::Test_mm_setter_epiXX() {
+    char data[64] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
+        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+    TElem* dataw = reinterpret_cast<TElem*>(&data);
+
+    for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) {
         TVectorType value = TFunc(dataw[dataItem]);
- 
-        for (unsigned i = 0; i < elemCount; ++i) 
-            UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]); 
-    } 
-} 
- 
-void TSSEEmulTest::Test_mm_set1_epi8() { 
-    Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>(); 
-} 
-void TSSEEmulTest::Test_mm_set1_epi16() { 
-    Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>(); 
-} 
-void TSSEEmulTest::Test_mm_set1_epi32() { 
-    Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>(); 
-} 
-void TSSEEmulTest::Test_mm_set1_ps() { 
-    Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>(); 
-} 
- 
+
+        for (unsigned i = 0; i < elemCount; ++i)
+            UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]);
+    }
+}
+
+void TSSEEmulTest::Test_mm_set1_epi8() {
+    Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>();
+}
+void TSSEEmulTest::Test_mm_set1_epi16() {
+    Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>();
+}
+void TSSEEmulTest::Test_mm_set1_epi32() {
+    Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>();
+}
+void TSSEEmulTest::Test_mm_set1_ps() {
+    Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>();
+}
+
 void TSSEEmulTest::Test_mm_set_ps1() {
     Test_mm_setter_epiXX<float, 4, WrapF(_mm_set_ps1), float32x4_t, __m128>();
 }
 
-void TSSEEmulTest::Test_mm_setzero_si128() { 
-    __m128i value = _mm_setzero_si128(); 
-    for (unsigned i = 0; i < 4; ++i) 
-        UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]); 
-} 
- 
-void TSSEEmulTest::Test_mm_setzero_ps() { 
-    __m128 value = _mm_setzero_ps(); 
-    for (unsigned i = 0; i < 4; ++i) 
-        UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]); 
-} 
- 
+void TSSEEmulTest::Test_mm_setzero_si128() {
+    __m128i value = _mm_setzero_si128();
+    for (unsigned i = 0; i < 4; ++i)
+        UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]);
+}
+
+void TSSEEmulTest::Test_mm_setzero_ps() {
+    __m128 value = _mm_setzero_ps();
+    for (unsigned i = 0; i < 4; ++i)
+        UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]);
+}
+
 void TSSEEmulTest::Test_mm_setzero_pd() {
     __m128d value = _mm_setzero_pd();
     for (unsigned i = 0; i < 2; ++i)
         UNIT_ASSERT_EQUAL(0.0, TQType<float64x2_t>::As(value)[i]);
 }
 
-void TSSEEmulTest::Test_mm_loadl_epi64() { 
-    char data[64] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', 
-        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    ui64* dataw = reinterpret_cast<ui64*>(&data); 
- 
-    for (unsigned dataItem = 0; dataItem < 8; ++dataItem) { 
-        __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]); 
- 
-        UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]); 
-        UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]); 
-    } 
-} 
- 
-void TSSEEmulTest::Test_mm_storel_epi64() { 
-    char data[64] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', 
-        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', 
-        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    ui64* dataw = reinterpret_cast<ui64*>(&data); 
- 
-    for (unsigned dataItem = 0; dataItem < 4; ++dataItem) { 
-        __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]); 
- 
-        ui64 buf[2] = {55, 81}; 
-        _mm_storel_epi64((__m128i*)&buf, value); 
- 
-        UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]); 
-        UNIT_ASSERT_EQUAL(81, buf[1]); 
-    } 
-} 
- 
-void TSSEEmulTest::Test_mm_shuffle_epi32() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    ui32* dataw = reinterpret_cast<ui32*>(&data); 
-    __m128i value = _mm_loadu_si128((__m128i*)&data); 
- 
-    int coding[4] = {1, 3, 0, 2}; 
-    __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1)); 
- 
-    for (unsigned i = 0; i < 4; ++i) 
-        UNIT_ASSERT_EQUAL(dataw[coding[i]], 
-                          TQType<uint32x4_t>::As(result)[i]); 
-} 
- 
-static int GetHighBitAt(char data, int at) { 
-    ui8 udata = data & 0x80; 
-    return int(udata >> 7) << at; 
-} 
- 
-void TSSEEmulTest::Test_mm_movemask_epi8() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    __m128i value = _mm_loadu_si128((__m128i*)&data); 
- 
-    int result = _mm_movemask_epi8(value); 
-    int verify = 0; 
-    for (unsigned i = 0; i < 16; ++i) { 
-        verify |= GetHighBitAt(data[i], i); 
-    } 
- 
-    UNIT_ASSERT_EQUAL(result, verify); 
-} 
- 
-void TSSEEmulTest::Test_mm_movemask_ps() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    __m128 value = _mm_loadu_ps((float*)&data); 
- 
-    int result = _mm_movemask_ps(value); 
-    int verify = 0; 
-    for (unsigned i = 0; i < 4; ++i) { 
-        verify |= GetHighBitAt(data[i * 4 + 3], i); 
-    } 
- 
-    UNIT_ASSERT_EQUAL(result, verify); 
-} 
- 
+void TSSEEmulTest::Test_mm_loadl_epi64() {
+    char data[64] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
+        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+    ui64* dataw = reinterpret_cast<ui64*>(&data);
+
+    for (unsigned dataItem = 0; dataItem < 8; ++dataItem) {
+        __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]);
+
+        UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]);
+        UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]);
+    }
+}
+
+void TSSEEmulTest::Test_mm_storel_epi64() {
+    char data[64] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
+        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
+        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
+    ui64* dataw = reinterpret_cast<ui64*>(&data);
+
+    for (unsigned dataItem = 0; dataItem < 4; ++dataItem) {
+        __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]);
+
+        ui64 buf[2] = {55, 81};
+        _mm_storel_epi64((__m128i*)&buf, value);
+
+        UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]);
+        UNIT_ASSERT_EQUAL(81, buf[1]);
+    }
+}
+
+void TSSEEmulTest::Test_mm_shuffle_epi32() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    ui32* dataw = reinterpret_cast<ui32*>(&data);
+    __m128i value = _mm_loadu_si128((__m128i*)&data);
+
+    int coding[4] = {1, 3, 0, 2};
+    __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1));
+
+    for (unsigned i = 0; i < 4; ++i)
+        UNIT_ASSERT_EQUAL(dataw[coding[i]],
+                          TQType<uint32x4_t>::As(result)[i]);
+}
+
+static int GetHighBitAt(char data, int at) {
+    ui8 udata = data & 0x80;
+    return int(udata >> 7) << at;
+}
+
+void TSSEEmulTest::Test_mm_movemask_epi8() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    __m128i value = _mm_loadu_si128((__m128i*)&data);
+
+    int result = _mm_movemask_epi8(value);
+    int verify = 0;
+    for (unsigned i = 0; i < 16; ++i) {
+        verify |= GetHighBitAt(data[i], i);
+    }
+
+    UNIT_ASSERT_EQUAL(result, verify);
+}
+
+void TSSEEmulTest::Test_mm_movemask_ps() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    __m128 value = _mm_loadu_ps((float*)&data);
+
+    int result = _mm_movemask_ps(value);
+    int verify = 0;
+    for (unsigned i = 0; i < 4; ++i) {
+        verify |= GetHighBitAt(data[i * 4 + 3], i);
+    }
+
+    UNIT_ASSERT_EQUAL(result, verify);
+}
+
 void TSSEEmulTest::Test_mm_movemask_ps_2() {
     char data[16] = {
         '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF',
@@ -1414,19 +1414,19 @@ void TSSEEmulTest::Test_mm_movemask_ps_2() {
     UNIT_ASSERT_EQUAL(result, 0xf);
 }
 
-void TSSEEmulTest::Test_mm_cvtsi128_si32() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    __m128i value = _mm_loadu_si128((__m128i*)&data); 
- 
-    int result = _mm_cvtsi128_si32(value); 
-    i32* datap = reinterpret_cast<i32*>(&data); 
-    int verify = datap[0]; 
- 
-    UNIT_ASSERT_EQUAL(result, verify); 
-} 
- 
+void TSSEEmulTest::Test_mm_cvtsi128_si32() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    __m128i value = _mm_loadu_si128((__m128i*)&data);
+
+    int result = _mm_cvtsi128_si32(value);
+    i32* datap = reinterpret_cast<i32*>(&data);
+    int verify = datap[0];
+
+    UNIT_ASSERT_EQUAL(result, verify);
+}
+
 void TSSEEmulTest::Test_mm_cvtsi128_si64() {
     char data[16] = {
         '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1440,52 +1440,52 @@ void TSSEEmulTest::Test_mm_cvtsi128_si64() {
     UNIT_ASSERT_EQUAL(result, verify);
 }
 
-void TSSEEmulTest::Test_mm_set_epi16() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    i16* dataw = reinterpret_cast<i16*>(&data); 
-    ui64* dataq = reinterpret_cast<ui64*>(&data); 
- 
-    __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4], 
-                                   dataw[3], dataw[2], dataw[1], dataw[0]); 
-    ui64 buf[2] = {53, 81}; 
-    _mm_storeu_si128((__m128i*)&buf, result); 
- 
-    UNIT_ASSERT_EQUAL(buf[0], dataq[0]); 
-    UNIT_ASSERT_EQUAL(buf[1], dataq[1]); 
-} 
- 
-void TSSEEmulTest::Test_mm_set_epi32() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    i32* dataw = reinterpret_cast<i32*>(&data); 
-    ui64* dataq = reinterpret_cast<ui64*>(&data); 
- 
-    __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]); 
-    ui64 buf[2] = {53, 81}; 
-    _mm_storeu_si128((__m128i*)&buf, result); 
- 
-    UNIT_ASSERT_EQUAL(buf[0], dataq[0]); 
-    UNIT_ASSERT_EQUAL(buf[1], dataq[1]); 
-} 
- 
-void TSSEEmulTest::Test_mm_set_ps() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    float* dataw = reinterpret_cast<float*>(&data); 
-    ui64* dataq = reinterpret_cast<ui64*>(&data); 
- 
-    __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]); 
-    ui64 buf[2] = {53, 81}; 
-    _mm_storeu_ps((float*)&buf, result); 
- 
-    UNIT_ASSERT_EQUAL(buf[0], dataq[0]); 
-    UNIT_ASSERT_EQUAL(buf[1], dataq[1]); 
-} 
- 
+void TSSEEmulTest::Test_mm_set_epi16() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    i16* dataw = reinterpret_cast<i16*>(&data);
+    ui64* dataq = reinterpret_cast<ui64*>(&data);
+
+    __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4],
+                                   dataw[3], dataw[2], dataw[1], dataw[0]);
+    ui64 buf[2] = {53, 81};
+    _mm_storeu_si128((__m128i*)&buf, result);
+
+    UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
+    UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
+}
+
+void TSSEEmulTest::Test_mm_set_epi32() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    i32* dataw = reinterpret_cast<i32*>(&data);
+    ui64* dataq = reinterpret_cast<ui64*>(&data);
+
+    __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]);
+    ui64 buf[2] = {53, 81};
+    _mm_storeu_si128((__m128i*)&buf, result);
+
+    UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
+    UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
+}
+
+void TSSEEmulTest::Test_mm_set_ps() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    float* dataw = reinterpret_cast<float*>(&data);
+    ui64* dataq = reinterpret_cast<ui64*>(&data);
+
+    __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]);
+    ui64 buf[2] = {53, 81};
+    _mm_storeu_ps((float*)&buf, result);
+
+    UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
+    UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
+}
+
 void TSSEEmulTest::Test_mm_set_pd() {
     char data[16] = {
         '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1501,22 +1501,22 @@ void TSSEEmulTest::Test_mm_set_pd() {
     UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
 }
 
-void TSSEEmulTest::Test_mm_cvtsi32_si128() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    i32* dataw = reinterpret_cast<i32*>(&data); 
- 
-    __m128i result = _mm_cvtsi32_si128(dataw[0]); 
-    i32 buf[4] = {53, 81, -43, 2132}; 
-    _mm_storeu_si128((__m128i*)&buf, result); 
- 
-    UNIT_ASSERT_EQUAL(buf[0], dataw[0]); 
-    UNIT_ASSERT_EQUAL(buf[1], 0); 
-    UNIT_ASSERT_EQUAL(buf[2], 0); 
-    UNIT_ASSERT_EQUAL(buf[3], 0); 
-} 
- 
+void TSSEEmulTest::Test_mm_cvtsi32_si128() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    i32* dataw = reinterpret_cast<i32*>(&data);
+
+    __m128i result = _mm_cvtsi32_si128(dataw[0]);
+    i32 buf[4] = {53, 81, -43, 2132};
+    _mm_storeu_si128((__m128i*)&buf, result);
+
+    UNIT_ASSERT_EQUAL(buf[0], dataw[0]);
+    UNIT_ASSERT_EQUAL(buf[1], 0);
+    UNIT_ASSERT_EQUAL(buf[2], 0);
+    UNIT_ASSERT_EQUAL(buf[3], 0);
+}
+
 void TSSEEmulTest::Test_mm_cvtsi64_si128() {
     char data[16] = {
         '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1531,44 +1531,44 @@ void TSSEEmulTest::Test_mm_cvtsi64_si128() {
     UNIT_ASSERT_EQUAL(buf[1], 0);
 }
 
-template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc> 
-void TSSEEmulTest::Test_mm_packs_epiXX() { 
-    char data[32] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C', 
-        '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00', 
-        '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    __m128i value0 = _mm_loadu_si128((__m128i*)&data); 
-    __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1); 
-    TElem* dataw = reinterpret_cast<TElem*>(&data); 
- 
-    __m128i result = TFunc(value0, value1); 
- 
-    TNarrow verify[elemCount]; 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        TElem sum = dataw[i]; 
-        if (sum > std::numeric_limits<TNarrow>::max()) 
-            sum = std::numeric_limits<TNarrow>::max(); 
-        if (sum < std::numeric_limits<TNarrow>::min()) 
-            sum = std::numeric_limits<TNarrow>::min(); 
-        verify[i] = TNarrow(sum); 
-    } 
- 
-    ui64* verifyp = (ui64*)&verify; 
-    UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]); 
-    UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]); 
-} 
- 
-void TSSEEmulTest::Test_mm_packs_epi16() { 
-    Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>(); 
-} 
-void TSSEEmulTest::Test_mm_packs_epi32() { 
-    Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>(); 
-} 
-void TSSEEmulTest::Test_mm_packus_epi16() { 
-    Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>(); 
-} 
- 
+template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc>
+void TSSEEmulTest::Test_mm_packs_epiXX() {
+    char data[32] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C',
+        '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00',
+        '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'};
+    __m128i value0 = _mm_loadu_si128((__m128i*)&data);
+    __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1);
+    TElem* dataw = reinterpret_cast<TElem*>(&data);
+
+    __m128i result = TFunc(value0, value1);
+
+    TNarrow verify[elemCount];
+    for (unsigned i = 0; i < elemCount; ++i) {
+        TElem sum = dataw[i];
+        if (sum > std::numeric_limits<TNarrow>::max())
+            sum = std::numeric_limits<TNarrow>::max();
+        if (sum < std::numeric_limits<TNarrow>::min())
+            sum = std::numeric_limits<TNarrow>::min();
+        verify[i] = TNarrow(sum);
+    }
+
+    ui64* verifyp = (ui64*)&verify;
+    UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]);
+    UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]);
+}
+
+void TSSEEmulTest::Test_mm_packs_epi16() {
+    Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>();
+}
+void TSSEEmulTest::Test_mm_packs_epi32() {
+    Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>();
+}
+void TSSEEmulTest::Test_mm_packus_epi16() {
+    Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>();
+}
+
 void TSSEEmulTest::Test_mm_extract_epi8() {
     alignas(16) char data[16] = {
         '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1594,23 +1594,23 @@ void TSSEEmulTest::Test_mm_extract_epi8() {
     UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 15)), int(dataw[15]));
 }
 
-void TSSEEmulTest::Test_mm_extract_epi16() { 
+void TSSEEmulTest::Test_mm_extract_epi16() {
     alignas(16) char data[16] = {
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
     const ui16* dataw = reinterpret_cast<const ui16*>(&data);
     const __m128i value = _mm_loadu_si128((__m128i*)&data);
- 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6])); 
-    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7])); 
-} 
- 
+
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6]));
+    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7]));
+}
+
 void TSSEEmulTest::Test_mm_extract_epi64() {
     alignas(16) char data[16] = {
         '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
@@ -1635,160 +1635,160 @@ void TSSEEmulTest::Test_mm_extract_epi32() {
     UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 3)), int(dataw[3]));
 }
 
-void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() { 
-    char data0[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    char data1[16] = { 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    char data2[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    char data3[16] = { 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; 
- 
-    __m128 value0 = _mm_loadu_ps((float*)&data0); 
-    __m128 value1 = _mm_loadu_ps((float*)&data1); 
-    __m128 value2 = _mm_loadu_ps((float*)&data2); 
-    __m128 value3 = _mm_loadu_ps((float*)&data3); 
- 
-    _MM_TRANSPOSE4_PS(value0, value1, value2, value3); 
- 
-    ui64 tbuf0[2] = {0, 0}; 
-    ui64 tbuf1[2] = {0, 0}; 
-    ui64 tbuf2[2] = {0, 0}; 
-    ui64 tbuf3[2] = {0, 0}; 
- 
-    _mm_storeu_ps((float*)&tbuf0, value0); 
-    _mm_storeu_ps((float*)&tbuf1, value1); 
-    _mm_storeu_ps((float*)&tbuf2, value2); 
-    _mm_storeu_ps((float*)&tbuf3, value3); 
- 
-    char tdata0[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55', 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'}; 
-    char tdata1[16] = { 
-        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44', 
-        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'}; 
-    char tdata2[16] = { 
-        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11', 
-        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'}; 
-    char tdata3[16] = { 
-        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF', 
-        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'}; 
- 
-    UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0); 
-    UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0); 
-    UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0); 
-    UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0); 
-} 
- 
-template <typename TFrom, typename TTo, unsigned elemCount, 
-          typename TLoadVector, typename TResultVector, 
-          typename TElemFunc, typename TFunc, typename TOp> 
-void TSSEEmulTest::Test_mm_convertop() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    TFrom* datap = reinterpret_cast<TFrom*>(&data); 
- 
-    TLoadVector value = TFuncLoad<TLoadVector>(&data); 
- 
-    TTo procData[elemCount]; 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        procData[i] = TElemFunc::Call(datap[i]); 
-    } 
- 
-    TResultVector result = TFunc(value); 
- 
-    for (unsigned i = 0; i < elemCount; ++i) { 
-        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); 
-    } 
-} 
- 
-void TSSEEmulTest::Test_mm_cvtepi32_ps() { 
-    struct THelper { 
-        static float Call(const i32 op) { 
-            return float(op); 
-        } 
-    }; 
-    Test_mm_convertop<i32, float, 4, __m128i, __m128, 
-                      THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>(); 
-}; 
- 
-void TSSEEmulTest::Test_mm_cvtps_epi32() { 
-    struct THelper { 
-        static i32 Call(const float op) { 
-            return i32(op); 
-        } 
-    }; 
-    Test_mm_convertop<float, i32, 4, __m128, __m128i, 
+void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() {
+    char data0[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    char data1[16] = {
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+    char data2[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    char data3[16] = {
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+
+    __m128 value0 = _mm_loadu_ps((float*)&data0);
+    __m128 value1 = _mm_loadu_ps((float*)&data1);
+    __m128 value2 = _mm_loadu_ps((float*)&data2);
+    __m128 value3 = _mm_loadu_ps((float*)&data3);
+
+    _MM_TRANSPOSE4_PS(value0, value1, value2, value3);
+
+    ui64 tbuf0[2] = {0, 0};
+    ui64 tbuf1[2] = {0, 0};
+    ui64 tbuf2[2] = {0, 0};
+    ui64 tbuf3[2] = {0, 0};
+
+    _mm_storeu_ps((float*)&tbuf0, value0);
+    _mm_storeu_ps((float*)&tbuf1, value1);
+    _mm_storeu_ps((float*)&tbuf2, value2);
+    _mm_storeu_ps((float*)&tbuf3, value3);
+
+    char tdata0[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55',
+        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'};
+    char tdata1[16] = {
+        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44',
+        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'};
+    char tdata2[16] = {
+        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11',
+        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'};
+    char tdata3[16] = {
+        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF',
+        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'};
+
+    UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0);
+    UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0);
+    UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0);
+    UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0);
+}
+
+template <typename TFrom, typename TTo, unsigned elemCount,
+          typename TLoadVector, typename TResultVector,
+          typename TElemFunc, typename TFunc, typename TOp>
+void TSSEEmulTest::Test_mm_convertop() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    TFrom* datap = reinterpret_cast<TFrom*>(&data);
+
+    TLoadVector value = TFuncLoad<TLoadVector>(&data);
+
+    TTo procData[elemCount];
+    for (unsigned i = 0; i < elemCount; ++i) {
+        procData[i] = TElemFunc::Call(datap[i]);
+    }
+
+    TResultVector result = TFunc(value);
+
+    for (unsigned i = 0; i < elemCount; ++i) {
+        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
+    }
+}
+
+void TSSEEmulTest::Test_mm_cvtepi32_ps() {
+    struct THelper {
+        static float Call(const i32 op) {
+            return float(op);
+        }
+    };
+    Test_mm_convertop<i32, float, 4, __m128i, __m128,
+                      THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>();
+};
+
+void TSSEEmulTest::Test_mm_cvtps_epi32() {
+    struct THelper {
+        static i32 Call(const float op) {
+            return i32(op);
+        }
+    };
+    Test_mm_convertop<float, i32, 4, __m128, __m128i,
                       THelper, T_mm_CallWrapper<__m128i, decltype(_mm_cvtps_epi32), _mm_cvtps_epi32>, int32x4_t>();
-}; 
- 
-void TSSEEmulTest::Test_mm_cvttps_epi32() { 
-    struct THelper { 
-        static i32 Call(const float op) { 
-            return i32(op); 
-        } 
-    }; 
-    Test_mm_convertop<float, i32, 4, __m128, __m128i, 
-                      THelper, Wrap(_mm_cvttps_epi32), int32x4_t>(); 
-}; 
- 
-template <typename TLoadVector, typename TCastVector, 
-          typename TFunc, TFunc* func> 
-void TSSEEmulTest::Test_mm_castXX() { 
-    char data[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
- 
-    TLoadVector value = TFuncLoad<TLoadVector>(&data); 
-    const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data); 
-    TCastVector casted = func(value); 
-    const TCastVector constcasted = func(constvalue); 
-    char verify[16]; 
-    char constverify[16]; 
-    TFuncStore<TCastVector>(&verify, casted); 
-    TFuncStore<TCastVector>(&constverify, constcasted); 
- 
-    UNIT_ASSERT(memcmp(&data, &verify, 16) == 0); 
-    UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0); 
-}; 
- 
-void TSSEEmulTest::Test_mm_castsi128_ps() { 
-    Test_mm_castXX<__m128i, __m128, 
-                   decltype(_mm_castsi128_ps), _mm_castsi128_ps>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_castps_si128() { 
-    Test_mm_castXX<__m128, __m128i, 
-                   decltype(_mm_castps_si128), _mm_castps_si128>(); 
-} 
- 
-void TSSEEmulTest::Test_mm_mul_epu32() { 
-    char data0[16] = { 
-        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', 
-        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; 
-    char data1[16] = { 
-        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', 
-        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; 
-    ui32* dataw0 = reinterpret_cast<ui32*>(&data0); 
-    ui32* dataw1 = reinterpret_cast<ui32*>(&data1); 
- 
-    __m128i value0 = _mm_loadu_si128((__m128i*)&data0); 
-    __m128i value1 = _mm_loadu_si128((__m128i*)&data1); 
- 
+};
+
+void TSSEEmulTest::Test_mm_cvttps_epi32() {
+    struct THelper {
+        static i32 Call(const float op) {
+            return i32(op);
+        }
+    };
+    Test_mm_convertop<float, i32, 4, __m128, __m128i,
+                      THelper, Wrap(_mm_cvttps_epi32), int32x4_t>();
+};
+
+template <typename TLoadVector, typename TCastVector,
+          typename TFunc, TFunc* func>
+void TSSEEmulTest::Test_mm_castXX() {
+    char data[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+
+    TLoadVector value = TFuncLoad<TLoadVector>(&data);
+    const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data);
+    TCastVector casted = func(value);
+    const TCastVector constcasted = func(constvalue);
+    char verify[16];
+    char constverify[16];
+    TFuncStore<TCastVector>(&verify, casted);
+    TFuncStore<TCastVector>(&constverify, constcasted);
+
+    UNIT_ASSERT(memcmp(&data, &verify, 16) == 0);
+    UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0);
+};
+
+void TSSEEmulTest::Test_mm_castsi128_ps() {
+    Test_mm_castXX<__m128i, __m128,
+                   decltype(_mm_castsi128_ps), _mm_castsi128_ps>();
+}
+
+void TSSEEmulTest::Test_mm_castps_si128() {
+    Test_mm_castXX<__m128, __m128i,
+                   decltype(_mm_castps_si128), _mm_castps_si128>();
+}
+
+void TSSEEmulTest::Test_mm_mul_epu32() {
+    char data0[16] = {
+        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
+        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
+    char data1[16] = {
+        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
+        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
+    ui32* dataw0 = reinterpret_cast<ui32*>(&data0);
+    ui32* dataw1 = reinterpret_cast<ui32*>(&data1);
+
+    __m128i value0 = _mm_loadu_si128((__m128i*)&data0);
+    __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
+
     ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0];
     ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2];
- 
-    __m128i result = _mm_mul_epu32(value0, value1); 
- 
-    UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]); 
-    UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]); 
-} 
+
+    __m128i result = _mm_mul_epu32(value0, value1);
+
+    UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]);
+    UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]);
+}
 
 void TSSEEmulTest::Test_mm_cmpunord_ps() {
     alignas(16) float valuesBits[4] = {1.f, 2.f, 3.f, 4.f};
diff --git a/library/cpp/sse/ut/ya.make b/library/cpp/sse/ut/ya.make
index 14cac6727a..45e104971e 100644
--- a/library/cpp/sse/ut/ya.make
+++ b/library/cpp/sse/ut/ya.make
@@ -1,13 +1,13 @@
 UNITTEST_FOR(library/cpp/sse)
- 
+
 OWNER(danlark)
- 
-SRCS( 
+
+SRCS(
     test.cpp
-) 
- 
+)
+
 IF (ARCH_X86_64)
     CFLAGS(-msse4.1 -msse4.2)
 ENDIF()
 
-END() 
+END()
author	agri <agri@yandex-team.ru>	2022-02-10 16:48:12 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:48:12 +0300
commit	2909866fbc652492b7d7cab3023cb19489dc4fd8 (patch)
tree	b222e5ac2e2e98872661c51ccceee5da0d291e13 /library/cpp/sse
parent	d3530b2692e400bd4d29bd4f07cafaee139164e7 (diff)
download	ydb-2909866fbc652492b7d7cab3023cb19489dc4fd8.tar.gz