diff options
| author | agri <[email protected]> | 2022-02-10 16:48:12 +0300 | 
|---|---|---|
| committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:48:12 +0300 | 
| commit | d3530b2692e400bd4d29bd4f07cafaee139164e7 (patch) | |
| tree | b7ae636a74490e649a2ed0fdd5361f1bec83b9f9 /library/cpp/sse/ut/test.cpp | |
| parent | 0f4c5d1e8c0672bf0a1f2f2d8acac5ba24772435 (diff) | |
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/sse/ut/test.cpp')
| -rw-r--r-- | library/cpp/sse/ut/test.cpp | 2290 | 
1 files changed, 1145 insertions, 1145 deletions
| diff --git a/library/cpp/sse/ut/test.cpp b/library/cpp/sse/ut/test.cpp index 33c999d284b..42a82a8cfa6 100644 --- a/library/cpp/sse/ut/test.cpp +++ b/library/cpp/sse/ut/test.cpp @@ -1,10 +1,10 @@ -/* -  Unittests for all SSE instrinsics translated to NEON instrinsics or -  software implementation. -  Should be tested both on Intel and ARM64. - */ -/* Author: Vitaliy Manushkin <[email protected] */ - +/*  +  Unittests for all SSE instrinsics translated to NEON instrinsics or  +  software implementation.  +  Should be tested both on Intel and ARM64.  + */  +/* Author: Vitaliy Manushkin <[email protected] */  +   #include <library/cpp/testing/unittest/registar.h>  #include <util/generic/typetraits.h> @@ -13,35 +13,35 @@  #include <util/stream/output.h>  #include <algorithm> -#include <array> -#include <limits> +#include <array>  +#include <limits>   #include <memory>  #include <type_traits>  #include <utility> - -template <typename TResult, typename TFunc, TFunc* func> -struct T_mm_CallWrapper { -    TResult Value; - -    template <typename... TParams> -    T_mm_CallWrapper(TParams&&... params) { -        Value = func(std::forward<TParams>(params)...); -    } - -    operator TResult&() { -        return Value; -    } - -    operator const TResult&() const { -        return Value; -    } -}; - -#if defined(_arm64_) +  +template <typename TResult, typename TFunc, TFunc* func>  +struct T_mm_CallWrapper {  +    TResult Value;  +  +    template <typename... TParams>  +    T_mm_CallWrapper(TParams&&... params) {  +        Value = func(std::forward<TParams>(params)...);  +    }  +  +    operator TResult&() {  +        return Value;  +    }  +  +    operator const TResult&() const {  +        return Value;  +    }  +};  +  +#if defined(_arm64_)   #include "library/cpp/sse/sse2neon.h"  #elif defined(_i386_) || defined(_x86_64_) -#include <xmmintrin.h> -#include <emmintrin.h> +#include <xmmintrin.h>  +#include <emmintrin.h>   #include <smmintrin.h>  #elif defined(_ppc64_)  #include "library/cpp/sse/powerpc.h" @@ -54,10 +54,10 @@ struct T_mm_CallWrapper {  #define WrapF(T_mm_func) T_mm_func  #define WrapD(T_mm_func) T_mm_func  #elif defined(_ppc64_) || defined(_i386_) || defined(_x86_64_) -#define Wrap(_mm_func) \ -    T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func> -#define WrapF(_mm_func) \ -    T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func> +#define Wrap(_mm_func) \  +    T_mm_CallWrapper<__m128i, decltype(_mm_func), _mm_func>  +#define WrapF(_mm_func) \  +    T_mm_CallWrapper<__m128, decltype(_mm_func), _mm_func>   #define WrapD(_mm_func) \      T_mm_CallWrapper<__m128d, decltype(_mm_func), _mm_func>  using int8x16_t = std::array<i8, 16>; @@ -70,69 +70,69 @@ using uint32x4_t = std::array<ui32, 4>;  using uint64x2_t = std::array<ui64, 2>;  using float32x4_t = std::array<float, 4>;  using float64x2_t = std::array<double, 2>; - +   template <typename TVectorType> -struct TQType { +struct TQType {       static TVectorType As(__m128i param) {          TVectorType value; -        _mm_storeu_si128((__m128i*)&value, param); -        return value; -    } +        _mm_storeu_si128((__m128i*)&value, param);  +        return value;  +    }       static TVectorType As(__m128 param) {          TVectorType value; -        _mm_storeu_ps((float*)&value, param); -        return value; -    } +        _mm_storeu_ps((float*)&value, param);  +        return value;  +    }       static TVectorType As(__m128d param) {          TVectorType value;          _mm_storeu_pd((double*)&value, param);          return value;      } -}; -#endif - +};  +#endif  +   template <typename TVectorType> -struct TFuncLoad; +struct TFuncLoad;   template <typename TVectorType> -struct TFuncStore; - -template <> -struct TFuncLoad<__m128i> { -    __m128i Value; - -    template <typename TPointer> -    TFuncLoad(TPointer* ptr) { -        Value = _mm_loadu_si128((__m128i*)ptr); -    } - -    operator __m128i&() { -        return Value; -    } - -    operator const __m128i&() const { -        return Value; -    } -}; - -template <> -struct TFuncLoad<__m128> { -    __m128 Value; - -    template <typename TPointer> -    TFuncLoad(TPointer* ptr) { -        Value = _mm_loadu_ps((float*)ptr); -    } - -    operator __m128&() { -        return Value; -    } - -    operator const __m128&() const { -        return Value; -    } -}; - -template <> +struct TFuncStore;  +  +template <>  +struct TFuncLoad<__m128i> {  +    __m128i Value;  +  +    template <typename TPointer>  +    TFuncLoad(TPointer* ptr) {  +        Value = _mm_loadu_si128((__m128i*)ptr);  +    }  +  +    operator __m128i&() {  +        return Value;  +    }  +  +    operator const __m128i&() const {  +        return Value;  +    }  +};  +  +template <>  +struct TFuncLoad<__m128> {  +    __m128 Value;  +  +    template <typename TPointer>  +    TFuncLoad(TPointer* ptr) {  +        Value = _mm_loadu_ps((float*)ptr);  +    }  +  +    operator __m128&() {  +        return Value;  +    }  +  +    operator const __m128&() const {  +        return Value;  +    }  +};  +  +template <>   struct TFuncLoad<__m128d> {      __m128d Value; @@ -151,153 +151,153 @@ struct TFuncLoad<__m128d> {  };  template <> -struct TFuncStore<__m128i> { -    template <typename TPointer> -    TFuncStore(TPointer* ptr, __m128i Value) { -        _mm_storeu_si128((__m128i*)ptr, Value); -    } -}; - -template <> -struct TFuncStore<__m128> { -    template <typename TPointer> -    TFuncStore(TPointer* ptr, __m128 Value) { -        _mm_storeu_ps((float*)ptr, Value); -    } -}; - -class TSSEEmulTest: public TTestBase { -private: -    UNIT_TEST_SUITE(TSSEEmulTest); -    UNIT_TEST(Test_mm_load_si128); -    UNIT_TEST(Test_mm_loadu_si128); +struct TFuncStore<__m128i> {  +    template <typename TPointer>  +    TFuncStore(TPointer* ptr, __m128i Value) {  +        _mm_storeu_si128((__m128i*)ptr, Value);  +    }  +};  +  +template <>  +struct TFuncStore<__m128> {  +    template <typename TPointer>  +    TFuncStore(TPointer* ptr, __m128 Value) {  +        _mm_storeu_ps((float*)ptr, Value);  +    }  +};  +  +class TSSEEmulTest: public TTestBase {  +private:  +    UNIT_TEST_SUITE(TSSEEmulTest);  +    UNIT_TEST(Test_mm_load_si128);  +    UNIT_TEST(Test_mm_loadu_si128);       UNIT_TEST(Test_mm_storeu_si128);      UNIT_TEST(Test_mm_loadu_si128_2);      UNIT_TEST(Test_mm_loadu_ps);      UNIT_TEST(Test_mm_storeu_ps); - +       UNIT_TEST(Test_mm_slli_epi16);      UNIT_TEST(Test_mm_slli_epi32);      UNIT_TEST(Test_mm_slli_epi64);      UNIT_TEST(Test_mm_slli_si128); -    UNIT_TEST(Test_mm_srli_epi16); -    UNIT_TEST(Test_mm_srli_epi32); -    UNIT_TEST(Test_mm_srli_epi64); +    UNIT_TEST(Test_mm_srli_epi16);  +    UNIT_TEST(Test_mm_srli_epi32);  +    UNIT_TEST(Test_mm_srli_epi64);       UNIT_TEST(Test_mm_srli_si128); - +       UNIT_TEST(Test_mm_srai_epi16);      UNIT_TEST(Test_mm_srai_epi32);      UNIT_TEST(Test_mm_sll_epi16);      UNIT_TEST(Test_mm_sll_epi32);      UNIT_TEST(Test_mm_sll_epi64); - +       UNIT_TEST(Test_mm_srl_epi16);      UNIT_TEST(Test_mm_srl_epi32);      UNIT_TEST(Test_mm_srl_epi64); -    UNIT_TEST(Test_mm_add_epi16); -    UNIT_TEST(Test_mm_add_epi32); -    UNIT_TEST(Test_mm_add_epi64); -    UNIT_TEST(Test_mm_add_ps); +    UNIT_TEST(Test_mm_add_epi16);  +    UNIT_TEST(Test_mm_add_epi32);  +    UNIT_TEST(Test_mm_add_epi64);  +    UNIT_TEST(Test_mm_add_ps);       UNIT_TEST(Test_mm_add_pd); - +       UNIT_TEST(Test_mm_madd_epi16); -    UNIT_TEST(Test_mm_sub_epi16); -    UNIT_TEST(Test_mm_sub_epi32); -    UNIT_TEST(Test_mm_sub_epi64); -    UNIT_TEST(Test_mm_sub_ps); +    UNIT_TEST(Test_mm_sub_epi16);  +    UNIT_TEST(Test_mm_sub_epi32);  +    UNIT_TEST(Test_mm_sub_epi64);  +    UNIT_TEST(Test_mm_sub_ps);       UNIT_TEST(Test_mm_sub_pd); - -    UNIT_TEST(Test_mm_mul_ps); +  +    UNIT_TEST(Test_mm_mul_ps);       UNIT_TEST(Test_mm_mul_pd); -    UNIT_TEST(Test_mm_div_ps); +    UNIT_TEST(Test_mm_div_ps);       UNIT_TEST(Test_mm_div_pd); -    UNIT_TEST(Test_mm_max_ps); -    UNIT_TEST(Test_mm_min_ps); -    UNIT_TEST(Test_mm_and_ps); - -    UNIT_TEST(Test_mm_unpacklo_epi8); -    UNIT_TEST(Test_mm_unpackhi_epi8); -    UNIT_TEST(Test_mm_unpacklo_epi16); -    UNIT_TEST(Test_mm_unpackhi_epi16); -    UNIT_TEST(Test_mm_unpacklo_epi32); -    UNIT_TEST(Test_mm_unpackhi_epi32); -    UNIT_TEST(Test_mm_unpacklo_epi64); -    UNIT_TEST(Test_mm_unpackhi_epi64); - -    UNIT_TEST(Test_mm_or_si128); -    UNIT_TEST(Test_mm_and_si128); -    UNIT_TEST(Test_mm_andnot_si128); - -    UNIT_TEST(Test_mm_cmpeq_epi8); -    UNIT_TEST(Test_mm_cmpeq_epi16); -    UNIT_TEST(Test_mm_cmpeq_epi32); -    UNIT_TEST(Test_mm_cmpeq_ps); - -    UNIT_TEST(Test_mm_cmpgt_epi8); -    UNIT_TEST(Test_mm_cmpgt_epi16); -    UNIT_TEST(Test_mm_cmpgt_epi32); -    UNIT_TEST(Test_mm_cmpgt_ps); - -    UNIT_TEST(Test_mm_cmplt_epi8); -    UNIT_TEST(Test_mm_cmplt_epi16); -    UNIT_TEST(Test_mm_cmplt_epi32); - -    UNIT_TEST(Test_mm_set1_epi8); -    UNIT_TEST(Test_mm_set1_epi16); -    UNIT_TEST(Test_mm_set1_epi32); -    UNIT_TEST(Test_mm_set1_ps); +    UNIT_TEST(Test_mm_max_ps);  +    UNIT_TEST(Test_mm_min_ps);  +    UNIT_TEST(Test_mm_and_ps);  +  +    UNIT_TEST(Test_mm_unpacklo_epi8);  +    UNIT_TEST(Test_mm_unpackhi_epi8);  +    UNIT_TEST(Test_mm_unpacklo_epi16);  +    UNIT_TEST(Test_mm_unpackhi_epi16);  +    UNIT_TEST(Test_mm_unpacklo_epi32);  +    UNIT_TEST(Test_mm_unpackhi_epi32);  +    UNIT_TEST(Test_mm_unpacklo_epi64);  +    UNIT_TEST(Test_mm_unpackhi_epi64);  +  +    UNIT_TEST(Test_mm_or_si128);  +    UNIT_TEST(Test_mm_and_si128);  +    UNIT_TEST(Test_mm_andnot_si128);  +  +    UNIT_TEST(Test_mm_cmpeq_epi8);  +    UNIT_TEST(Test_mm_cmpeq_epi16);  +    UNIT_TEST(Test_mm_cmpeq_epi32);  +    UNIT_TEST(Test_mm_cmpeq_ps);  +  +    UNIT_TEST(Test_mm_cmpgt_epi8);  +    UNIT_TEST(Test_mm_cmpgt_epi16);  +    UNIT_TEST(Test_mm_cmpgt_epi32);  +    UNIT_TEST(Test_mm_cmpgt_ps);  +  +    UNIT_TEST(Test_mm_cmplt_epi8);  +    UNIT_TEST(Test_mm_cmplt_epi16);  +    UNIT_TEST(Test_mm_cmplt_epi32);  +  +    UNIT_TEST(Test_mm_set1_epi8);  +    UNIT_TEST(Test_mm_set1_epi16);  +    UNIT_TEST(Test_mm_set1_epi32);  +    UNIT_TEST(Test_mm_set1_ps);       UNIT_TEST(Test_mm_set_ps1); - -    UNIT_TEST(Test_mm_setzero_si128); -    UNIT_TEST(Test_mm_setzero_ps); +  +    UNIT_TEST(Test_mm_setzero_si128);  +    UNIT_TEST(Test_mm_setzero_ps);       UNIT_TEST(Test_mm_setzero_pd); - -    UNIT_TEST(Test_mm_storel_epi64); -    UNIT_TEST(Test_mm_loadl_epi64); - +  +    UNIT_TEST(Test_mm_storel_epi64);  +    UNIT_TEST(Test_mm_loadl_epi64);  +       UNIT_TEST(Test_mm_loadl_pd);      UNIT_TEST(Test_mm_loadh_pd);      UNIT_TEST(Test_mm_cvtsd_f64); -    UNIT_TEST(Test_mm_shuffle_epi32); -    UNIT_TEST(Test_mm_movemask_epi8); -    UNIT_TEST(Test_mm_cvtsi128_si32); +    UNIT_TEST(Test_mm_shuffle_epi32);  +    UNIT_TEST(Test_mm_movemask_epi8);  +    UNIT_TEST(Test_mm_cvtsi128_si32);       UNIT_TEST(Test_mm_cvtsi128_si64); - -    UNIT_TEST(Test_mm_set_epi16); -    UNIT_TEST(Test_mm_set_epi32); -    UNIT_TEST(Test_mm_set_ps); +  +    UNIT_TEST(Test_mm_set_epi16);  +    UNIT_TEST(Test_mm_set_epi32);  +    UNIT_TEST(Test_mm_set_ps);       UNIT_TEST(Test_mm_set_pd); - -    UNIT_TEST(Test_mm_cvtsi32_si128); +  +    UNIT_TEST(Test_mm_cvtsi32_si128);       UNIT_TEST(Test_mm_cvtsi64_si128); - -    UNIT_TEST(Test_mm_packs_epi16); -    UNIT_TEST(Test_mm_packs_epi32); -    UNIT_TEST(Test_mm_packus_epi16); - -    UNIT_TEST(Test_mm_extract_epi16); +  +    UNIT_TEST(Test_mm_packs_epi16);  +    UNIT_TEST(Test_mm_packs_epi32);  +    UNIT_TEST(Test_mm_packus_epi16);  +  +    UNIT_TEST(Test_mm_extract_epi16);       UNIT_TEST(Test_mm_extract_epi8);      UNIT_TEST(Test_mm_extract_epi32);      UNIT_TEST(Test_mm_extract_epi64); - -    UNIT_TEST(Test_MM_TRANSPOSE4_PS); -    UNIT_TEST(Test_mm_movemask_ps); +  +    UNIT_TEST(Test_MM_TRANSPOSE4_PS);  +    UNIT_TEST(Test_mm_movemask_ps);       UNIT_TEST(Test_mm_movemask_ps_2); - -    UNIT_TEST(Test_mm_cvtepi32_ps); -    UNIT_TEST(Test_mm_cvtps_epi32); -    UNIT_TEST(Test_mm_cvttps_epi32); - -    UNIT_TEST(Test_mm_castsi128_ps); -    UNIT_TEST(Test_mm_castps_si128); - -    UNIT_TEST(Test_mm_mul_epu32); - +  +    UNIT_TEST(Test_mm_cvtepi32_ps);  +    UNIT_TEST(Test_mm_cvtps_epi32);  +    UNIT_TEST(Test_mm_cvttps_epi32);  +  +    UNIT_TEST(Test_mm_castsi128_ps);  +    UNIT_TEST(Test_mm_castps_si128);  +  +    UNIT_TEST(Test_mm_mul_epu32);  +       UNIT_TEST(Test_mm_cmpunord_ps);      UNIT_TEST(Test_mm_andnot_ps);      UNIT_TEST(Test_mm_shuffle_ps); @@ -310,36 +310,36 @@ private:      UNIT_TEST(Test_mm_rsqrt_ps);      UNIT_TEST(Test_matrixnet_powerpc); -    UNIT_TEST_SUITE_END(); - -public: -    void Test_mm_load_si128(); -    void Test_mm_loadu_si128(); +    UNIT_TEST_SUITE_END();  +  +public:  +    void Test_mm_load_si128();  +    void Test_mm_loadu_si128();       void Test_mm_storeu_si128();      void Test_mm_loadu_si128_2();      void Test_mm_loadu_ps();      void Test_mm_storeu_ps(); - -    template <typename TElem, int bits, int elemCount, +  +    template <typename TElem, int bits, int elemCount,                 typename TFunc, typename TShifter, typename TOp, typename TElemFunc> -    void Test_mm_shifter_epiXX(); - +    void Test_mm_shifter_epiXX();  +       enum class EDirection {          Left,          Right      }; - +       struct TShiftRes {          __m128i Value[17];      };      void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo); -    void Test_mm_slli_epi16(); -    void Test_mm_slli_epi32(); -    void Test_mm_slli_epi64(); +    void Test_mm_slli_epi16();  +    void Test_mm_slli_epi32();  +    void Test_mm_slli_epi64();       void Test_mm_slli_si128(); - +       void Test_mm_srli_epi16();      void Test_mm_srli_epi32();      void Test_mm_srli_epi64(); @@ -356,134 +356,134 @@ public:      void Test_mm_srl_epi32();      void Test_mm_srl_epi64(); -    void Test_mm_add_epi8(); -    void Test_mm_add_epi16(); -    void Test_mm_add_epi32(); -    void Test_mm_add_epi64(); -    void Test_mm_add_ps(); +    void Test_mm_add_epi8();  +    void Test_mm_add_epi16();  +    void Test_mm_add_epi32();  +    void Test_mm_add_epi64();  +    void Test_mm_add_ps();       void Test_mm_add_pd(); - +       void Test_mm_madd_epi16(); -    void Test_mm_sub_epi8(); -    void Test_mm_sub_epi16(); -    void Test_mm_sub_epi32(); -    void Test_mm_sub_epi64(); -    void Test_mm_sub_ps(); +    void Test_mm_sub_epi8();  +    void Test_mm_sub_epi16();  +    void Test_mm_sub_epi32();  +    void Test_mm_sub_epi64();  +    void Test_mm_sub_ps();       void Test_mm_sub_pd(); - -    void Test_mm_mul_ps(); +  +    void Test_mm_mul_ps();       void Test_mm_mul_pd(); -    void Test_mm_div_ps(); +    void Test_mm_div_ps();       void Test_mm_div_pd(); -    void Test_mm_max_ps(); -    void Test_mm_min_ps(); -    void Test_mm_and_ps(); - -    template <typename TElem, int bits, int elemCount, int shift, -              typename TFunc, typename TOp> -    void Test_mm_unpack_epiXX(); -    void Test_mm_unpacklo_epi8(); -    void Test_mm_unpackhi_epi8(); -    void Test_mm_unpacklo_epi16(); -    void Test_mm_unpackhi_epi16(); -    void Test_mm_unpacklo_epi32(); -    void Test_mm_unpackhi_epi32(); -    void Test_mm_unpacklo_epi64(); -    void Test_mm_unpackhi_epi64(); - -    template <typename TElem, unsigned elemCount, -              typename TFunc, typename TElemFunc, +    void Test_mm_max_ps();  +    void Test_mm_min_ps();  +    void Test_mm_and_ps();  +  +    template <typename TElem, int bits, int elemCount, int shift,  +              typename TFunc, typename TOp>  +    void Test_mm_unpack_epiXX();  +    void Test_mm_unpacklo_epi8();  +    void Test_mm_unpackhi_epi8();  +    void Test_mm_unpacklo_epi16();  +    void Test_mm_unpackhi_epi16();  +    void Test_mm_unpacklo_epi32();  +    void Test_mm_unpackhi_epi32();  +    void Test_mm_unpacklo_epi64();  +    void Test_mm_unpackhi_epi64();  +  +    template <typename TElem, unsigned elemCount,  +              typename TFunc, typename TElemFunc,                 typename TOp, typename TVectorType = __m128i> -    void Test_mm_dualop(); - -    template <typename TElem, unsigned elemCount, -              typename TFunc, typename TElemFunc, +    void Test_mm_dualop();  +  +    template <typename TElem, unsigned elemCount,  +              typename TFunc, typename TElemFunc,                 typename TOp, typename TVectorType = __m128i> -    void Test_mm_dualcmp(); - -    void Test_mm_or_si128(); -    void Test_mm_and_si128(); -    void Test_mm_andnot_si128(); - -    void Test_mm_cmpeq_epi8(); -    void Test_mm_cmpeq_epi16(); -    void Test_mm_cmpeq_epi32(); -    void Test_mm_cmpeq_ps(); - -    void Test_mm_cmpgt_epi8(); -    void Test_mm_cmpgt_epi16(); -    void Test_mm_cmpgt_epi32(); -    void Test_mm_cmpgt_ps(); - -    void Test_mm_cmplt_epi8(); -    void Test_mm_cmplt_epi16(); -    void Test_mm_cmplt_epi32(); - -    template <typename TElem, int elemCount, +    void Test_mm_dualcmp();  +  +    void Test_mm_or_si128();  +    void Test_mm_and_si128();  +    void Test_mm_andnot_si128();  +  +    void Test_mm_cmpeq_epi8();  +    void Test_mm_cmpeq_epi16();  +    void Test_mm_cmpeq_epi32();  +    void Test_mm_cmpeq_ps();  +  +    void Test_mm_cmpgt_epi8();  +    void Test_mm_cmpgt_epi16();  +    void Test_mm_cmpgt_epi32();  +    void Test_mm_cmpgt_ps();  +  +    void Test_mm_cmplt_epi8();  +    void Test_mm_cmplt_epi16();  +    void Test_mm_cmplt_epi32();  +  +    template <typename TElem, int elemCount,                 typename TFunc, typename TOp, typename TVectorType> -    void Test_mm_setter_epiXX(); -    void Test_mm_set1_epi8(); -    void Test_mm_set1_epi16(); -    void Test_mm_set1_epi32(); -    void Test_mm_set1_ps(); +    void Test_mm_setter_epiXX();  +    void Test_mm_set1_epi8();  +    void Test_mm_set1_epi16();  +    void Test_mm_set1_epi32();  +    void Test_mm_set1_ps();       void Test_mm_set_ps1(); - -    void Test_mm_setzero_si128(); -    void Test_mm_setzero_ps(); +  +    void Test_mm_setzero_si128();  +    void Test_mm_setzero_ps();       void Test_mm_setzero_pd(); - -    void Test_mm_loadl_epi64(); -    void Test_mm_storel_epi64(); - +  +    void Test_mm_loadl_epi64();  +    void Test_mm_storel_epi64();  +       void Test_mm_loadl_pd();      void Test_mm_loadh_pd();      void Test_mm_cvtsd_f64(); -    void Test_mm_shuffle_epi32(); -    void Test_mm_movemask_epi8(); -    void Test_mm_cvtsi128_si32(); +    void Test_mm_shuffle_epi32();  +    void Test_mm_movemask_epi8();  +    void Test_mm_cvtsi128_si32();       void Test_mm_cvtsi128_si64(); - -    void Test_mm_set_epi16(); -    void Test_mm_set_epi32(); -    void Test_mm_set_ps(); +  +    void Test_mm_set_epi16();  +    void Test_mm_set_epi32();  +    void Test_mm_set_ps();       void Test_mm_set_pd(); - -    void Test_mm_cvtsi32_si128(); +  +    void Test_mm_cvtsi32_si128();       void Test_mm_cvtsi64_si128(); - -    template <typename TElem, typename TNarrow, unsigned elemCount, -              typename TFunc> -    void Test_mm_packs_epiXX(); -    void Test_mm_packs_epi16(); -    void Test_mm_packs_epi32(); -    void Test_mm_packus_epi16(); - -    void Test_mm_extract_epi16(); +  +    template <typename TElem, typename TNarrow, unsigned elemCount,  +              typename TFunc>  +    void Test_mm_packs_epiXX();  +    void Test_mm_packs_epi16();  +    void Test_mm_packs_epi32();  +    void Test_mm_packus_epi16();  +  +    void Test_mm_extract_epi16();       void Test_mm_extract_epi8();      void Test_mm_extract_epi32();      void Test_mm_extract_epi64(); - -    void Test_MM_TRANSPOSE4_PS(); -    void Test_mm_movemask_ps(); +  +    void Test_MM_TRANSPOSE4_PS();  +    void Test_mm_movemask_ps();       void Test_mm_movemask_ps_2(); - -    template <typename TFrom, typename TTo, unsigned elemCount, -              typename TLoadVector, typename TResultVector, -              typename TElemFunc, typename TFunc, typename TOp> -    void Test_mm_convertop(); -    void Test_mm_cvtepi32_ps(); -    void Test_mm_cvtps_epi32(); -    void Test_mm_cvttps_epi32(); - -    template <typename TLoadVector, typename TCastVector, -              typename TFunc, TFunc* func> -    void Test_mm_castXX(); -    void Test_mm_castsi128_ps(); -    void Test_mm_castps_si128(); - -    void Test_mm_mul_epu32(); +  +    template <typename TFrom, typename TTo, unsigned elemCount,  +              typename TLoadVector, typename TResultVector,  +              typename TElemFunc, typename TFunc, typename TOp>  +    void Test_mm_convertop();  +    void Test_mm_cvtepi32_ps();  +    void Test_mm_cvtps_epi32();  +    void Test_mm_cvttps_epi32();  +  +    template <typename TLoadVector, typename TCastVector,  +              typename TFunc, TFunc* func>  +    void Test_mm_castXX();  +    void Test_mm_castsi128_ps();  +    void Test_mm_castps_si128();  +  +    void Test_mm_mul_epu32();       void Test_mm_cmpunord_ps();      void Test_mm_store_ss(); @@ -497,30 +497,30 @@ public:      void Test_mm_rsqrt_ps();      void Test_mm_rsqrt_ss();      void Test_matrixnet_powerpc(); -}; - -UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest); - -void TSSEEmulTest::Test_mm_load_si128() { +};  +  +UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest);  +  +void TSSEEmulTest::Test_mm_load_si128() {       alignas(16) char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    __m128i value = _mm_load_si128((__m128i*)&data); -    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL); -    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL); -} - -void TSSEEmulTest::Test_mm_loadu_si128() { +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    __m128i value = _mm_load_si128((__m128i*)&data);  +    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL);  +    UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL);  +}  +  +void TSSEEmulTest::Test_mm_loadu_si128() {       alignas(16) char data[17] = { -        '\x66', -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1); -    __m128i value = _mm_loadu_si128((__m128i*)&data[1]); -    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL); -    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL); -} - +        '\x66',  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1);  +    __m128i value = _mm_loadu_si128((__m128i*)&data[1]);  +    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL);  +    UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL);  +}  +   void TSSEEmulTest::Test_mm_storeu_si128() {      alignas(16) unsigned char stub[32] = {          0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -640,32 +640,32 @@ unsigned MakeNumber<unsigned>(unsigned number) {      return number;  } -template <typename TElem, int bits, int elemCount, +template <typename TElem, int bits, int elemCount,             typename TFunc, typename TShifter, typename TOp, typename TElemFunc> -void TSSEEmulTest::Test_mm_shifter_epiXX() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    TElem* dataw = reinterpret_cast<TElem*>(&data); - -    __m128i value = _mm_loadu_si128((__m128i*)&data); - +void TSSEEmulTest::Test_mm_shifter_epiXX() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    TElem* dataw = reinterpret_cast<TElem*>(&data);  +  +    __m128i value = _mm_loadu_si128((__m128i*)&data);  +       for (unsigned shifter = 0; shifter <= bits; ++shifter) { -        TElem shiftedData[elemCount]; +        TElem shiftedData[elemCount];           for (unsigned i = 0; i < elemCount; ++i) { -            shiftedData[i] = TElemFunc::Call(dataw[i], shifter); +            shiftedData[i] = TElemFunc::Call(dataw[i], shifter);           } - +           const TShifter adhoc_shifter = MakeNumber<TShifter>(shifter);          __m128i result = TFunc(value, adhoc_shifter);          for (unsigned i = 0; i < elemCount; ++i) { -            UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]); +            UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]);           } -    } -} - +    }  +}  +   void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) {      const char data[48] = { @@ -713,52 +713,52 @@ struct THelperASHR {      }  }; -template <typename TElem> -struct THelperSHR { -    static TElem Call(const TElem op, const int shift) { +template <typename TElem>  +struct THelperSHR {  +    static TElem Call(const TElem op, const int shift) {           constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;          return shift < nBitsInOp ? op >> shift : 0; -    } -}; - -void TSSEEmulTest::Test_mm_srli_epi16() { +    }  +};  +  +void TSSEEmulTest::Test_mm_srli_epi16() {       Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_srli_epi16), unsigned, uint16x8_t, -                          THelperSHR<ui16>>(); -} - -void TSSEEmulTest::Test_mm_srli_epi32() { +                          THelperSHR<ui16>>();  +}  +  +void TSSEEmulTest::Test_mm_srli_epi32() {       Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_srli_epi32), unsigned, uint32x4_t, -                          THelperSHR<ui32>>(); -} - -void TSSEEmulTest::Test_mm_srli_epi64() { +                          THelperSHR<ui32>>();  +}  +  +void TSSEEmulTest::Test_mm_srli_epi64() {       Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_srli_epi64), unsigned, uint64x2_t, -                          THelperSHR<ui64>>(); -} - -template <typename TElem> -struct THelperSHL { -    static TElem Call(const TElem op, const int shift) { +                          THelperSHR<ui64>>();  +}  +  +template <typename TElem>  +struct THelperSHL {  +    static TElem Call(const TElem op, const int shift) {           constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;          return shift < nBitsInOp ? op << shift : 0; -    } -}; - -void TSSEEmulTest::Test_mm_slli_epi16() { +    }  +};  +  +void TSSEEmulTest::Test_mm_slli_epi16() {       Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_slli_epi16), unsigned, uint16x8_t, -                          THelperSHL<ui16>>(); -} - -void TSSEEmulTest::Test_mm_slli_epi32() { +                          THelperSHL<ui16>>();  +}  +  +void TSSEEmulTest::Test_mm_slli_epi32() {       Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_slli_epi32), unsigned, uint32x4_t, -                          THelperSHL<ui32>>(); -} - -void TSSEEmulTest::Test_mm_slli_epi64() { +                          THelperSHL<ui32>>();  +}  +  +void TSSEEmulTest::Test_mm_slli_epi64() {       Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_slli_epi64), unsigned, uint64x2_t, -                          THelperSHL<ui64>>(); -} - +                          THelperSHL<ui64>>();  +}  +   void TSSEEmulTest::Test_mm_slli_si128() {      Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes {          TShiftRes res; @@ -849,30 +849,30 @@ void TSSEEmulTest::Test_mm_sll_epi64() {                            THelperSHL<ui64>>();  } -template <typename TElem> -struct THelperAdd { -    static TElem Call(const TElem op1, const TElem op2) { -        return op1 + op2; -    } -}; - -void TSSEEmulTest::Test_mm_add_epi16() { -    Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_add_epi32() { -    Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_add_epi64() { -    Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_add_ps() { -    Test_mm_dualop<float, 2, WrapF(_mm_add_ps), -                   THelperAdd<float>, float32x4_t, __m128>(); -} - +template <typename TElem>  +struct THelperAdd {  +    static TElem Call(const TElem op1, const TElem op2) {  +        return op1 + op2;  +    }  +};  +  +void TSSEEmulTest::Test_mm_add_epi16() {  +    Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_add_epi32() {  +    Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>();  +}  +  +void TSSEEmulTest::Test_mm_add_epi64() {  +    Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>();  +}  +  +void TSSEEmulTest::Test_mm_add_ps() {  +    Test_mm_dualop<float, 2, WrapF(_mm_add_ps),  +                   THelperAdd<float>, float32x4_t, __m128>();  +}  +   void TSSEEmulTest::Test_mm_add_pd() {      Test_mm_dualop<double, 2, WrapD(_mm_add_pd),                     THelperAdd<double>, float64x2_t, __m128d>(); @@ -904,44 +904,44 @@ void TSSEEmulTest::Test_mm_madd_epi16() {  } -template <typename TElem> -struct THelperSub { -    static TElem Call(const TElem op1, const TElem op2) { -        return op1 - op2; -    } -}; - -void TSSEEmulTest::Test_mm_sub_epi16() { -    Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_sub_epi32() { -    Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_sub_epi64() { -    Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_sub_ps() { -    Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>, -                   float32x4_t, __m128>(); -} - +template <typename TElem>  +struct THelperSub {  +    static TElem Call(const TElem op1, const TElem op2) {  +        return op1 - op2;  +    }  +};  +  +void TSSEEmulTest::Test_mm_sub_epi16() {  +    Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_sub_epi32() {  +    Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>();  +}  +  +void TSSEEmulTest::Test_mm_sub_epi64() {  +    Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>();  +}  +  +void TSSEEmulTest::Test_mm_sub_ps() {  +    Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>,  +                   float32x4_t, __m128>();  +}  +   void TSSEEmulTest::Test_mm_sub_pd() {      Test_mm_dualop<double, 2, WrapD(_mm_sub_pd), THelperSub<double>,                     float64x2_t, __m128d>();  } -void TSSEEmulTest::Test_mm_mul_ps() { -    struct THelper { -        static float Call(const float op1, const float op2) { -            return op1 * op2; -        } -    }; -    Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>(); -} - +void TSSEEmulTest::Test_mm_mul_ps() {  +    struct THelper {  +        static float Call(const float op1, const float op2) {  +            return op1 * op2;  +        }  +    };  +    Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>();  +}  +   void TSSEEmulTest::Test_mm_mul_pd() {      struct THelper {          static double Call(const double op1, const double op2) { @@ -951,15 +951,15 @@ void TSSEEmulTest::Test_mm_mul_pd() {      Test_mm_dualop<double, 2, WrapD(_mm_mul_pd), THelper, float64x2_t, __m128d>();  } -void TSSEEmulTest::Test_mm_div_ps() { -    struct THelper { -        static float Call(const float op1, const float op2) { -            return op1 / op2; -        } -    }; -    Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>(); -} - +void TSSEEmulTest::Test_mm_div_ps() {  +    struct THelper {  +        static float Call(const float op1, const float op2) {  +            return op1 / op2;  +        }  +    };  +    Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>();  +}  +   void TSSEEmulTest::Test_mm_div_pd() {      struct THelper {          static double Call(const double op1, const double op2) { @@ -969,441 +969,441 @@ void TSSEEmulTest::Test_mm_div_pd() {      Test_mm_dualop<double, 2, WrapD(_mm_div_pd), THelper, float64x2_t, __m128d>();  } -void TSSEEmulTest::Test_mm_max_ps() { -    struct THelper { -        static float Call(const float op1, const float op2) { -            return std::max(op1, op2); -        } -    }; -    Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>(); -} - -void TSSEEmulTest::Test_mm_min_ps() { -    struct THelper { -        static float Call(const float op1, const float op2) { -            return std::min(op1, op2); -        } -    }; -    Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>(); -} - -void TSSEEmulTest::Test_mm_and_ps() { -    struct THelper { -        static float Call(const float op1, const float op2) { -            union Cast { -                unsigned int AsUInt; -                float AsFloat; -            }; -            Cast v1, v2, result; -            v1.AsFloat = op1; -            v2.AsFloat = op2; -            result.AsUInt = v1.AsUInt & v2.AsUInt; -            return result.AsFloat; -        } -    }; -    Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps), -                    THelper, float32x4_t, __m128>(); -} - -template <typename TElem, int bits, int elemCount, int shift, -          typename TFunc, typename TOp> -void TSSEEmulTest::Test_mm_unpack_epiXX() { -    char data1[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    char data2[16] = { -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; -    TElem* dataw1 = reinterpret_cast<TElem*>(&data1); -    TElem* dataw2 = reinterpret_cast<TElem*>(&data2); - -    __m128i value1 = _mm_loadu_si128((__m128i*)&data1); -    __m128i value2 = _mm_loadu_si128((__m128i*)&data2); - -    TElem zippedData[elemCount]; -    for (unsigned i = 0; i < elemCount / 2; ++i) { -        zippedData[i * 2] = dataw1[i + shift]; -        zippedData[i * 2 + 1] = dataw2[i + shift]; -    } -    __m128i result = TFunc(value1, value2); - -    for (unsigned i = 0; i < elemCount / 2; ++i) { -        UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]); -        UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1], -                          TQType<TOp>::As(result)[i * 2 + 1]); -    } -} - -void TSSEEmulTest::Test_mm_unpacklo_epi8() { -    Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi8() { -    Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>(); -} - -void TSSEEmulTest::Test_mm_unpacklo_epi16() { -    Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi16() { -    Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_unpacklo_epi32() { -    Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi32() { -    Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_unpacklo_epi64() { -    Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_unpackhi_epi64() { -    Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>(); -} - -template <typename TElem, unsigned elemCount, -          typename TFunc, typename TElemFunc, +void TSSEEmulTest::Test_mm_max_ps() {  +    struct THelper {  +        static float Call(const float op1, const float op2) {  +            return std::max(op1, op2);  +        }  +    };  +    Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>();  +}  +  +void TSSEEmulTest::Test_mm_min_ps() {  +    struct THelper {  +        static float Call(const float op1, const float op2) {  +            return std::min(op1, op2);  +        }  +    };  +    Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>();  +}  +  +void TSSEEmulTest::Test_mm_and_ps() {  +    struct THelper {  +        static float Call(const float op1, const float op2) {  +            union Cast {  +                unsigned int AsUInt;  +                float AsFloat;  +            };  +            Cast v1, v2, result;  +            v1.AsFloat = op1;  +            v2.AsFloat = op2;  +            result.AsUInt = v1.AsUInt & v2.AsUInt;  +            return result.AsFloat;  +        }  +    };  +    Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps),  +                    THelper, float32x4_t, __m128>();  +}  +  +template <typename TElem, int bits, int elemCount, int shift,  +          typename TFunc, typename TOp>  +void TSSEEmulTest::Test_mm_unpack_epiXX() {  +    char data1[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    char data2[16] = {  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};  +    TElem* dataw1 = reinterpret_cast<TElem*>(&data1);  +    TElem* dataw2 = reinterpret_cast<TElem*>(&data2);  +  +    __m128i value1 = _mm_loadu_si128((__m128i*)&data1);  +    __m128i value2 = _mm_loadu_si128((__m128i*)&data2);  +  +    TElem zippedData[elemCount];  +    for (unsigned i = 0; i < elemCount / 2; ++i) {  +        zippedData[i * 2] = dataw1[i + shift];  +        zippedData[i * 2 + 1] = dataw2[i + shift];  +    }  +    __m128i result = TFunc(value1, value2);  +  +    for (unsigned i = 0; i < elemCount / 2; ++i) {  +        UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]);  +        UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1],  +                          TQType<TOp>::As(result)[i * 2 + 1]);  +    }  +}  +  +void TSSEEmulTest::Test_mm_unpacklo_epi8() {  +    Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpackhi_epi8() {  +    Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpacklo_epi16() {  +    Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpackhi_epi16() {  +    Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpacklo_epi32() {  +    Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpackhi_epi32() {  +    Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpacklo_epi64() {  +    Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>();  +}  +  +void TSSEEmulTest::Test_mm_unpackhi_epi64() {  +    Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>();  +}  +  +template <typename TElem, unsigned elemCount,  +          typename TFunc, typename TElemFunc,             typename TOp, typename TVectorType> -void TSSEEmulTest::Test_mm_dualop() { -    char data1[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    char data2[16] = { -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; -    TElem* dataw1 = reinterpret_cast<TElem*>(&data1); -    TElem* dataw2 = reinterpret_cast<TElem*>(&data2); - +void TSSEEmulTest::Test_mm_dualop() {  +    char data1[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    char data2[16] = {  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};  +    TElem* dataw1 = reinterpret_cast<TElem*>(&data1);  +    TElem* dataw2 = reinterpret_cast<TElem*>(&data2);  +       TVectorType value1 = TFuncLoad<TVectorType>(&data1);      TVectorType value2 = TFuncLoad<TVectorType>(&data2); - -    TElem procData[elemCount]; -    for (unsigned i = 0; i < elemCount; ++i) { -        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); -    } +  +    TElem procData[elemCount];  +    for (unsigned i = 0; i < elemCount; ++i) {  +        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);  +    }       TVectorType result = TFunc(value1, value2); - -    for (unsigned i = 0; i < elemCount; ++i) { -        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); -    } -} - -/* This is almost the same as Test_mm_dualop, -   but different data1 and data2 */ -template <typename TElem, unsigned elemCount, -          typename TFunc, typename TElemFunc, +  +    for (unsigned i = 0; i < elemCount; ++i) {  +        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);  +    }  +}  +  +/* This is almost the same as Test_mm_dualop,  +   but different data1 and data2 */  +template <typename TElem, unsigned elemCount,  +          typename TFunc, typename TElemFunc,             typename TOp, typename TVectorType> -void TSSEEmulTest::Test_mm_dualcmp() { -    char data1[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'}; -    char data2[16] = { -        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; -    TElem* dataw1 = reinterpret_cast<TElem*>(&data1); -    TElem* dataw2 = reinterpret_cast<TElem*>(&data2); - +void TSSEEmulTest::Test_mm_dualcmp() {  +    char data1[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'};  +    char data2[16] = {  +        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};  +    TElem* dataw1 = reinterpret_cast<TElem*>(&data1);  +    TElem* dataw2 = reinterpret_cast<TElem*>(&data2);  +       TVectorType value1 = TFuncLoad<TVectorType>(&data1);      TVectorType value2 = TFuncLoad<TVectorType>(&data2); - -    TElem procData[elemCount]; -    for (unsigned i = 0; i < elemCount; ++i) { -        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]); -    } +  +    TElem procData[elemCount];  +    for (unsigned i = 0; i < elemCount; ++i) {  +        procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);  +    }       TVectorType result = TFunc(value1, value2); - -    for (unsigned i = 0; i < elemCount; ++i) { -        /* memcmp is for compare to invalid floats in results */ +  +    for (unsigned i = 0; i < elemCount; ++i) {  +        /* memcmp is for compare to invalid floats in results */           const TElem value = TQType<TOp>::As(result)[i];          UNIT_ASSERT(memcmp(&(procData[i]), &value, sizeof(TElem)) == 0); -    } -} - -void TSSEEmulTest::Test_mm_or_si128() { -    struct THelper { -        static ui64 Call(const ui64 op1, const ui64 op2) { -            return op1 | op2; -        } -    }; - -    Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_and_si128() { -    struct THelper { -        static ui64 Call(const ui64 op1, const ui64 op2) { -            return op1 & op2; -        } -    }; - -    Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>(); -} - -void TSSEEmulTest::Test_mm_andnot_si128() { -    struct THelper { -        static ui64 Call(const ui64 op1, const ui64 op2) { -            return (~op1) & op2; -        } -    }; - -    Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>(); -} - -template <typename TElem> -struct THelperCMPEQ { -    static TElem Call(const TElem op1, const TElem op2) { -        return op1 == op2 ? ~TElem(0) : TElem(0); -    } -}; - -void TSSEEmulTest::Test_mm_cmpeq_epi8() { -    Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8), -                    THelperCMPEQ<ui8>, uint8x16_t>(); -} - -void TSSEEmulTest::Test_mm_cmpeq_epi16() { -    Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16), -                    THelperCMPEQ<ui16>, uint16x8_t>(); -} - -void TSSEEmulTest::Test_mm_cmpeq_epi32() { -    Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32), -                    THelperCMPEQ<ui32>, uint32x4_t>(); -} - -void TSSEEmulTest::Test_mm_cmpeq_ps() { -    struct THelperFloat { -        static float Call(const float op1, const float op2) { -            union Cast { -                unsigned int AsUInt; -                float AsFloat; -            }; -            Cast value; -            value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0; -            return value.AsFloat; -        } -    }; - -    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps), -                    THelperFloat, float32x4_t, __m128>(); -} - -template <typename TElem> -struct THelperCMPGT { -    static TElem Call(const TElem op1, const TElem op2) { -        return op1 > op2 ? ~TElem(0) : TElem(0); -    } -}; - -void TSSEEmulTest::Test_mm_cmpgt_epi8() { -    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8), -                    THelperCMPGT<i8>, int8x16_t>(); -} - -void TSSEEmulTest::Test_mm_cmpgt_epi16() { -    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16), -                    THelperCMPGT<i16>, int16x8_t>(); -} - -void TSSEEmulTest::Test_mm_cmpgt_epi32() { -    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32), -                    THelperCMPGT<i32>, int32x4_t>(); -} - -void TSSEEmulTest::Test_mm_cmpgt_ps() { -    struct THelperFloat { -        static float Call(const float op1, const float op2) { -            union Cast { -                unsigned int AsUInt; -                float AsFloat; -            }; -            Cast value; -            value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0; -            return value.AsFloat; -        } -    }; - -    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps), -                    THelperFloat, float32x4_t, __m128>(); -} - -template <typename TElem> -struct THelperCMPLT { -    static TElem Call(const TElem op1, const TElem op2) { -        return op1 < op2 ? ~TElem(0) : TElem(0); -    } -}; - -void TSSEEmulTest::Test_mm_cmplt_epi8() { -    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8), -                    THelperCMPLT<i8>, int8x16_t>(); -} - -void TSSEEmulTest::Test_mm_cmplt_epi16() { -    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16), -                    THelperCMPLT<i16>, int16x8_t>(); -} - -void TSSEEmulTest::Test_mm_cmplt_epi32() { -    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32), -                    THelperCMPLT<i32>, int32x4_t>(); -} - -template <typename TElem, int elemCount, +    }  +}  +  +void TSSEEmulTest::Test_mm_or_si128() {  +    struct THelper {  +        static ui64 Call(const ui64 op1, const ui64 op2) {  +            return op1 | op2;  +        }  +    };  +  +    Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>();  +}  +  +void TSSEEmulTest::Test_mm_and_si128() {  +    struct THelper {  +        static ui64 Call(const ui64 op1, const ui64 op2) {  +            return op1 & op2;  +        }  +    };  +  +    Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>();  +}  +  +void TSSEEmulTest::Test_mm_andnot_si128() {  +    struct THelper {  +        static ui64 Call(const ui64 op1, const ui64 op2) {  +            return (~op1) & op2;  +        }  +    };  +  +    Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>();  +}  +  +template <typename TElem>  +struct THelperCMPEQ {  +    static TElem Call(const TElem op1, const TElem op2) {  +        return op1 == op2 ? ~TElem(0) : TElem(0);  +    }  +};  +  +void TSSEEmulTest::Test_mm_cmpeq_epi8() {  +    Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8),  +                    THelperCMPEQ<ui8>, uint8x16_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmpeq_epi16() {  +    Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16),  +                    THelperCMPEQ<ui16>, uint16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmpeq_epi32() {  +    Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32),  +                    THelperCMPEQ<ui32>, uint32x4_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmpeq_ps() {  +    struct THelperFloat {  +        static float Call(const float op1, const float op2) {  +            union Cast {  +                unsigned int AsUInt;  +                float AsFloat;  +            };  +            Cast value;  +            value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0;  +            return value.AsFloat;  +        }  +    };  +  +    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps),  +                    THelperFloat, float32x4_t, __m128>();  +}  +  +template <typename TElem>  +struct THelperCMPGT {  +    static TElem Call(const TElem op1, const TElem op2) {  +        return op1 > op2 ? ~TElem(0) : TElem(0);  +    }  +};  +  +void TSSEEmulTest::Test_mm_cmpgt_epi8() {  +    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8),  +                    THelperCMPGT<i8>, int8x16_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmpgt_epi16() {  +    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16),  +                    THelperCMPGT<i16>, int16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmpgt_epi32() {  +    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32),  +                    THelperCMPGT<i32>, int32x4_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmpgt_ps() {  +    struct THelperFloat {  +        static float Call(const float op1, const float op2) {  +            union Cast {  +                unsigned int AsUInt;  +                float AsFloat;  +            };  +            Cast value;  +            value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0;  +            return value.AsFloat;  +        }  +    };  +  +    Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps),  +                    THelperFloat, float32x4_t, __m128>();  +}  +  +template <typename TElem>  +struct THelperCMPLT {  +    static TElem Call(const TElem op1, const TElem op2) {  +        return op1 < op2 ? ~TElem(0) : TElem(0);  +    }  +};  +  +void TSSEEmulTest::Test_mm_cmplt_epi8() {  +    Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8),  +                    THelperCMPLT<i8>, int8x16_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmplt_epi16() {  +    Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16),  +                    THelperCMPLT<i16>, int16x8_t>();  +}  +  +void TSSEEmulTest::Test_mm_cmplt_epi32() {  +    Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32),  +                    THelperCMPLT<i32>, int32x4_t>();  +}  +  +template <typename TElem, int elemCount,             typename TFunc, typename TOp, typename TVectorType> -void TSSEEmulTest::Test_mm_setter_epiXX() { -    char data[64] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', -        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; -    TElem* dataw = reinterpret_cast<TElem*>(&data); - -    for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) { +void TSSEEmulTest::Test_mm_setter_epiXX() {  +    char data[64] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',  +        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};  +    TElem* dataw = reinterpret_cast<TElem*>(&data);  +  +    for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) {           TVectorType value = TFunc(dataw[dataItem]); - -        for (unsigned i = 0; i < elemCount; ++i) -            UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]); -    } -} - -void TSSEEmulTest::Test_mm_set1_epi8() { -    Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>(); -} -void TSSEEmulTest::Test_mm_set1_epi16() { -    Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>(); -} -void TSSEEmulTest::Test_mm_set1_epi32() { -    Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>(); -} -void TSSEEmulTest::Test_mm_set1_ps() { -    Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>(); -} - +  +        for (unsigned i = 0; i < elemCount; ++i)  +            UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]);  +    }  +}  +  +void TSSEEmulTest::Test_mm_set1_epi8() {  +    Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>();  +}  +void TSSEEmulTest::Test_mm_set1_epi16() {  +    Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>();  +}  +void TSSEEmulTest::Test_mm_set1_epi32() {  +    Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>();  +}  +void TSSEEmulTest::Test_mm_set1_ps() {  +    Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>();  +}  +   void TSSEEmulTest::Test_mm_set_ps1() {      Test_mm_setter_epiXX<float, 4, WrapF(_mm_set_ps1), float32x4_t, __m128>();  } -void TSSEEmulTest::Test_mm_setzero_si128() { -    __m128i value = _mm_setzero_si128(); -    for (unsigned i = 0; i < 4; ++i) -        UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]); -} - -void TSSEEmulTest::Test_mm_setzero_ps() { -    __m128 value = _mm_setzero_ps(); -    for (unsigned i = 0; i < 4; ++i) -        UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]); -} - +void TSSEEmulTest::Test_mm_setzero_si128() {  +    __m128i value = _mm_setzero_si128();  +    for (unsigned i = 0; i < 4; ++i)  +        UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]);  +}  +  +void TSSEEmulTest::Test_mm_setzero_ps() {  +    __m128 value = _mm_setzero_ps();  +    for (unsigned i = 0; i < 4; ++i)  +        UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]);  +}  +   void TSSEEmulTest::Test_mm_setzero_pd() {      __m128d value = _mm_setzero_pd();      for (unsigned i = 0; i < 2; ++i)          UNIT_ASSERT_EQUAL(0.0, TQType<float64x2_t>::As(value)[i]);  } -void TSSEEmulTest::Test_mm_loadl_epi64() { -    char data[64] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', -        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; -    ui64* dataw = reinterpret_cast<ui64*>(&data); - -    for (unsigned dataItem = 0; dataItem < 8; ++dataItem) { -        __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]); - -        UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]); -        UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]); -    } -} - -void TSSEEmulTest::Test_mm_storel_epi64() { -    char data[64] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C', -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF', -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C', -        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44', -        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'}; -    ui64* dataw = reinterpret_cast<ui64*>(&data); - -    for (unsigned dataItem = 0; dataItem < 4; ++dataItem) { -        __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]); - -        ui64 buf[2] = {55, 81}; -        _mm_storel_epi64((__m128i*)&buf, value); - -        UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]); -        UNIT_ASSERT_EQUAL(81, buf[1]); -    } -} - -void TSSEEmulTest::Test_mm_shuffle_epi32() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    ui32* dataw = reinterpret_cast<ui32*>(&data); -    __m128i value = _mm_loadu_si128((__m128i*)&data); - -    int coding[4] = {1, 3, 0, 2}; -    __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1)); - -    for (unsigned i = 0; i < 4; ++i) -        UNIT_ASSERT_EQUAL(dataw[coding[i]], -                          TQType<uint32x4_t>::As(result)[i]); -} - -static int GetHighBitAt(char data, int at) { -    ui8 udata = data & 0x80; -    return int(udata >> 7) << at; -} - -void TSSEEmulTest::Test_mm_movemask_epi8() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    __m128i value = _mm_loadu_si128((__m128i*)&data); - -    int result = _mm_movemask_epi8(value); -    int verify = 0; -    for (unsigned i = 0; i < 16; ++i) { -        verify |= GetHighBitAt(data[i], i); -    } - -    UNIT_ASSERT_EQUAL(result, verify); -} - -void TSSEEmulTest::Test_mm_movemask_ps() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    __m128 value = _mm_loadu_ps((float*)&data); - -    int result = _mm_movemask_ps(value); -    int verify = 0; -    for (unsigned i = 0; i < 4; ++i) { -        verify |= GetHighBitAt(data[i * 4 + 3], i); -    } - -    UNIT_ASSERT_EQUAL(result, verify); -} - +void TSSEEmulTest::Test_mm_loadl_epi64() {  +    char data[64] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',  +        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};  +    ui64* dataw = reinterpret_cast<ui64*>(&data);  +  +    for (unsigned dataItem = 0; dataItem < 8; ++dataItem) {  +        __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]);  +  +        UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]);  +        UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]);  +    }  +}  +  +void TSSEEmulTest::Test_mm_storel_epi64() {  +    char data[64] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',  +        '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',  +        '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};  +    ui64* dataw = reinterpret_cast<ui64*>(&data);  +  +    for (unsigned dataItem = 0; dataItem < 4; ++dataItem) {  +        __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]);  +  +        ui64 buf[2] = {55, 81};  +        _mm_storel_epi64((__m128i*)&buf, value);  +  +        UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]);  +        UNIT_ASSERT_EQUAL(81, buf[1]);  +    }  +}  +  +void TSSEEmulTest::Test_mm_shuffle_epi32() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    ui32* dataw = reinterpret_cast<ui32*>(&data);  +    __m128i value = _mm_loadu_si128((__m128i*)&data);  +  +    int coding[4] = {1, 3, 0, 2};  +    __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1));  +  +    for (unsigned i = 0; i < 4; ++i)  +        UNIT_ASSERT_EQUAL(dataw[coding[i]],  +                          TQType<uint32x4_t>::As(result)[i]);  +}  +  +static int GetHighBitAt(char data, int at) {  +    ui8 udata = data & 0x80;  +    return int(udata >> 7) << at;  +}  +  +void TSSEEmulTest::Test_mm_movemask_epi8() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    __m128i value = _mm_loadu_si128((__m128i*)&data);  +  +    int result = _mm_movemask_epi8(value);  +    int verify = 0;  +    for (unsigned i = 0; i < 16; ++i) {  +        verify |= GetHighBitAt(data[i], i);  +    }  +  +    UNIT_ASSERT_EQUAL(result, verify);  +}  +  +void TSSEEmulTest::Test_mm_movemask_ps() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    __m128 value = _mm_loadu_ps((float*)&data);  +  +    int result = _mm_movemask_ps(value);  +    int verify = 0;  +    for (unsigned i = 0; i < 4; ++i) {  +        verify |= GetHighBitAt(data[i * 4 + 3], i);  +    }  +  +    UNIT_ASSERT_EQUAL(result, verify);  +}  +   void TSSEEmulTest::Test_mm_movemask_ps_2() {      char data[16] = {          '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', @@ -1414,19 +1414,19 @@ void TSSEEmulTest::Test_mm_movemask_ps_2() {      UNIT_ASSERT_EQUAL(result, 0xf);  } -void TSSEEmulTest::Test_mm_cvtsi128_si32() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    __m128i value = _mm_loadu_si128((__m128i*)&data); - -    int result = _mm_cvtsi128_si32(value); -    i32* datap = reinterpret_cast<i32*>(&data); -    int verify = datap[0]; - -    UNIT_ASSERT_EQUAL(result, verify); -} - +void TSSEEmulTest::Test_mm_cvtsi128_si32() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    __m128i value = _mm_loadu_si128((__m128i*)&data);  +  +    int result = _mm_cvtsi128_si32(value);  +    i32* datap = reinterpret_cast<i32*>(&data);  +    int verify = datap[0];  +  +    UNIT_ASSERT_EQUAL(result, verify);  +}  +   void TSSEEmulTest::Test_mm_cvtsi128_si64() {      char data[16] = {          '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1440,52 +1440,52 @@ void TSSEEmulTest::Test_mm_cvtsi128_si64() {      UNIT_ASSERT_EQUAL(result, verify);  } -void TSSEEmulTest::Test_mm_set_epi16() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    i16* dataw = reinterpret_cast<i16*>(&data); -    ui64* dataq = reinterpret_cast<ui64*>(&data); - -    __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4], -                                   dataw[3], dataw[2], dataw[1], dataw[0]); -    ui64 buf[2] = {53, 81}; -    _mm_storeu_si128((__m128i*)&buf, result); - -    UNIT_ASSERT_EQUAL(buf[0], dataq[0]); -    UNIT_ASSERT_EQUAL(buf[1], dataq[1]); -} - -void TSSEEmulTest::Test_mm_set_epi32() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    i32* dataw = reinterpret_cast<i32*>(&data); -    ui64* dataq = reinterpret_cast<ui64*>(&data); - -    __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]); -    ui64 buf[2] = {53, 81}; -    _mm_storeu_si128((__m128i*)&buf, result); - -    UNIT_ASSERT_EQUAL(buf[0], dataq[0]); -    UNIT_ASSERT_EQUAL(buf[1], dataq[1]); -} - -void TSSEEmulTest::Test_mm_set_ps() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    float* dataw = reinterpret_cast<float*>(&data); -    ui64* dataq = reinterpret_cast<ui64*>(&data); - -    __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]); -    ui64 buf[2] = {53, 81}; -    _mm_storeu_ps((float*)&buf, result); - -    UNIT_ASSERT_EQUAL(buf[0], dataq[0]); -    UNIT_ASSERT_EQUAL(buf[1], dataq[1]); -} - +void TSSEEmulTest::Test_mm_set_epi16() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    i16* dataw = reinterpret_cast<i16*>(&data);  +    ui64* dataq = reinterpret_cast<ui64*>(&data);  +  +    __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4],  +                                   dataw[3], dataw[2], dataw[1], dataw[0]);  +    ui64 buf[2] = {53, 81};  +    _mm_storeu_si128((__m128i*)&buf, result);  +  +    UNIT_ASSERT_EQUAL(buf[0], dataq[0]);  +    UNIT_ASSERT_EQUAL(buf[1], dataq[1]);  +}  +  +void TSSEEmulTest::Test_mm_set_epi32() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    i32* dataw = reinterpret_cast<i32*>(&data);  +    ui64* dataq = reinterpret_cast<ui64*>(&data);  +  +    __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]);  +    ui64 buf[2] = {53, 81};  +    _mm_storeu_si128((__m128i*)&buf, result);  +  +    UNIT_ASSERT_EQUAL(buf[0], dataq[0]);  +    UNIT_ASSERT_EQUAL(buf[1], dataq[1]);  +}  +  +void TSSEEmulTest::Test_mm_set_ps() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    float* dataw = reinterpret_cast<float*>(&data);  +    ui64* dataq = reinterpret_cast<ui64*>(&data);  +  +    __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]);  +    ui64 buf[2] = {53, 81};  +    _mm_storeu_ps((float*)&buf, result);  +  +    UNIT_ASSERT_EQUAL(buf[0], dataq[0]);  +    UNIT_ASSERT_EQUAL(buf[1], dataq[1]);  +}  +   void TSSEEmulTest::Test_mm_set_pd() {      char data[16] = {          '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1501,22 +1501,22 @@ void TSSEEmulTest::Test_mm_set_pd() {      UNIT_ASSERT_EQUAL(buf[1], dataq[1]);  } -void TSSEEmulTest::Test_mm_cvtsi32_si128() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    i32* dataw = reinterpret_cast<i32*>(&data); - -    __m128i result = _mm_cvtsi32_si128(dataw[0]); -    i32 buf[4] = {53, 81, -43, 2132}; -    _mm_storeu_si128((__m128i*)&buf, result); - -    UNIT_ASSERT_EQUAL(buf[0], dataw[0]); -    UNIT_ASSERT_EQUAL(buf[1], 0); -    UNIT_ASSERT_EQUAL(buf[2], 0); -    UNIT_ASSERT_EQUAL(buf[3], 0); -} - +void TSSEEmulTest::Test_mm_cvtsi32_si128() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    i32* dataw = reinterpret_cast<i32*>(&data);  +  +    __m128i result = _mm_cvtsi32_si128(dataw[0]);  +    i32 buf[4] = {53, 81, -43, 2132};  +    _mm_storeu_si128((__m128i*)&buf, result);  +  +    UNIT_ASSERT_EQUAL(buf[0], dataw[0]);  +    UNIT_ASSERT_EQUAL(buf[1], 0);  +    UNIT_ASSERT_EQUAL(buf[2], 0);  +    UNIT_ASSERT_EQUAL(buf[3], 0);  +}  +   void TSSEEmulTest::Test_mm_cvtsi64_si128() {      char data[16] = {          '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1531,44 +1531,44 @@ void TSSEEmulTest::Test_mm_cvtsi64_si128() {      UNIT_ASSERT_EQUAL(buf[1], 0);  } -template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc> -void TSSEEmulTest::Test_mm_packs_epiXX() { -    char data[32] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C', -        '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00', -        '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'}; -    __m128i value0 = _mm_loadu_si128((__m128i*)&data); -    __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1); -    TElem* dataw = reinterpret_cast<TElem*>(&data); - -    __m128i result = TFunc(value0, value1); - -    TNarrow verify[elemCount]; -    for (unsigned i = 0; i < elemCount; ++i) { -        TElem sum = dataw[i]; -        if (sum > std::numeric_limits<TNarrow>::max()) -            sum = std::numeric_limits<TNarrow>::max(); -        if (sum < std::numeric_limits<TNarrow>::min()) -            sum = std::numeric_limits<TNarrow>::min(); -        verify[i] = TNarrow(sum); -    } - -    ui64* verifyp = (ui64*)&verify; -    UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]); -    UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]); -} - -void TSSEEmulTest::Test_mm_packs_epi16() { -    Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>(); -} -void TSSEEmulTest::Test_mm_packs_epi32() { -    Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>(); -} -void TSSEEmulTest::Test_mm_packus_epi16() { -    Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>(); -} - +template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc>  +void TSSEEmulTest::Test_mm_packs_epiXX() {  +    char data[32] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C',  +        '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00',  +        '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'};  +    __m128i value0 = _mm_loadu_si128((__m128i*)&data);  +    __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1);  +    TElem* dataw = reinterpret_cast<TElem*>(&data);  +  +    __m128i result = TFunc(value0, value1);  +  +    TNarrow verify[elemCount];  +    for (unsigned i = 0; i < elemCount; ++i) {  +        TElem sum = dataw[i];  +        if (sum > std::numeric_limits<TNarrow>::max())  +            sum = std::numeric_limits<TNarrow>::max();  +        if (sum < std::numeric_limits<TNarrow>::min())  +            sum = std::numeric_limits<TNarrow>::min();  +        verify[i] = TNarrow(sum);  +    }  +  +    ui64* verifyp = (ui64*)&verify;  +    UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]);  +    UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]);  +}  +  +void TSSEEmulTest::Test_mm_packs_epi16() {  +    Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>();  +}  +void TSSEEmulTest::Test_mm_packs_epi32() {  +    Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>();  +}  +void TSSEEmulTest::Test_mm_packus_epi16() {  +    Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>();  +}  +   void TSSEEmulTest::Test_mm_extract_epi8() {      alignas(16) char data[16] = {          '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1594,23 +1594,23 @@ void TSSEEmulTest::Test_mm_extract_epi8() {      UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 15)), int(dataw[15]));  } -void TSSEEmulTest::Test_mm_extract_epi16() { +void TSSEEmulTest::Test_mm_extract_epi16() {       alignas(16) char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};       const ui16* dataw = reinterpret_cast<const ui16*>(&data);      const __m128i value = _mm_loadu_si128((__m128i*)&data); - -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6])); -    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7])); -} - +  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6]));  +    UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7]));  +}  +   void TSSEEmulTest::Test_mm_extract_epi64() {      alignas(16) char data[16] = {          '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', @@ -1635,160 +1635,160 @@ void TSSEEmulTest::Test_mm_extract_epi32() {      UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 3)), int(dataw[3]));  } -void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() { -    char data0[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    char data1[16] = { -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; -    char data2[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    char data3[16] = { -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; - -    __m128 value0 = _mm_loadu_ps((float*)&data0); -    __m128 value1 = _mm_loadu_ps((float*)&data1); -    __m128 value2 = _mm_loadu_ps((float*)&data2); -    __m128 value3 = _mm_loadu_ps((float*)&data3); - -    _MM_TRANSPOSE4_PS(value0, value1, value2, value3); - -    ui64 tbuf0[2] = {0, 0}; -    ui64 tbuf1[2] = {0, 0}; -    ui64 tbuf2[2] = {0, 0}; -    ui64 tbuf3[2] = {0, 0}; - -    _mm_storeu_ps((float*)&tbuf0, value0); -    _mm_storeu_ps((float*)&tbuf1, value1); -    _mm_storeu_ps((float*)&tbuf2, value2); -    _mm_storeu_ps((float*)&tbuf3, value3); - -    char tdata0[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55', -        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'}; -    char tdata1[16] = { -        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44', -        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'}; -    char tdata2[16] = { -        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11', -        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'}; -    char tdata3[16] = { -        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF', -        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'}; - -    UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0); -    UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0); -    UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0); -    UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0); -} - -template <typename TFrom, typename TTo, unsigned elemCount, -          typename TLoadVector, typename TResultVector, -          typename TElemFunc, typename TFunc, typename TOp> -void TSSEEmulTest::Test_mm_convertop() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    TFrom* datap = reinterpret_cast<TFrom*>(&data); - -    TLoadVector value = TFuncLoad<TLoadVector>(&data); - -    TTo procData[elemCount]; -    for (unsigned i = 0; i < elemCount; ++i) { -        procData[i] = TElemFunc::Call(datap[i]); -    } - -    TResultVector result = TFunc(value); - -    for (unsigned i = 0; i < elemCount; ++i) { -        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]); -    } -} - -void TSSEEmulTest::Test_mm_cvtepi32_ps() { -    struct THelper { -        static float Call(const i32 op) { -            return float(op); -        } -    }; -    Test_mm_convertop<i32, float, 4, __m128i, __m128, -                      THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>(); -}; - -void TSSEEmulTest::Test_mm_cvtps_epi32() { -    struct THelper { -        static i32 Call(const float op) { -            return i32(op); -        } -    }; -    Test_mm_convertop<float, i32, 4, __m128, __m128i, +void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() {  +    char data0[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    char data1[16] = {  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};  +    char data2[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    char data3[16] = {  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};  +  +    __m128 value0 = _mm_loadu_ps((float*)&data0);  +    __m128 value1 = _mm_loadu_ps((float*)&data1);  +    __m128 value2 = _mm_loadu_ps((float*)&data2);  +    __m128 value3 = _mm_loadu_ps((float*)&data3);  +  +    _MM_TRANSPOSE4_PS(value0, value1, value2, value3);  +  +    ui64 tbuf0[2] = {0, 0};  +    ui64 tbuf1[2] = {0, 0};  +    ui64 tbuf2[2] = {0, 0};  +    ui64 tbuf3[2] = {0, 0};  +  +    _mm_storeu_ps((float*)&tbuf0, value0);  +    _mm_storeu_ps((float*)&tbuf1, value1);  +    _mm_storeu_ps((float*)&tbuf2, value2);  +    _mm_storeu_ps((float*)&tbuf3, value3);  +  +    char tdata0[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55',  +        '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'};  +    char tdata1[16] = {  +        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44',  +        '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'};  +    char tdata2[16] = {  +        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11',  +        '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'};  +    char tdata3[16] = {  +        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF',  +        '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'};  +  +    UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0);  +    UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0);  +    UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0);  +    UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0);  +}  +  +template <typename TFrom, typename TTo, unsigned elemCount,  +          typename TLoadVector, typename TResultVector,  +          typename TElemFunc, typename TFunc, typename TOp>  +void TSSEEmulTest::Test_mm_convertop() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    TFrom* datap = reinterpret_cast<TFrom*>(&data);  +  +    TLoadVector value = TFuncLoad<TLoadVector>(&data);  +  +    TTo procData[elemCount];  +    for (unsigned i = 0; i < elemCount; ++i) {  +        procData[i] = TElemFunc::Call(datap[i]);  +    }  +  +    TResultVector result = TFunc(value);  +  +    for (unsigned i = 0; i < elemCount; ++i) {  +        UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);  +    }  +}  +  +void TSSEEmulTest::Test_mm_cvtepi32_ps() {  +    struct THelper {  +        static float Call(const i32 op) {  +            return float(op);  +        }  +    };  +    Test_mm_convertop<i32, float, 4, __m128i, __m128,  +                      THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>();  +};  +  +void TSSEEmulTest::Test_mm_cvtps_epi32() {  +    struct THelper {  +        static i32 Call(const float op) {  +            return i32(op);  +        }  +    };  +    Test_mm_convertop<float, i32, 4, __m128, __m128i,                         THelper, T_mm_CallWrapper<__m128i, decltype(_mm_cvtps_epi32), _mm_cvtps_epi32>, int32x4_t>(); -}; - -void TSSEEmulTest::Test_mm_cvttps_epi32() { -    struct THelper { -        static i32 Call(const float op) { -            return i32(op); -        } -    }; -    Test_mm_convertop<float, i32, 4, __m128, __m128i, -                      THelper, Wrap(_mm_cvttps_epi32), int32x4_t>(); -}; - -template <typename TLoadVector, typename TCastVector, -          typename TFunc, TFunc* func> -void TSSEEmulTest::Test_mm_castXX() { -    char data[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; - -    TLoadVector value = TFuncLoad<TLoadVector>(&data); -    const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data); -    TCastVector casted = func(value); -    const TCastVector constcasted = func(constvalue); -    char verify[16]; -    char constverify[16]; -    TFuncStore<TCastVector>(&verify, casted); -    TFuncStore<TCastVector>(&constverify, constcasted); - -    UNIT_ASSERT(memcmp(&data, &verify, 16) == 0); -    UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0); -}; - -void TSSEEmulTest::Test_mm_castsi128_ps() { -    Test_mm_castXX<__m128i, __m128, -                   decltype(_mm_castsi128_ps), _mm_castsi128_ps>(); -} - -void TSSEEmulTest::Test_mm_castps_si128() { -    Test_mm_castXX<__m128, __m128i, -                   decltype(_mm_castps_si128), _mm_castps_si128>(); -} - -void TSSEEmulTest::Test_mm_mul_epu32() { -    char data0[16] = { -        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA', -        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'}; -    char data1[16] = { -        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44', -        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'}; -    ui32* dataw0 = reinterpret_cast<ui32*>(&data0); -    ui32* dataw1 = reinterpret_cast<ui32*>(&data1); - -    __m128i value0 = _mm_loadu_si128((__m128i*)&data0); -    __m128i value1 = _mm_loadu_si128((__m128i*)&data1); - +};  +  +void TSSEEmulTest::Test_mm_cvttps_epi32() {  +    struct THelper {  +        static i32 Call(const float op) {  +            return i32(op);  +        }  +    };  +    Test_mm_convertop<float, i32, 4, __m128, __m128i,  +                      THelper, Wrap(_mm_cvttps_epi32), int32x4_t>();  +};  +  +template <typename TLoadVector, typename TCastVector,  +          typename TFunc, TFunc* func>  +void TSSEEmulTest::Test_mm_castXX() {  +    char data[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +  +    TLoadVector value = TFuncLoad<TLoadVector>(&data);  +    const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data);  +    TCastVector casted = func(value);  +    const TCastVector constcasted = func(constvalue);  +    char verify[16];  +    char constverify[16];  +    TFuncStore<TCastVector>(&verify, casted);  +    TFuncStore<TCastVector>(&constverify, constcasted);  +  +    UNIT_ASSERT(memcmp(&data, &verify, 16) == 0);  +    UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0);  +};  +  +void TSSEEmulTest::Test_mm_castsi128_ps() {  +    Test_mm_castXX<__m128i, __m128,  +                   decltype(_mm_castsi128_ps), _mm_castsi128_ps>();  +}  +  +void TSSEEmulTest::Test_mm_castps_si128() {  +    Test_mm_castXX<__m128, __m128i,  +                   decltype(_mm_castps_si128), _mm_castps_si128>();  +}  +  +void TSSEEmulTest::Test_mm_mul_epu32() {  +    char data0[16] = {  +        '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',  +        '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};  +    char data1[16] = {  +        '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',  +        '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};  +    ui32* dataw0 = reinterpret_cast<ui32*>(&data0);  +    ui32* dataw1 = reinterpret_cast<ui32*>(&data1);  +  +    __m128i value0 = _mm_loadu_si128((__m128i*)&data0);  +    __m128i value1 = _mm_loadu_si128((__m128i*)&data1);  +       ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0];      ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2]; - -    __m128i result = _mm_mul_epu32(value0, value1); - -    UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]); -    UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]); -} +  +    __m128i result = _mm_mul_epu32(value0, value1);  +  +    UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]);  +    UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]);  +}   void TSSEEmulTest::Test_mm_cmpunord_ps() {      alignas(16) float valuesBits[4] = {1.f, 2.f, 3.f, 4.f}; | 
